def test_swallowing_exception(self):
    with TemporaryDirectory() as tmp_dir:
        writer = SummaryWriter(tmp_dir)
        writer.add_scalar = MagicMock(side_effect=NotImplementedError("test"))
        writer.exceptions_to_ignore = (NotImplementedError, KeyError)
        with summary_writer_context(writer):
            SummaryWriterContext.add_scalar("test", torch.ones(1))
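# For context: a minimal sketch of the exception-swallowing behavior that the
# swallowing/not-swallowing tests in this section rely on. Only the attribute
# name `exceptions_to_ignore` comes from the tests; the function name and
# logger below are illustrative assumptions, not the actual
# SummaryWriterContext implementation.
import logging

logger = logging.getLogger(__name__)

def _write_scalar_sketch(writer, key, value, global_step=0):
    try:
        writer.add_scalar(key, value, global_step=global_step)
    except getattr(writer, "exceptions_to_ignore", ()):
        # Exception types the writer opts into are swallowed
        # (test_swallowing_exception); anything else propagates
        # (test_not_swallowing_exception).
        logger.warning("Ignoring exception while writing summary for %s", key)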
def test_add_custom_scalars(self):
    with TemporaryDirectory() as tmp_dir:
        writer = SummaryWriter(tmp_dir)
        writer.add_custom_scalars = MagicMock()
        with summary_writer_context(writer):
            SummaryWriterContext.add_custom_scalars_multilinechart(
                ["a", "b"], category="cat", title="title"
            )
            with self.assertRaisesRegex(
                AssertionError, r"Title \(title\) is already in category \(cat\)"
            ):
                SummaryWriterContext.add_custom_scalars_multilinechart(
                    ["c", "d"], category="cat", title="title"
                )
            SummaryWriterContext.add_custom_scalars_multilinechart(
                ["e", "f"], category="cat", title="title2"
            )
            SummaryWriterContext.add_custom_scalars_multilinechart(
                ["g", "h"], category="cat2", title="title"
            )

        SummaryWriterContext.add_custom_scalars(writer)
        writer.add_custom_scalars.assert_called_once_with(
            {
                "cat": {
                    "title": ["Multiline", ["a", "b"]],
                    "title2": ["Multiline", ["e", "f"]],
                },
                "cat2": {"title": ["Multiline", ["g", "h"]]},
            }
        )
def test_not_swallowing_exception(self):
    with TemporaryDirectory() as tmp_dir:
        writer = SummaryWriter(tmp_dir)
        writer.add_scalar = MagicMock(side_effect=NotImplementedError("test"))
        with self.assertRaisesRegex(
            NotImplementedError, "test"
        ), summary_writer_context(writer):
            SummaryWriterContext.add_scalar("test", torch.ones(1))
def test_writing(self):
    with TemporaryDirectory() as tmp_dir:
        writer = SummaryWriter(tmp_dir)
        writer.add_scalar = MagicMock()
        with summary_writer_context(writer):
            SummaryWriterContext.add_scalar("test", torch.ones(1))
        writer.add_scalar.assert_called_once_with(
            "test", torch.ones(1), global_step=0
        )
def train_network(self, train_dataset, eval_dataset, epochs: int):
    num_batches = int(len(train_dataset) / self.minibatch_size)
    logger.info(
        "Read in batch data set of size {} examples. Data split "
        "into {} batches of size {}.".format(
            len(train_dataset), num_batches, self.minibatch_size
        )
    )
    start_time = time.time()
    for epoch in range(epochs):
        train_dataset.reset_iterator()
        data_streamer = DataStreamer(train_dataset, pin_memory=self.trainer.use_gpu)
        preprocess_handler = self.preprocess_handler
        dtype = self.trainer.dtype

        def preprocess(batch):
            tdp = preprocess_handler.preprocess(batch)
            tdp.set_type(dtype)
            return tdp

        feed_pages(
            data_streamer,
            len(train_dataset),
            epoch,
            self.minibatch_size,
            self.trainer.use_gpu,
            TrainingPageHandler(self.trainer),
            batch_preprocessor=preprocess,
        )

        if hasattr(self.trainer, "q_network_cpe"):
            # TODO: Add CPE support to DDPG/SAC, Parametric DQN (once moved to modular)
            eval_dataset.reset_iterator()
            data_streamer = DataStreamer(
                eval_dataset, pin_memory=self.trainer.use_gpu
            )
            eval_page_handler = EvaluationPageHandler(
                self.trainer, self.evaluator, self
            )
            feed_pages(
                data_streamer,
                len(eval_dataset),
                epoch,
                self.minibatch_size,
                self.trainer.use_gpu,
                eval_page_handler,
                batch_preprocessor=preprocess,
            )

        SummaryWriterContext.increase_global_step()

    throughput = (len(train_dataset) * epochs) / (time.time() - start_time)
    logger.info(
        "Training finished. Processed ~{} examples / s.".format(round(throughput))
    )
def _log_histogram_and_mean(self, log_key, val):
    try:
        SummaryWriterContext.add_histogram(log_key, val)
        SummaryWriterContext.add_scalar(f"{log_key}/mean", val.mean())
    except ValueError:
        logger.warning(
            f"Cannot create histogram for key: {log_key}; "
            "this is likely because you have NULL value in your input; "
            f"value: {val}"
        )
        raise
def log_to_tensorboard(self, epoch: int) -> None:
    def none_to_zero(x: Optional[float]) -> float:
        if x is None or math.isnan(x):
            return 0.0
        return x

    for name, value in [
        ("Training/td_loss", self.get_recent_td_loss()),
        ("Training/reward_loss", self.get_recent_reward_loss()),
        ("Training/imitator_loss", self.get_recent_imitator_loss()),
    ]:
        SummaryWriterContext.add_scalar(name, none_to_zero(value), epoch)
def test_writing_stack(self):
    with TemporaryDirectory() as tmp_dir1, TemporaryDirectory() as tmp_dir2:
        writer1 = SummaryWriter(tmp_dir1)
        writer1.add_scalar = MagicMock()
        writer2 = SummaryWriter(tmp_dir2)
        writer2.add_scalar = MagicMock()
        with summary_writer_context(writer1):
            with summary_writer_context(writer2):
                SummaryWriterContext.add_scalar("test2", torch.ones(1))
            SummaryWriterContext.add_scalar("test1", torch.zeros(1))
        writer1.add_scalar.assert_called_once_with(
            "test1", torch.zeros(1), global_step=0
        )
        writer2.add_scalar.assert_called_once_with(
            "test2", torch.ones(1), global_step=0
        )
def test_minibatches_per_step(self):
    _epochs = self.epochs
    self.epochs = 2
    rl_parameters = RLParameters(
        gamma=0.95, target_update_rate=0.9, maxq_learning=True
    )
    rainbow_parameters = RainbowDQNParameters(
        double_q_learning=True, dueling_architecture=False
    )
    training_parameters1 = TrainingParameters(
        layers=self.layers,
        activations=self.activations,
        minibatch_size=1024,
        minibatches_per_step=1,
        learning_rate=0.25,
        optimizer="ADAM",
    )
    training_parameters2 = TrainingParameters(
        layers=self.layers,
        activations=self.activations,
        minibatch_size=128,
        minibatches_per_step=8,
        learning_rate=0.25,
        optimizer="ADAM",
    )
    env1 = Env(self.state_dims, self.action_dims)
    env2 = Env(self.state_dims, self.action_dims)
    model_parameters1 = DiscreteActionModelParameters(
        actions=env1.actions,
        rl=rl_parameters,
        rainbow=rainbow_parameters,
        training=training_parameters1,
    )
    model_parameters2 = DiscreteActionModelParameters(
        actions=env2.actions,
        rl=rl_parameters,
        rainbow=rainbow_parameters,
        training=training_parameters2,
    )
    # minibatch_size / 8, minibatches_per_step * 8 should give the same result
    logger.info("Training model 1")
    trainer1 = self._train(model_parameters1, env1)
    SummaryWriterContext._reset_globals()
    logger.info("Training model 2")
    trainer2 = self._train(model_parameters2, env2)
    weight1 = trainer1.q_network.fc.layers[-1].weight.detach().numpy()
    weight2 = trainer2.q_network.fc.layers[-1].weight.detach().numpy()
    # Due to numerical stability this tolerance has to be fairly high
    self.assertTrue(np.allclose(weight1, weight2, rtol=0.0, atol=1e-3))
    self.epochs = _epochs
def write_summary(self, actions: List[str]):
    if actions:
        for field, log_key in [
            ("logged_actions", "actions/logged"),
            ("model_action_idxs", "actions/model"),
        ]:
            val = getattr(self, field)
            if val is None:
                continue
            for i, action in enumerate(actions):
                SummaryWriterContext.add_scalar(
                    "{}/{}".format(log_key, action), (val == i).sum().item()
                )

    for field, log_key in [
        ("td_loss", "td_loss"),
        ("imitator_loss", "imitator_loss"),
        ("reward_loss", "reward_loss"),
        ("logged_propensities", "propensities/logged"),
        ("logged_rewards", "reward/logged"),
        ("logged_values", "value/logged"),
        ("model_values_on_logged_actions", "value/model_logged_action"),
    ]:
        val = getattr(self, field)
        if val is None:
            continue
        assert len(val.shape) == 1 or (
            len(val.shape) == 2 and val.shape[1] == 1
        ), "Unexpected shape for {}: {}".format(field, val.shape)
        self._log_histogram_and_mean(log_key, val)

    for field, log_key in [
        ("model_propensities", "propensities/model"),
        ("model_rewards", "reward/model"),
        ("model_values", "value/model"),
    ]:
        val = getattr(self, field)
        if val is None:
            continue
        if (
            len(val.shape) == 1 or (len(val.shape) == 2 and val.shape[1] == 1)
        ) and not actions:
            self._log_histogram_and_mean(log_key, val)
        elif len(val.shape) == 2 and val.shape[1] == len(actions):
            for i, action in enumerate(actions):
                self._log_histogram_and_mean(f"{log_key}/{action}", val[:, i])
        else:
            raise ValueError(
                "Unexpected shape for {}: {}; actions: {}".format(
                    field, val.shape, actions
                )
            )
def add_custom_scalars(action_names: Optional[List[str]]):
    if not action_names:
        return

    SummaryWriterContext.add_custom_scalars_multilinechart(
        [
            "propensities/model/{}/mean".format(action_name)
            for action_name in action_names
        ],
        category="propensities",
        title="model",
    )
    SummaryWriterContext.add_custom_scalars_multilinechart(
        [
            "propensities/logged/{}/mean".format(action_name)
            for action_name in action_names
        ],
        category="propensities",
        title="logged",
    )
    SummaryWriterContext.add_custom_scalars_multilinechart(
        ["actions/logged/{}".format(action_name) for action_name in action_names],
        category="actions",
        title="logged",
    )
    SummaryWriterContext.add_custom_scalars_multilinechart(
        ["actions/model/{}".format(action_name) for action_name in action_names],
        category="actions",
        title="model",
    )
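# A sketch of the custom-scalar layout the call above is assumed to build for
# action_names=["up", "down"], written in the same dict format that
# test_add_custom_scalars asserts is eventually passed to
# writer.add_custom_scalars; this is illustrative, not captured from a run.
assumed_layout = {
    "propensities": {
        "model": [
            "Multiline",
            ["propensities/model/up/mean", "propensities/model/down/mean"],
        ],
        "logged": [
            "Multiline",
            ["propensities/logged/up/mean", "propensities/logged/down/mean"],
        ],
    },
    "actions": {
        "logged": ["Multiline", ["actions/logged/up", "actions/logged/down"]],
        "model": ["Multiline", ["actions/model/up", "actions/model/down"]],
    },
}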
def flush(self):
    logger.info("Loss on {} batches".format(len(self.incoming_td_loss)))
    print_details = "Loss:\n"

    td_loss = torch.tensor(self.incoming_td_loss)
    SummaryWriterContext.add_histogram("td_loss", td_loss)
    td_loss_mean = float(td_loss.mean())
    SummaryWriterContext.add_scalar("td_loss/mean", td_loss_mean)
    self.td_loss.append(td_loss_mean)
    print_details = print_details + "TD LOSS: {0:.3f}\n".format(td_loss_mean)

    if len(self.incoming_reward_loss) > 0:
        reward_loss = torch.tensor(self.incoming_reward_loss)
        SummaryWriterContext.add_histogram("reward_loss", reward_loss)
        reward_loss_mean = float(reward_loss.mean())
        SummaryWriterContext.add_scalar("reward_loss/mean", reward_loss_mean)
        self.reward_loss.append(reward_loss_mean)
        print_details = print_details + "REWARD LOSS: {0:.3f}\n".format(
            reward_loss_mean
        )

    for print_detail in print_details.split("\n"):
        logger.info(print_detail)

    self.incoming_td_loss.clear()
    self.incoming_reward_loss.clear()
def forward(self, input):
    loc, scale_log = self._get_loc_and_scale_log(input.state)
    r = torch.randn_like(scale_log, device=scale_log.device)
    action = torch.tanh(loc + r * scale_log.exp())
    if not self.training:
        # ONNX doesn't like reshape either..
        return rlt.ActorOutput(action=action)
    # Since each dim is independent, log-prob is simply the sum
    log_prob = self._log_prob(r, scale_log)
    squash_correction = self._squash_correction(action)
    if SummaryWriterContext._global_step % 1000 == 0:
        SummaryWriterContext.add_histogram("actor/forward/loc", loc.detach().cpu())
        SummaryWriterContext.add_histogram(
            "actor/forward/scale_log", scale_log.detach().cpu()
        )
        SummaryWriterContext.add_histogram(
            "actor/forward/log_prob", log_prob.detach().cpu()
        )
        SummaryWriterContext.add_histogram(
            "actor/forward/squash_correction", squash_correction.detach().cpu()
        )
    log_prob = torch.sum(log_prob - squash_correction, dim=1)
    return rlt.ActorOutput(
        action=action, log_prob=log_prob.reshape(-1, 1), action_mean=loc
    )
def get_log_prob(self, state, squashed_action):
    """
    Action is expected to be squashed with tanh
    """
    loc, scale_log = self._get_loc_and_scale_log(state)
    # This is not getting exported; we can use it
    n = Normal(loc, scale_log.exp())
    raw_action = self._atanh(squashed_action)
    log_prob = n.log_prob(raw_action)
    squash_correction = self._squash_correction(squashed_action)
    if SummaryWriterContext._global_step % 1000 == 0:
        SummaryWriterContext.add_histogram(
            "actor/get_log_prob/loc", loc.detach().cpu()
        )
        SummaryWriterContext.add_histogram(
            "actor/get_log_prob/scale_log", scale_log.detach().cpu()
        )
        SummaryWriterContext.add_histogram(
            "actor/get_log_prob/log_prob", log_prob.detach().cpu()
        )
        SummaryWriterContext.add_histogram(
            "actor/get_log_prob/squash_correction", squash_correction.detach().cpu()
        )
    log_prob = torch.sum(log_prob - squash_correction, dim=1).reshape(-1, 1)
    return log_prob
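# The helpers `_atanh` and `_squash_correction` used above are not shown in
# this section. A minimal sketch of what they are assumed to compute, namely
# the change-of-variables term for a tanh-squashed Gaussian; the module-level
# form and the EPS constant are illustrative assumptions.
import torch

EPS = 1e-6  # hypothetical numerical-safety constant, not from the original code

def _atanh_sketch(squashed_action: torch.Tensor) -> torch.Tensor:
    # Inverse of tanh, recovering the pre-squash Gaussian sample u from a = tanh(u).
    x = squashed_action.clamp(-1.0 + EPS, 1.0 - EPS)
    return 0.5 * torch.log((1 + x) / (1 - x))

def _squash_correction_sketch(squashed_action: torch.Tensor) -> torch.Tensor:
    # log |d tanh(u) / du| = log(1 - tanh(u)^2) = log(1 - a^2), subtracted
    # per dimension from the Gaussian log-prob before the caller sums over dims.
    return torch.log(1 - squashed_action ** 2 + EPS)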
def __init__(
    self,
    key: str,
    category: str,
    title: str,
    actions: List[str],
    log_key_prefix: Optional[str] = None,
):
    super().__init__(key)
    self.log_key_prefix = log_key_prefix or f"{category}/{title}"
    self.actions = actions
    SummaryWriterContext.add_custom_scalars_multilinechart(
        [f"{self.log_key_prefix}/{action_name}/mean" for action_name in actions],
        category=category,
        title=title,
    )
def _sample_action(self, loc: torch.Tensor, scale_log: torch.Tensor):
    r = torch.randn_like(scale_log, device=scale_log.device)
    action = torch.tanh(loc + r * scale_log.exp())
    # Since each dim is independent, log-prob is simply the sum
    log_prob = self.actor_network._log_prob(r, scale_log)
    squash_correction = self.actor_network._squash_correction(action)
    if SummaryWriterContext._global_step % 1000 == 0:
        SummaryWriterContext.add_histogram("actor/forward/loc", loc.detach().cpu())
        SummaryWriterContext.add_histogram(
            "actor/forward/scale_log", scale_log.detach().cpu()
        )
        SummaryWriterContext.add_histogram(
            "actor/forward/log_prob", log_prob.detach().cpu()
        )
        SummaryWriterContext.add_histogram(
            "actor/forward/squash_correction", squash_correction.detach().cpu()
        )
    log_prob = torch.sum(log_prob - squash_correction, dim=1)
    return action, log_prob.reshape(-1, 1)
def test_global_step(self):
    with TemporaryDirectory() as tmp_dir:
        writer = SummaryWriter(tmp_dir)
        writer.add_scalar = MagicMock()
        with summary_writer_context(writer):
            SummaryWriterContext.add_scalar("test", torch.ones(1))
            SummaryWriterContext.increase_global_step()
            SummaryWriterContext.add_scalar("test", torch.zeros(1))
        writer.add_scalar.assert_has_calls(
            [
                call("test", torch.ones(1), global_step=0),
                call("test", torch.zeros(1), global_step=1),
            ]
        )
        self.assertEqual(2, len(writer.add_scalar.mock_calls))
def _log_prob(
    self, loc: torch.Tensor, scale_log: torch.Tensor, squashed_action: torch.Tensor
):
    # This is not getting exported; we can use it
    n = torch.distributions.Normal(loc, scale_log.exp())
    raw_action = self.actor_network._atanh(squashed_action)
    log_prob = n.log_prob(raw_action)
    squash_correction = self.actor_network._squash_correction(squashed_action)
    if SummaryWriterContext._global_step % 1000 == 0:
        SummaryWriterContext.add_histogram(
            "actor/get_log_prob/loc", loc.detach().cpu()
        )
        SummaryWriterContext.add_histogram(
            "actor/get_log_prob/scale_log", scale_log.detach().cpu()
        )
        SummaryWriterContext.add_histogram(
            "actor/get_log_prob/log_prob", log_prob.detach().cpu()
        )
        SummaryWriterContext.add_histogram(
            "actor/get_log_prob/squash_correction", squash_correction.detach().cpu()
        )
    log_prob = torch.sum(log_prob - squash_correction, dim=1).reshape(-1, 1)
    return log_prob
def tearDown(self):
    SummaryWriterContext._reset_globals()
def write_summary(self, actions: List[str]):
    if actions:
        for field, log_key in [
            ("logged_actions", "actions/logged"),
            ("model_action_idxs", "actions/model"),
        ]:
            val = getattr(self, field)
            if val is None:
                continue
            for i, action in enumerate(actions):
                SummaryWriterContext.add_scalar(
                    "{}/{}".format(log_key, action), (val == i).sum().item()
                )

    for field, log_key in [
        ("td_loss", "td_loss"),
        ("reward_loss", "reward_loss"),
        ("logged_propensities", "propensities/logged"),
        ("logged_rewards", "reward/logged"),
        ("logged_values", "value/logged"),
        ("model_values_on_logged_actions", "value/model_logged_action"),
    ]:
        val = getattr(self, field)
        if val is None:
            continue
        assert len(val.shape) == 1 or (
            len(val.shape) == 2 and val.shape[1] == 1
        ), "Unexpected shape for {}: {}".format(field, val.shape)
        SummaryWriterContext.add_histogram(log_key, val)
        SummaryWriterContext.add_scalar("{}/mean".format(log_key), val.mean())

    for field, log_key in [
        ("model_propensities", "propensities/model"),
        ("model_rewards", "reward/model"),
        ("model_values", "value/model"),
    ]:
        val = getattr(self, field)
        if val is None:
            continue
        if (
            len(val.shape) == 1 or (len(val.shape) == 2 and val.shape[1] == 1)
        ) and not actions:
            SummaryWriterContext.add_histogram(log_key, val)
            SummaryWriterContext.add_scalar("{}/mean".format(log_key), val.mean())
        elif len(val.shape) == 2 and val.shape[1] == len(actions):
            for i, action in enumerate(actions):
                SummaryWriterContext.add_histogram(
                    "{}/{}".format(log_key, action), val[:, i]
                )
                SummaryWriterContext.add_scalar(
                    "{}/{}/mean".format(log_key, action), val[:, i].mean()
                )
        else:
            raise ValueError(
                "Unexpected shape for {}: {}; actions: {}".format(
                    field, val.shape, actions
                )
            )
def train(self, training_batch, evaluator=None) -> None:
    """
    IMPORTANT: the input action here is assumed to be preprocessed to match the
    range of the output of the actor.
    """
    if hasattr(training_batch, "as_parametric_sarsa_training_batch"):
        training_batch = training_batch.as_parametric_sarsa_training_batch()

    learning_input = training_batch.training_input
    self.minibatch += 1

    state = learning_input.state
    action = learning_input.action
    reward = learning_input.reward
    discount = torch.full_like(reward, self.gamma)
    not_done_mask = learning_input.not_terminal

    if self._should_scale_action_in_train():
        action = rlt.FeatureVector(
            rescale_torch_tensor(
                action.float_features,
                new_min=self.min_action_range_tensor_training,
                new_max=self.max_action_range_tensor_training,
                prev_min=self.min_action_range_tensor_serving,
                prev_max=self.max_action_range_tensor_serving,
            )
        )

    current_state_action = rlt.StateAction(state=state, action=action)

    q1_value = self.q1_network(current_state_action).q_value
    min_q_value = q1_value

    if self.q2_network:
        q2_value = self.q2_network(current_state_action).q_value
        min_q_value = torch.min(q1_value, q2_value)

    # Use the minimum as target, ensure no gradient going through
    min_q_value = min_q_value.detach()

    #
    # First, optimize value network; minimizing MSE between
    # V(s) & Q(s, a) - log(pi(a|s))
    #

    state_value = self.value_network(state.float_features)  # .q_value

    if self.logged_action_uniform_prior:
        log_prob_a = torch.zeros_like(min_q_value)
        target_value = min_q_value
    else:
        with torch.no_grad():
            log_prob_a = self.actor_network.get_log_prob(
                state, action.float_features
            )
            log_prob_a = log_prob_a.clamp(-20.0, 20.0)
            target_value = min_q_value - self.entropy_temperature * log_prob_a

    value_loss = F.mse_loss(state_value, target_value)
    self.value_network_optimizer.zero_grad()
    value_loss.backward()
    self.value_network_optimizer.step()

    #
    # Second, optimize Q networks; minimizing MSE between
    # Q(s, a) & r + discount * V'(next_s)
    #

    with torch.no_grad():
        next_state_value = (
            self.value_network_target(learning_input.next_state.float_features)
            * not_done_mask
        )

        if self.minibatch < self.reward_burnin:
            target_q_value = reward
        else:
            target_q_value = reward + discount * next_state_value

    q1_loss = F.mse_loss(q1_value, target_q_value)
    self.q1_network_optimizer.zero_grad()
    q1_loss.backward()
    self.q1_network_optimizer.step()
    if self.q2_network:
        q2_loss = F.mse_loss(q2_value, target_q_value)
        self.q2_network_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_network_optimizer.step()

    #
    # Lastly, optimize the actor; minimizing KL-divergence between action propensity
    # & softmax of value. Due to reparameterization trick, it ends up being
    # log_prob(actor_action) - Q(s, actor_action)
    #

    actor_output = self.actor_network(rlt.StateInput(state=state))

    state_actor_action = rlt.StateAction(
        state=state, action=rlt.FeatureVector(float_features=actor_output.action)
    )
    q1_actor_value = self.q1_network(state_actor_action).q_value
    min_q_actor_value = q1_actor_value
    if self.q2_network:
        q2_actor_value = self.q2_network(state_actor_action).q_value
        min_q_actor_value = torch.min(q1_actor_value, q2_actor_value)

    actor_loss = (
        self.entropy_temperature * actor_output.log_prob - min_q_actor_value
    )
    # Do this in 2 steps so we can log histogram of actor loss
    actor_loss_mean = actor_loss.mean()
    self.actor_network_optimizer.zero_grad()
    actor_loss_mean.backward()
    self.actor_network_optimizer.step()

    if self.minibatch < self.reward_burnin:
        # Reward burnin: force target network
        self._soft_update(self.value_network, self.value_network_target, 1.0)
    else:
        # Use the soft update rule to update both target networks
        self._soft_update(self.value_network, self.value_network_target, self.tau)

    # Logging at the end to schedule all the cuda operations first
    if (
        self.tensorboard_logging_freq is not None
        and self.minibatch % self.tensorboard_logging_freq == 0
    ):
        SummaryWriterContext.add_histogram("q1/logged_state_value", q1_value)
        if self.q2_network:
            SummaryWriterContext.add_histogram("q2/logged_state_value", q2_value)
        SummaryWriterContext.add_histogram("log_prob_a", log_prob_a)
        SummaryWriterContext.add_histogram("value_network/target", target_value)
        SummaryWriterContext.add_histogram(
            "q_network/next_state_value", next_state_value
        )
        SummaryWriterContext.add_histogram(
            "q_network/target_q_value", target_q_value
        )
        SummaryWriterContext.add_histogram(
            "actor/min_q_actor_value", min_q_actor_value
        )
        SummaryWriterContext.add_histogram(
            "actor/action_log_prob", actor_output.log_prob
        )
        SummaryWriterContext.add_histogram("actor/loss", actor_loss)

    if evaluator is not None:
        cpe_stats = BatchStatsForCPE(
            td_loss=q1_loss.detach().cpu().numpy(),
            logged_rewards=reward.detach().cpu().numpy(),
            model_values_on_logged_actions=q1_value.detach().cpu().numpy(),
            model_propensities=actor_output.log_prob.exp().detach().cpu().numpy(),
            model_values=min_q_actor_value.detach().cpu().numpy(),
        )
        evaluator.report(cpe_stats)
def train(self, training_batch) -> None:
    """
    IMPORTANT: the input action here is assumed to be preprocessed to match the
    range of the output of the actor.
    """
    if hasattr(training_batch, "as_parametric_sarsa_training_batch"):
        training_batch = training_batch.as_parametric_sarsa_training_batch()

    learning_input = training_batch.training_input
    self.minibatch += 1

    state = learning_input.state
    action = learning_input.action
    reward = learning_input.reward
    discount = torch.full_like(reward, self.gamma)
    not_done_mask = learning_input.not_terminal

    if self._should_scale_action_in_train():
        action = rlt.FeatureVector(
            rescale_torch_tensor(
                action.float_features,
                new_min=self.min_action_range_tensor_training,
                new_max=self.max_action_range_tensor_training,
                prev_min=self.min_action_range_tensor_serving,
                prev_max=self.max_action_range_tensor_serving,
            )
        )

    current_state_action = rlt.StateAction(state=state, action=action)

    q1_value = self.q1_network(current_state_action).q_value
    min_q_value = q1_value

    if self.q2_network:
        q2_value = self.q2_network(current_state_action).q_value
        min_q_value = torch.min(q1_value, q2_value)

    # Use the minimum as target, ensure no gradient going through
    min_q_value = min_q_value.detach()

    #
    # First, optimize value network; minimizing MSE between
    # V(s) & Q(s, a) - log(pi(a|s))
    #

    state_value = self.value_network(state.float_features)  # .q_value

    if self.logged_action_uniform_prior:
        log_prob_a = torch.zeros_like(min_q_value)
        target_value = min_q_value
    else:
        with torch.no_grad():
            log_prob_a = self.actor_network.get_log_prob(
                state, action.float_features
            )
            log_prob_a = log_prob_a.clamp(-20.0, 20.0)
            target_value = min_q_value - self.entropy_temperature * log_prob_a

    value_loss = F.mse_loss(state_value, target_value)
    self.value_network_optimizer.zero_grad()
    value_loss.backward()
    self.value_network_optimizer.step()

    #
    # Second, optimize Q networks; minimizing MSE between
    # Q(s, a) & r + discount * V'(next_s)
    #

    with torch.no_grad():
        next_state_value = (
            self.value_network_target(learning_input.next_state.float_features)
            * not_done_mask.float()
        )

        if self.minibatch < self.reward_burnin:
            target_q_value = reward
        else:
            target_q_value = reward + discount * next_state_value

    q1_loss = F.mse_loss(q1_value, target_q_value)
    self.q1_network_optimizer.zero_grad()
    q1_loss.backward()
    self.q1_network_optimizer.step()
    if self.q2_network:
        q2_loss = F.mse_loss(q2_value, target_q_value)
        self.q2_network_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_network_optimizer.step()

    #
    # Lastly, optimize the actor; minimizing KL-divergence between action propensity
    # & softmax of value. Due to reparameterization trick, it ends up being
    # log_prob(actor_action) - Q(s, actor_action)
    #

    actor_output = self.actor_network(rlt.StateInput(state=state))

    state_actor_action = rlt.StateAction(
        state=state, action=rlt.FeatureVector(float_features=actor_output.action)
    )
    q1_actor_value = self.q1_network(state_actor_action).q_value
    min_q_actor_value = q1_actor_value
    if self.q2_network:
        q2_actor_value = self.q2_network(state_actor_action).q_value
        min_q_actor_value = torch.min(q1_actor_value, q2_actor_value)

    actor_loss = (
        self.entropy_temperature * actor_output.log_prob - min_q_actor_value
    )
    # Do this in 2 steps so we can log histogram of actor loss
    actor_loss_mean = actor_loss.mean()
    self.actor_network_optimizer.zero_grad()
    actor_loss_mean.backward()
    self.actor_network_optimizer.step()

    if self.minibatch < self.reward_burnin:
        # Reward burnin: force target network
        self._soft_update(self.value_network, self.value_network_target, 1.0)
    else:
        # Use the soft update rule to update both target networks
        self._soft_update(self.value_network, self.value_network_target, self.tau)

    # Logging at the end to schedule all the cuda operations first
    if (
        self.tensorboard_logging_freq is not None
        and self.minibatch % self.tensorboard_logging_freq == 0
    ):
        SummaryWriterContext.add_histogram("q1/logged_state_value", q1_value)
        if self.q2_network:
            SummaryWriterContext.add_histogram("q2/logged_state_value", q2_value)
        SummaryWriterContext.add_histogram("log_prob_a", log_prob_a)
        SummaryWriterContext.add_histogram("value_network/target", target_value)
        SummaryWriterContext.add_histogram(
            "q_network/next_state_value", next_state_value
        )
        SummaryWriterContext.add_histogram(
            "q_network/target_q_value", target_q_value
        )
        SummaryWriterContext.add_histogram(
            "actor/min_q_actor_value", min_q_actor_value
        )
        SummaryWriterContext.add_histogram(
            "actor/action_log_prob", actor_output.log_prob
        )
        SummaryWriterContext.add_histogram("actor/loss", actor_loss)

    self.loss_reporter.report(
        td_loss=float(q1_loss),
        reward_loss=None,
        logged_rewards=reward,
        model_values_on_logged_actions=q1_value,
        model_propensities=actor_output.log_prob.exp(),
        model_values=min_q_actor_value,
    )
def setUp(self):
    logging.getLogger().setLevel(logging.INFO)
    SummaryWriterContext._reset_globals()
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    random.seed(SEED)
def train(self, training_batch) -> None:
    """
    IMPORTANT: the input action here is assumed to be preprocessed to match the
    range of the output of the actor.
    """
    if hasattr(training_batch, "as_parametric_sarsa_training_batch"):
        training_batch = training_batch.as_parametric_sarsa_training_batch()

    learning_input = training_batch.training_input
    self.minibatch += 1

    state = learning_input.state
    action = learning_input.action
    reward = learning_input.reward
    discount = torch.full_like(reward, self.gamma)
    not_done_mask = learning_input.not_terminal

    if self._should_scale_action_in_train():
        action = rlt.FeatureVector(
            rescale_torch_tensor(
                action.float_features,
                new_min=self.min_action_range_tensor_training,
                new_max=self.max_action_range_tensor_training,
                prev_min=self.min_action_range_tensor_serving,
                prev_max=self.max_action_range_tensor_serving,
            )
        )

    with torch.enable_grad():
        #
        # First, optimize Q networks; minimizing MSE between
        # Q(s, a) & r + discount * V'(next_s)
        #

        current_state_action = rlt.StateAction(state=state, action=action)
        q1_value = self.q1_network(current_state_action).q_value
        if self.q2_network:
            q2_value = self.q2_network(current_state_action).q_value
        actor_output = self.actor_network(rlt.StateInput(state=state))

        # Optimize Alpha
        if self.alpha_optimizer is not None:
            alpha_loss = -(
                self.log_alpha
                * (actor_output.log_prob + self.target_entropy).detach()
            ).mean()
            self.alpha_optimizer.zero_grad()
            alpha_loss.backward()
            self.alpha_optimizer.step()
            self.entropy_temperature = self.log_alpha.exp()

        with torch.no_grad():
            if self.value_network is not None:
                next_state_value = self.value_network_target(
                    learning_input.next_state.float_features
                )
            else:
                next_state_actor_output = self.actor_network(
                    rlt.StateInput(state=learning_input.next_state)
                )
                next_state_actor_action = rlt.StateAction(
                    state=learning_input.next_state,
                    action=rlt.FeatureVector(
                        float_features=next_state_actor_output.action
                    ),
                )
                next_state_value = self.q1_network_target(
                    next_state_actor_action
                ).q_value

                if self.q2_network is not None:
                    target_q2_value = self.q2_network_target(
                        next_state_actor_action
                    ).q_value
                    next_state_value = torch.min(next_state_value, target_q2_value)

                log_prob_a = self.actor_network.get_log_prob(
                    learning_input.next_state, next_state_actor_output.action
                )
                log_prob_a = log_prob_a.clamp(-20.0, 20.0)
                next_state_value -= self.entropy_temperature * log_prob_a

            target_q_value = (
                reward + discount * next_state_value * not_done_mask.float()
            )

        q1_loss = F.mse_loss(q1_value, target_q_value)
        q1_loss.backward()
        self._maybe_run_optimizer(
            self.q1_network_optimizer, self.minibatches_per_step
        )
        if self.q2_network:
            q2_loss = F.mse_loss(q2_value, target_q_value)
            q2_loss.backward()
            self._maybe_run_optimizer(
                self.q2_network_optimizer, self.minibatches_per_step
            )

        #
        # Second, optimize the actor; minimizing KL-divergence between action
        # propensity & softmax of value. Due to reparameterization trick, it ends
        # up being log_prob(actor_action) - Q(s, actor_action)
        #

        state_actor_action = rlt.StateAction(
            state=state,
            action=rlt.FeatureVector(float_features=actor_output.action),
        )
        q1_actor_value = self.q1_network(state_actor_action).q_value
        min_q_actor_value = q1_actor_value
        if self.q2_network:
            q2_actor_value = self.q2_network(state_actor_action).q_value
            min_q_actor_value = torch.min(q1_actor_value, q2_actor_value)

        actor_loss = (
            self.entropy_temperature * actor_output.log_prob - min_q_actor_value
        )
        # Do this in 2 steps so we can log histogram of actor loss
        actor_loss_mean = actor_loss.mean()
        actor_loss_mean.backward()
        self._maybe_run_optimizer(
            self.actor_network_optimizer, self.minibatches_per_step
        )

        #
        # Lastly, if applicable, optimize value network; minimizing MSE between
        # V(s) & E_a~pi(s) [ Q(s,a) - log(pi(a|s)) ]
        #

        if self.value_network is not None:
            state_value = self.value_network(state.float_features)

            if self.logged_action_uniform_prior:
                log_prob_a = torch.zeros_like(min_q_actor_value)
                target_value = min_q_actor_value
            else:
                with torch.no_grad():
                    log_prob_a = actor_output.log_prob
                    log_prob_a = log_prob_a.clamp(-20.0, 20.0)
                    target_value = (
                        min_q_actor_value - self.entropy_temperature * log_prob_a
                    )

            value_loss = F.mse_loss(state_value, target_value.detach())
            value_loss.backward()
            self._maybe_run_optimizer(
                self.value_network_optimizer, self.minibatches_per_step
            )

    # Use the soft update rule to update the target networks
    if self.value_network is not None:
        self._maybe_soft_update(
            self.value_network,
            self.value_network_target,
            self.tau,
            self.minibatches_per_step,
        )
    else:
        self._maybe_soft_update(
            self.q1_network,
            self.q1_network_target,
            self.tau,
            self.minibatches_per_step,
        )
        if self.q2_network is not None:
            self._maybe_soft_update(
                self.q2_network,
                self.q2_network_target,
                self.tau,
                self.minibatches_per_step,
            )

    # Logging at the end to schedule all the cuda operations first
    if (
        self.tensorboard_logging_freq is not None
        and self.minibatch % self.tensorboard_logging_freq == 0
    ):
        SummaryWriterContext.add_histogram("q1/logged_state_value", q1_value)
        if self.q2_network:
            SummaryWriterContext.add_histogram("q2/logged_state_value", q2_value)
        SummaryWriterContext.add_histogram("log_prob_a", log_prob_a)
        if self.value_network:
            SummaryWriterContext.add_histogram("value_network/target", target_value)
        SummaryWriterContext.add_histogram(
            "q_network/next_state_value", next_state_value
        )
        SummaryWriterContext.add_histogram(
            "q_network/target_q_value", target_q_value
        )
        SummaryWriterContext.add_histogram(
            "actor/min_q_actor_value", min_q_actor_value
        )
        SummaryWriterContext.add_histogram(
            "actor/action_log_prob", actor_output.log_prob
        )
        SummaryWriterContext.add_histogram("actor/loss", actor_loss)

    self.loss_reporter.report(
        td_loss=float(q1_loss),
        reward_loss=None,
        logged_rewards=reward,
        model_values_on_logged_actions=q1_value,
        model_propensities=actor_output.log_prob.exp(),
        model_values=min_q_actor_value,
    )
def setUp(self):
    SummaryWriterContext._reset_globals()
def dist(self, input: rlt.PreprocessedState):
    state = input.state.float_features
    x = state
    for i, activation in enumerate(self.activations[:-1]):
        if self.use_batch_norm:
            x = self.batch_norm_ops[i](x)
        x = self.layers[i](x)
        if activation == "linear":
            continue
        elif activation == "tanh":
            activation_func = torch.tanh
        else:
            activation_func = getattr(F, activation)
        x = activation_func(x)

    value = self.value(x).unsqueeze(dim=1)
    raw_advantage = self.advantage(x).reshape(-1, self.num_actions, self.num_atoms)
    advantage = raw_advantage - raw_advantage.mean(dim=1, keepdim=True)

    q_value = value + advantage

    if SummaryWriterContext._global_step % 1000 == 0:
        SummaryWriterContext.add_histogram(
            "dueling_network/{}/value".format(self._name),
            value.detach().mean(dim=2).cpu(),
        )
        SummaryWriterContext.add_scalar(
            "dueling_network/{}/mean_value".format(self._name),
            value.detach().mean().cpu(),
        )
        SummaryWriterContext.add_histogram(
            "dueling_network/{}/q_value".format(self._name),
            q_value.detach().mean(dim=2).cpu(),
        )
        SummaryWriterContext.add_scalar(
            "dueling_network/{}/mean_q_value".format(self._name),
            q_value.detach().mean().cpu(),
        )
        SummaryWriterContext.add_histogram(
            "dueling_network/{}/raw_advantage".format(self._name),
            raw_advantage.detach().mean(dim=2).cpu(),
        )
        SummaryWriterContext.add_scalar(
            "dueling_network/{}/mean_raw_advantage".format(self._name),
            raw_advantage.detach().mean().cpu(),
        )
        for i in range(advantage.shape[1]):
            a = advantage.detach()[:, i, :].mean(dim=1)
            SummaryWriterContext.add_histogram(
                "dueling_network/{}/advantage/{}".format(self._name, i), a.cpu()
            )
            SummaryWriterContext.add_scalar(
                "dueling_network/{}/mean_advantage/{}".format(self._name, i),
                a.mean().cpu(),
            )

    return q_value
def test_with_none(self):
    with summary_writer_context(None):
        self.assertIsNone(SummaryWriterContext.add_scalar("test", torch.ones(1)))
def forward(self, input) -> Union[NamedTuple, torch.FloatTensor]:  # type: ignore
    output_tensor = False
    if self.parametric_action:
        state = input.state.float_features
        action = input.action.float_features
    else:
        state = input.state.float_features
        action = None

    x = state
    for i, activation in enumerate(self.activations[:-1]):
        if self.use_batch_norm:
            x = self.batch_norm_ops[i](x)
        x = self.layers[i](x)
        if activation == "linear":
            continue
        elif activation == "tanh":
            activation_func = torch.tanh
        else:
            activation_func = getattr(F, activation)
        x = activation_func(x)

    value = self.value(x)
    if action is not None:
        x = torch.cat((x, action), dim=1)
    raw_advantage = self.advantage(x)
    if self.parametric_action:
        advantage = raw_advantage
    else:
        advantage = raw_advantage - raw_advantage.mean(dim=1, keepdim=True)

    q_value = value + advantage

    if SummaryWriterContext._global_step % 1000 == 0:
        SummaryWriterContext.add_histogram(
            "dueling_network/{}/value".format(self._name), value.detach().cpu()
        )
        SummaryWriterContext.add_scalar(
            "dueling_network/{}/mean_value".format(self._name),
            value.detach().mean().cpu(),
        )
        SummaryWriterContext.add_histogram(
            "dueling_network/{}/q_value".format(self._name), q_value.detach().cpu()
        )
        SummaryWriterContext.add_scalar(
            "dueling_network/{}/mean_q_value".format(self._name),
            q_value.detach().mean().cpu(),
        )
        SummaryWriterContext.add_histogram(
            "dueling_network/{}/raw_advantage".format(self._name),
            raw_advantage.detach().cpu(),
        )
        SummaryWriterContext.add_scalar(
            "dueling_network/{}/mean_raw_advantage".format(self._name),
            raw_advantage.detach().mean().cpu(),
        )
        if not self.parametric_action:
            for i in range(advantage.shape[1]):
                a = advantage.detach()[:, i]
                SummaryWriterContext.add_histogram(
                    "dueling_network/{}/advantage/{}".format(self._name, i), a.cpu()
                )
                SummaryWriterContext.add_scalar(
                    "dueling_network/{}/mean_advantage/{}".format(self._name, i),
                    a.mean().cpu(),
                )

    if output_tensor:
        return q_value  # type: ignore
    elif self.parametric_action:
        return rlt.SingleQValue(q_value=q_value)  # type: ignore
    else:
        return rlt.AllActionQValues(q_values=q_value)  # type: ignore
def train(self, training_batch) -> None:
    """
    IMPORTANT: the input action here is assumed to be preprocessed to match the
    range of the output of the actor.
    """
    if hasattr(training_batch, "as_parametric_sarsa_training_batch"):
        training_batch = training_batch.as_parametric_sarsa_training_batch()

    learning_input = training_batch.training_input
    self.minibatch += 1

    state = learning_input.state
    action = learning_input.action
    next_state = learning_input.next_state
    reward = learning_input.reward
    not_done_mask = learning_input.not_terminal

    action = self._maybe_scale_action_in_train(action)

    # Compute current value estimates
    current_state_action = rlt.StateAction(state=state, action=action)
    q1_value = self.q1_network(current_state_action).q_value
    if self.q2_network:
        q2_value = self.q2_network(current_state_action).q_value
    actor_action = self.actor_network(rlt.StateInput(state=state)).action

    # Generate target = r + y * min (Q1(s',pi(s')), Q2(s',pi(s')))
    with torch.no_grad():
        next_actor = self.actor_network_target(
            rlt.StateInput(state=next_state)
        ).action
        next_actor += (
            torch.randn_like(next_actor) * self.target_policy_smoothing
        ).clamp(-self.noise_clip, self.noise_clip)
        next_actor = torch.max(
            torch.min(next_actor, self.max_action_range_tensor_training),
            self.min_action_range_tensor_training,
        )
        next_state_actor = rlt.StateAction(
            state=next_state, action=rlt.FeatureVector(float_features=next_actor)
        )
        next_state_value = self.q1_network_target(next_state_actor).q_value

        if self.q2_network is not None:
            next_state_value = torch.min(
                next_state_value, self.q2_network_target(next_state_actor).q_value
            )

        target_q_value = (
            reward + self.gamma * next_state_value * not_done_mask.float()
        )

    # Optimize Q1 and Q2
    q1_loss = F.mse_loss(q1_value, target_q_value)
    q1_loss.backward()
    self._maybe_run_optimizer(self.q1_network_optimizer, self.minibatches_per_step)
    if self.q2_network:
        q2_loss = F.mse_loss(q2_value, target_q_value)
        q2_loss.backward()
        self._maybe_run_optimizer(
            self.q2_network_optimizer, self.minibatches_per_step
        )

    # Only update actor and target networks after a fixed number of Q updates
    if self.minibatch % self.delayed_policy_update == 0:
        actor_loss = -self.q1_network(
            rlt.StateAction(
                state=state, action=rlt.FeatureVector(float_features=actor_action)
            )
        ).q_value.mean()
        actor_loss.backward()
        self._maybe_run_optimizer(
            self.actor_network_optimizer, self.minibatches_per_step
        )

        # Use the soft update rule to update the target networks
        self._maybe_soft_update(
            self.q1_network,
            self.q1_network_target,
            self.tau,
            self.minibatches_per_step,
        )
        self._maybe_soft_update(
            self.actor_network,
            self.actor_network_target,
            self.tau,
            self.minibatches_per_step,
        )
        if self.q2_network is not None:
            self._maybe_soft_update(
                self.q2_network,
                self.q2_network_target,
                self.tau,
                self.minibatches_per_step,
            )

    # Logging at the end to schedule all the cuda operations first
    if (
        self.tensorboard_logging_freq is not None
        and self.minibatch % self.tensorboard_logging_freq == 0
    ):
        SummaryWriterContext.add_histogram("q1/logged_state_value", q1_value)
        if self.q2_network:
            SummaryWriterContext.add_histogram("q2/logged_state_value", q2_value)
        SummaryWriterContext.add_histogram(
            "q_network/next_state_value", next_state_value
        )
        SummaryWriterContext.add_histogram(
            "q_network/target_q_value", target_q_value
        )
        SummaryWriterContext.add_histogram("actor/loss", actor_loss)

    self.loss_reporter.report(
        td_loss=float(q1_loss),
        reward_loss=None,
        logged_rewards=reward,
        model_values_on_logged_actions=q1_value,
    )
def handle(self, tdp: TrainingDataPage) -> None:
    SummaryWriterContext.increase_global_step()
    self.trainer.train(tdp)
def evaluate_batch(self):
    merged_inputs = []
    for batch in self.all_batches:
        if len(batch) > 0:
            merged_inputs.append(np.vstack(batch))
        else:
            merged_inputs.append(None)
    (
        td_loss,
        logged_actions,
        logged_propensities,
        logged_rewards,
        logged_values,
        model_propensities,
        model_rewards,
        model_values,
        model_values_on_logged_actions,
        model_action_idxs,
    ) = merged_inputs

    logger.info("Evaluating on {} batches".format(len(self.td_loss_batches)))
    print_details = "Evaluator:\n"

    if td_loss is not None:
        SummaryWriterContext.add_histogram("td_loss", td_loss)
        td_loss_mean = float(np.mean(td_loss))
        SummaryWriterContext.add_scalar("td_loss/mean", td_loss_mean)
        self.td_loss.append(td_loss_mean)
        print_details = print_details + "TD LOSS: {0:.3f}\n".format(td_loss_mean)

    if logged_rewards is not None:
        SummaryWriterContext.add_histogram("reward/logged", logged_rewards)
        SummaryWriterContext.add_scalar("reward/logged/mean", logged_rewards.mean())

    if model_rewards is not None:
        SummaryWriterContext.add_histogram("reward/model", model_rewards)
        SummaryWriterContext.add_scalar("reward/model/mean", model_rewards.mean())

    if logged_values is not None:
        SummaryWriterContext.add_histogram("value/logged", logged_values)
        SummaryWriterContext.add_scalar("value/logged/mean", logged_values.mean())

    if model_values is not None:
        SummaryWriterContext.add_histogram("value/model", model_values)
        SummaryWriterContext.add_scalar("value/model/mean", model_values.mean())

    if model_values_on_logged_actions is not None:
        SummaryWriterContext.add_histogram(
            "value/model_logged_action", model_values_on_logged_actions
        )

    # TODO: log summary of logged propensities
    if model_propensities is not None and self.action_names:
        if len(model_propensities.shape) == 1:
            SummaryWriterContext.add_histogram(
                "propensities/model", model_propensities
            )
            SummaryWriterContext.add_scalar(
                "propensities/model/mean", model_propensities.mean()
            )
        if len(model_propensities.shape) == 2:
            for i, action_name in enumerate(self.action_names):
                SummaryWriterContext.add_histogram(
                    "propensities/model/{}".format(action_name),
                    model_propensities[:, i],
                )
                SummaryWriterContext.add_scalar(
                    "propensities/model/{}/mean".format(action_name),
                    model_propensities[:, i].mean(),
                )

    if logged_actions is not None and model_action_idxs is not None:
        logged_action_distr, logged_action_counts = self._get_batch_logged_actions(
            [logged_actions]
        )
        model_action_distr, model_action_counts = self._get_batch_model_actions(
            [model_action_idxs]
        )
        print_details += "The distribution of logged actions : {}\n".format(
            logged_action_counts
        )
        print_details += "The distribution of model actions : {}\n".format(
            model_action_counts
        )
        for action, count in logged_action_counts.items():
            self.logged_action_counts[action] += count

        for action, count in model_action_counts.items():
            self.model_action_counts[action].append(count)
            self.model_action_counts_cumulative[action] += count

        for action, val in model_action_distr.items():
            self.model_action_distr[action].append(val)

        # Log to tensorboard
        for action_name, count in logged_action_counts.items():
            SummaryWriterContext.add_scalar(
                "actions/logged/{}".format(action_name), count
            )
        for action_name, count in model_action_counts.items():
            SummaryWriterContext.add_scalar(
                "actions/model/{}".format(action_name), count
            )

    print_details += "Batch Evaluator Finished"
    for print_detail in print_details.split("\n"):
        logger.info(print_detail)
def test_noop(self):
    self.assertIsNone(SummaryWriterContext.add_scalar("test", torch.ones(1)))
def handle(self, tdp: PreprocessedTrainingBatch) -> None:
    SummaryWriterContext.increase_global_step()
    self.trainer_or_evaluator.train(tdp)
def aggregate(self, values):
    for i, action in enumerate(self.actions):
        SummaryWriterContext.add_scalar(
            f"{self.log_key}/{action}", (values == i).sum().item()
        )
def forward(self, input) -> torch.FloatTensor:
    state_dim = self.layers[0].in_features
    state = input[:, :state_dim]
    action = input[:, state_dim:]
    x = state
    for i, activation in enumerate(self.activations[:-1]):
        if self.use_batch_norm:
            x = self.batch_norm_ops[i](x)
        activation_func = getattr(F, activation)
        fc_func = self.layers[i]
        x = fc_func(x) if activation == "linear" else activation_func(fc_func(x))

    value = self.value(x)
    x = torch.cat((x, action), dim=1)
    raw_advantage = self.advantage(x)
    if self.parametric_action:
        advantage = raw_advantage
    else:
        advantage = raw_advantage - raw_advantage.mean(dim=1, keepdim=True)

    q_value = value + advantage

    if SummaryWriterContext._global_step % 1000 == 0:
        SummaryWriterContext.add_histogram(
            "dueling_network/{}/value".format(self._name), value.detach().cpu()
        )
        SummaryWriterContext.add_scalar(
            "dueling_network/{}/mean_value".format(self._name),
            value.detach().mean().cpu(),
        )
        SummaryWriterContext.add_histogram(
            "dueling_network/{}/q_value".format(self._name), q_value.detach().cpu()
        )
        SummaryWriterContext.add_scalar(
            "dueling_network/{}/mean_q_value".format(self._name),
            q_value.detach().mean().cpu(),
        )
        SummaryWriterContext.add_histogram(
            "dueling_network/{}/raw_advantage".format(self._name),
            raw_advantage.detach().cpu(),
        )
        SummaryWriterContext.add_scalar(
            "dueling_network/{}/mean_raw_advantage".format(self._name),
            raw_advantage.detach().mean().cpu(),
        )
        if not self.parametric_action:
            for i in range(advantage.shape[1]):
                a = advantage.detach()[:, i]
                SummaryWriterContext.add_histogram(
                    "dueling_network/{}/advantage/{}".format(self._name, i), a.cpu()
                )
                SummaryWriterContext.add_scalar(
                    "dueling_network/{}/mean_advantage/{}".format(self._name, i),
                    a.mean().cpu(),
                )

    return q_value