示例#1
0
 def test_swallowing_exception(self):
     """Errors listed in ``exceptions_to_ignore`` should not escape the context.

     The mocked ``add_scalar`` always raises ``NotImplementedError``; the test
     passes only if that error does not propagate out of the call below.
     """
     with TemporaryDirectory() as log_dir:
         summary_writer = SummaryWriter(log_dir)
         summary_writer.exceptions_to_ignore = (NotImplementedError, KeyError)
         failing_add_scalar = MagicMock(side_effect=NotImplementedError("test"))
         summary_writer.add_scalar = failing_add_scalar
         with summary_writer_context(summary_writer):
             SummaryWriterContext.add_scalar("test", torch.ones(1))
示例#2
0
    def test_add_custom_scalars(self):
        """Check that multiline-chart layouts are collected and written once.

        Charts are registered per (category, title); registering the same
        title twice within one category must raise ``AssertionError``, while
        the same title in another category or a new title in the same category
        is accepted. ``SummaryWriterContext.add_custom_scalars(writer)`` is
        then expected to hand the merged layout to the writer in one call.
        """
        with TemporaryDirectory() as tmp_dir:
            writer = SummaryWriter(tmp_dir)
            writer.add_custom_scalars = MagicMock()
            with summary_writer_context(writer):
                SummaryWriterContext.add_custom_scalars_multilinechart(
                    ["a", "b"], category="cat", title="title"
                )
                # assertRaisesRegex replaces the deprecated ``...Regexp`` alias,
                # and the pattern is a raw string so ``\(`` reaches the regex
                # engine instead of being an invalid string escape.
                with self.assertRaisesRegex(
                    AssertionError, r"Title \(title\) is already in category \(cat\)"
                ):
                    SummaryWriterContext.add_custom_scalars_multilinechart(
                        ["c", "d"], category="cat", title="title"
                    )
                SummaryWriterContext.add_custom_scalars_multilinechart(
                    ["e", "f"], category="cat", title="title2"
                )
                SummaryWriterContext.add_custom_scalars_multilinechart(
                    ["g", "h"], category="cat2", title="title"
                )

            SummaryWriterContext.add_custom_scalars(writer)
            # The rejected ["c", "d"] chart must not appear in the layout.
            writer.add_custom_scalars.assert_called_once_with(
                {
                    "cat": {
                        "title": ["Multiline", ["a", "b"]],
                        "title2": ["Multiline", ["e", "f"]],
                    },
                    "cat2": {"title": ["Multiline", ["g", "h"]]},
                }
            )
示例#3
0
 def test_not_swallowing_exception(self):
     """A writer error that is not configured to be ignored must propagate.

     ``add_scalar`` is mocked to raise ``NotImplementedError("test")`` and no
     ``exceptions_to_ignore`` is set on the writer, so the call made through
     ``SummaryWriterContext`` is expected to raise that same error.
     """
     with TemporaryDirectory() as tmp_dir:
         writer = SummaryWriter(tmp_dir)
         writer.add_scalar = MagicMock(side_effect=NotImplementedError("test"))
         # assertRaisesRegex is the supported name; the ``assertRaisesRegexp``
         # alias is deprecated and no longer exists on Python 3.12+.
         with self.assertRaisesRegex(
             NotImplementedError, "test"
         ), summary_writer_context(writer):
             SummaryWriterContext.add_scalar("test", torch.ones(1))
示例#4
0
 def test_writing(self):
     """A scalar added inside the context reaches the writer with global_step=0."""
     with TemporaryDirectory() as log_dir:
         summary_writer = SummaryWriter(log_dir)
         scalar_mock = MagicMock()
         summary_writer.add_scalar = scalar_mock
         with summary_writer_context(summary_writer):
             SummaryWriterContext.add_scalar("test", torch.ones(1))
         scalar_mock.assert_called_once_with("test", torch.ones(1), global_step=0)
示例#5
0
 def test_writing(self):
     """Verify that ``add_scalar`` is forwarded exactly once, at step 0."""
     with TemporaryDirectory() as log_dir:
         active_writer = SummaryWriter(log_dir)
         active_writer.add_scalar = MagicMock()
         with summary_writer_context(active_writer):
             SummaryWriterContext.add_scalar("test", torch.ones(1))
         recorded = active_writer.add_scalar
         recorded.assert_called_once_with("test", torch.ones(1), global_step=0)
示例#6
0
 def test_not_swallowing_exception(self):
     """Without ``exceptions_to_ignore`` set, the writer's error must surface."""
     with TemporaryDirectory() as log_dir:
         summary_writer = SummaryWriter(log_dir)
         summary_writer.add_scalar = MagicMock(side_effect=NotImplementedError("test"))
         # Built up front; it is still entered before the writer context.
         expect_error = self.assertRaisesRegex(NotImplementedError, "test")
         with expect_error, summary_writer_context(summary_writer):
             SummaryWriterContext.add_scalar("test", torch.ones(1))
示例#7
0
    def train_network(self, train_dataset, eval_dataset, epochs: int):
        """Run ``epochs`` passes over ``train_dataset`` through the trainer.

        After each training pass, if the trainer has a ``q_network_cpe``
        attribute, ``eval_dataset`` is fed through an ``EvaluationPageHandler``
        and the ``SummaryWriterContext`` global step is increased. Overall
        throughput is logged once all epochs have finished.
        """
        batch_count = int(len(train_dataset) / self.minibatch_size)
        logger.info(
            "Read in batch data set of size {} examples. Data split "
            "into {} batches of size {}.".format(
                len(train_dataset), batch_count, self.minibatch_size
            )
        )

        began_at = time.time()
        for epoch in range(epochs):
            train_dataset.reset_iterator()
            train_stream = DataStreamer(train_dataset, pin_memory=self.trainer.use_gpu)
            handler = self.preprocess_handler
            target_dtype = self.trainer.dtype

            def preprocess(batch):
                # Preprocess a raw batch, then cast it to the trainer's dtype.
                processed = handler.preprocess(batch)
                processed.set_type(target_dtype)
                return processed

            feed_pages(
                train_stream,
                len(train_dataset),
                epoch,
                self.minibatch_size,
                self.trainer.use_gpu,
                TrainingPageHandler(self.trainer),
                batch_preprocessor=preprocess,
            )

            # Evaluation (and the global-step bump) only applies to trainers
            # that carry a CPE network.
            if not hasattr(self.trainer, "q_network_cpe"):
                continue

            # TODO: Add CPE support to DDPG/SAC, Parametric DQN (once moved to modular)
            eval_dataset.reset_iterator()
            eval_stream = DataStreamer(eval_dataset, pin_memory=self.trainer.use_gpu)
            eval_handler = EvaluationPageHandler(self.trainer, self.evaluator, self)
            feed_pages(
                eval_stream,
                len(eval_dataset),
                epoch,
                self.minibatch_size,
                self.trainer.use_gpu,
                eval_handler,
                batch_preprocessor=preprocess,
            )

            SummaryWriterContext.increase_global_step()

        elapsed = time.time() - began_at
        through_put = (len(train_dataset) * epochs) / elapsed
        logger.info(
            "Training finished. Processed ~{} examples / s.".format(round(through_put))
        )
示例#8
0
 def _log_histogram_and_mean(self, log_key, val):
     """Log ``val`` as a histogram under ``log_key`` and its mean under ``{log_key}/mean``.

     A ``ValueError`` raised by either logging call is reported with a
     warning that includes the key and the value, then re-raised.
     """
     try:
         SummaryWriterContext.add_histogram(log_key, val)
         mean_key = f"{log_key}/mean"
         SummaryWriterContext.add_scalar(mean_key, val.mean())
     except ValueError:
         warning_text = (
             f"Cannot create histogram for key: {log_key}; "
             "this is likely because you have NULL value in your input; "
             f"value: {val}"
         )
         logger.warning(warning_text)
         raise
示例#9
0
    def log_to_tensorboard(self, epoch: int) -> None:
        """Write the recent TD, reward and imitator losses as scalars at ``epoch``.

        Missing (``None``) or NaN losses are logged as ``0.0``.
        """

        def none_to_zero(x: Optional[float]) -> float:
            # Substitute 0.0 for values that cannot be plotted.
            return 0.0 if x is None or math.isnan(x) else x

        # All three getters are evaluated before anything is written.
        recent_losses = [
            ("Training/td_loss", self.get_recent_td_loss()),
            ("Training/reward_loss", self.get_recent_reward_loss()),
            ("Training/imitator_loss", self.get_recent_imitator_loss()),
        ]
        for tag, loss in recent_losses:
            SummaryWriterContext.add_scalar(tag, none_to_zero(loss), epoch)
示例#10
0
 def test_writing_stack(self):
     """Nested writer contexts route each scalar to the innermost active writer."""
     with TemporaryDirectory() as outer_dir, TemporaryDirectory() as inner_dir:
         outer_writer = SummaryWriter(outer_dir)
         inner_writer = SummaryWriter(inner_dir)
         outer_writer.add_scalar = MagicMock()
         inner_writer.add_scalar = MagicMock()
         with summary_writer_context(outer_writer):
             with summary_writer_context(inner_writer):
                 SummaryWriterContext.add_scalar("test2", torch.ones(1))
             # The inner context has exited; only the outer writer is active.
             SummaryWriterContext.add_scalar("test1", torch.zeros(1))
         outer_writer.add_scalar.assert_called_once_with("test1", torch.zeros(1))
         inner_writer.add_scalar.assert_called_once_with("test2", torch.ones(1))
    def test_minibatches_per_step(self):
        """Compare one big minibatch per step with eight small ones.

        Two models are trained with identical settings except that the second
        uses ``minibatch_size / 8`` and ``minibatches_per_step * 8``. The last
        layer weights of both Q-networks are expected to agree within an
        absolute tolerance of 1e-3.

        ``self.epochs`` is temporarily set to 2 and is restored in a
        ``finally`` clause, so a failing assertion or a training error does
        not leave the override in place.
        """
        _epochs = self.epochs
        self.epochs = 2
        try:
            rl_parameters = RLParameters(gamma=0.95,
                                         target_update_rate=0.9,
                                         maxq_learning=True)
            rainbow_parameters = RainbowDQNParameters(double_q_learning=True,
                                                      dueling_architecture=False)
            training_parameters1 = TrainingParameters(
                layers=self.layers,
                activations=self.activations,
                minibatch_size=1024,
                minibatches_per_step=1,
                learning_rate=0.25,
                optimizer="ADAM",
            )
            training_parameters2 = TrainingParameters(
                layers=self.layers,
                activations=self.activations,
                minibatch_size=128,
                minibatches_per_step=8,
                learning_rate=0.25,
                optimizer="ADAM",
            )
            env1 = Env(self.state_dims, self.action_dims)
            env2 = Env(self.state_dims, self.action_dims)
            model_parameters1 = DiscreteActionModelParameters(
                actions=env1.actions,
                rl=rl_parameters,
                rainbow=rainbow_parameters,
                training=training_parameters1,
            )
            model_parameters2 = DiscreteActionModelParameters(
                actions=env2.actions,
                rl=rl_parameters,
                rainbow=rainbow_parameters,
                training=training_parameters2,
            )
            # minibatch_size / 8, minibatches_per_step * 8 should give the same result
            logger.info("Training model 1")
            trainer1 = self._train(model_parameters1, env1)
            # Reset the summary-writer globals between the two training runs.
            SummaryWriterContext._reset_globals()
            logger.info("Training model 2")
            trainer2 = self._train(model_parameters2, env2)

            weight1 = trainer1.q_network.fc.layers[-1].weight.detach().numpy()
            weight2 = trainer2.q_network.fc.layers[-1].weight.detach().numpy()

            # Due to numerical stability this tolerance has to be fairly high
            self.assertTrue(np.allclose(weight1, weight2, rtol=0.0, atol=1e-3))
        finally:
            self.epochs = _epochs
示例#12
0
    def write_summary(self, actions: List[str]):
        """Log the collected statistics of this object to the summary writer.

        Three groups of attributes are handled; attributes that are ``None``
        are skipped:

        * per-action counts of logged/model action indices (only when
          ``actions`` is non-empty), written as scalars;
        * column-shaped tensors (1-D or ``(N, 1)``), written as histogram
          plus mean via ``_log_histogram_and_mean``;
        * model outputs, which are either column-shaped (when there are no
          ``actions``) or have one column per action.

        Raises:
            AssertionError: a tensor in the second group is not column-shaped.
            ValueError: a model output matches neither accepted shape.
        """
        if actions:
            action_count_fields = (
                ("logged_actions", "actions/logged"),
                ("model_action_idxs", "actions/model"),
            )
            for attr_name, tag in action_count_fields:
                chosen = getattr(self, attr_name)
                if chosen is None:
                    continue
                for idx, action_name in enumerate(actions):
                    # Number of entries equal to this action's index.
                    count = (chosen == idx).sum().item()
                    SummaryWriterContext.add_scalar(
                        "{}/{}".format(tag, action_name), count
                    )

        column_fields = (
            ("td_loss", "td_loss"),
            ("imitator_loss", "imitator_loss"),
            ("reward_loss", "reward_loss"),
            ("logged_propensities", "propensities/logged"),
            ("logged_rewards", "reward/logged"),
            ("logged_values", "value/logged"),
            ("model_values_on_logged_actions", "value/model_logged_action"),
        )
        for attr_name, tag in column_fields:
            values = getattr(self, attr_name)
            if values is None:
                continue
            rank = len(values.shape)
            is_column = rank == 1 or (rank == 2 and values.shape[1] == 1)
            assert is_column, "Unexpected shape for {}: {}".format(
                attr_name, values.shape
            )
            self._log_histogram_and_mean(tag, values)

        model_fields = (
            ("model_propensities", "propensities/model"),
            ("model_rewards", "reward/model"),
            ("model_values", "value/model"),
        )
        for attr_name, tag in model_fields:
            values = getattr(self, attr_name)
            if values is None:
                continue
            rank = len(values.shape)
            is_column = rank == 1 or (rank == 2 and values.shape[1] == 1)
            if is_column and not actions:
                self._log_histogram_and_mean(tag, values)
            elif rank == 2 and values.shape[1] == len(actions):
                # One column per action: log each column under its own key.
                for idx, action_name in enumerate(actions):
                    self._log_histogram_and_mean(
                        f"{tag}/{action_name}", values[:, idx]
                    )
            else:
                raise ValueError(
                    "Unexpected shape for {}: {}; actions: {}".format(
                        attr_name, values.shape, actions
                    )
                )
示例#13
0
    def add_custom_scalars(action_names: Optional[List[str]]):
        """Register four per-action multiline charts with ``SummaryWriterContext``.

        For every action name one series is added to each chart: model and
        logged propensity means, and logged and model action counts. Nothing
        is registered when ``action_names`` is ``None`` or empty.
        """
        if not action_names:
            return

        # (series tag template, chart category, chart title)
        chart_specs = (
            ("propensities/model/{}/mean", "propensities", "model"),
            ("propensities/logged/{}/mean", "propensities", "logged"),
            ("actions/logged/{}", "actions", "logged"),
            ("actions/model/{}", "actions", "model"),
        )
        for tag_template, chart_category, chart_title in chart_specs:
            series = [tag_template.format(name) for name in action_names]
            SummaryWriterContext.add_custom_scalars_multilinechart(
                series, category=chart_category, title=chart_title
            )
示例#14
0
    def flush(self):
        """Summarise the losses collected since the last flush, then clear them.

        The incoming TD losses are always logged as a histogram and a mean,
        and the mean is appended to ``self.td_loss``. Reward losses get the
        same treatment when any were collected. A text summary is written to
        the logger line by line before both incoming buffers are emptied.
        """
        logger.info("Loss on {} batches".format(len(self.incoming_td_loss)))
        summary_text = "Loss:\n"

        td_batch_losses = torch.tensor(self.incoming_td_loss)
        SummaryWriterContext.add_histogram("td_loss", td_batch_losses)
        mean_td = float(td_batch_losses.mean())
        SummaryWriterContext.add_scalar("td_loss/mean", mean_td)
        self.td_loss.append(mean_td)
        summary_text += "TD LOSS: {0:.3f}\n".format(mean_td)

        if self.incoming_reward_loss:
            reward_batch_losses = torch.tensor(self.incoming_reward_loss)
            SummaryWriterContext.add_histogram("reward_loss", reward_batch_losses)
            mean_reward = float(reward_batch_losses.mean())
            SummaryWriterContext.add_scalar("reward_loss/mean", mean_reward)
            self.reward_loss.append(mean_reward)
            summary_text += "REWARD LOSS: {0:.3f}\n".format(mean_reward)

        # The trailing newline yields one final empty log line.
        for line in summary_text.split("\n"):
            logger.info(line)

        for pending in (self.incoming_td_loss, self.incoming_reward_loss):
            pending.clear()
示例#15
0
    def add_custom_scalars(action_names: Optional[List[str]]):
        """Set up the per-action propensity and action-count multiline charts.

        Returns immediately, registering nothing, when ``action_names`` is
        ``None`` or empty.
        """
        if not action_names:
            return

        def register(tag_template, chart_category, chart_title):
            # One series per action, all drawn on a single chart.
            SummaryWriterContext.add_custom_scalars_multilinechart(
                [tag_template.format(name) for name in action_names],
                category=chart_category,
                title=chart_title,
            )

        register("propensities/model/{}/mean", "propensities", "model")
        register("propensities/logged/{}/mean", "propensities", "logged")
        register("actions/logged/{}", "actions", "logged")
        register("actions/model/{}", "actions", "model")
示例#16
0
文件: actor.py 项目: joshrose/Horizon
    def forward(self, input):
        """Sample a tanh-squashed action for ``input.state``.

        When the module is not in training mode only the action is returned.
        Otherwise the output also carries the log-probability (summed over
        dim 1, minus the squash correction, reshaped to a column) and ``loc``
        as the action mean. Histograms of the intermediate tensors are logged
        whenever the global step is a multiple of 1000.
        """
        loc, scale_log = self._get_loc_and_scale_log(input.state)
        noise = torch.randn_like(scale_log, device=scale_log.device)
        scale = scale_log.exp()
        action = torch.tanh(loc + noise * scale)
        if not self.training:
            # ONNX doesn't like reshape either..
            return rlt.ActorOutput(action=action)
        # Since each dim are independent, log-prob is simply sum
        log_prob = self._log_prob(noise, scale_log)
        squash_correction = self._squash_correction(action)
        if SummaryWriterContext._global_step % 1000 == 0:
            histograms = (
                ("actor/forward/loc", loc),
                ("actor/forward/scale_log", scale_log),
                ("actor/forward/log_prob", log_prob),
                ("actor/forward/squash_correction", squash_correction),
            )
            for tag, tensor in histograms:
                SummaryWriterContext.add_histogram(tag, tensor.detach().cpu())
        summed_log_prob = torch.sum(log_prob - squash_correction, dim=1)

        return rlt.ActorOutput(
            action=action,
            log_prob=summed_log_prob.reshape(-1, 1),
            action_mean=loc,
        )
示例#17
0
文件: actor.py 项目: joshrose/Horizon
    def get_log_prob(self, state, squashed_action):
        """
        Action is expected to be squashed with tanh

        Returns the Normal log-probability of the un-squashed action minus the
        squash correction, summed over dim 1 and reshaped to a column.
        """
        loc, scale_log = self._get_loc_and_scale_log(state)
        # This is not getting exported; we can use it
        gaussian = Normal(loc, scale_log.exp())
        pre_squash_action = self._atanh(squashed_action)

        log_prob = gaussian.log_prob(pre_squash_action)
        squash_correction = self._squash_correction(squashed_action)
        if SummaryWriterContext._global_step % 1000 == 0:
            histograms = (
                ("actor/get_log_prob/loc", loc),
                ("actor/get_log_prob/scale_log", scale_log),
                ("actor/get_log_prob/log_prob", log_prob),
                ("actor/get_log_prob/squash_correction", squash_correction),
            )
            for tag, tensor in histograms:
                SummaryWriterContext.add_histogram(tag, tensor.detach().cpu())
        corrected = log_prob - squash_correction

        return torch.sum(corrected, dim=1).reshape(-1, 1)
示例#18
0
 def test_writing_stack(self):
     """Each scalar goes to the innermost active writer, tagged with step 0."""
     with TemporaryDirectory() as outer_dir, TemporaryDirectory() as inner_dir:
         outer_writer = SummaryWriter(outer_dir)
         inner_writer = SummaryWriter(inner_dir)
         outer_writer.add_scalar = MagicMock()
         inner_writer.add_scalar = MagicMock()
         with summary_writer_context(outer_writer):
             with summary_writer_context(inner_writer):
                 SummaryWriterContext.add_scalar("test2", torch.ones(1))
             # The inner context has exited; only the outer writer is active.
             SummaryWriterContext.add_scalar("test1", torch.zeros(1))
         expectations = (
             (outer_writer, "test1", torch.zeros(1)),
             (inner_writer, "test2", torch.ones(1)),
         )
         for target, tag, value in expectations:
             target.add_scalar.assert_called_once_with(tag, value, global_step=0)
示例#19
0
 def __init__(
     self,
     key: str,
     category: str,
     title: str,
     actions: List[str],
     log_key_prefix: Optional[str] = None,
 ):
     """Store the action list and register a per-action multiline chart.

     ``log_key_prefix`` falls back to ``"{category}/{title}"`` when it is
     not given (or is empty). One ``.../{action}/mean`` series per action is
     added to the chart identified by ``category`` and ``title``.
     """
     super().__init__(key)
     if log_key_prefix:
         self.log_key_prefix = log_key_prefix
     else:
         self.log_key_prefix = f"{category}/{title}"
     self.actions = actions
     mean_tags = [
         f"{self.log_key_prefix}/{action_name}/mean" for action_name in actions
     ]
     SummaryWriterContext.add_custom_scalars_multilinechart(
         mean_tags, category=category, title=title
     )
示例#20
0
 def _sample_action(self, loc: torch.Tensor, scale_log: torch.Tensor):
     """Sample a tanh-squashed action and return it with its log-probability.

     The log-probability is the actor network's ``_log_prob`` of the sampled
     noise minus the squash correction, summed over dim 1 and reshaped to a
     column. Histograms of the intermediate tensors are logged whenever the
     global step is a multiple of 1000.
     """
     noise = torch.randn_like(scale_log, device=scale_log.device)
     action = torch.tanh(loc + noise * scale_log.exp())
     # Since each dim are independent, log-prob is simply sum
     log_prob = self.actor_network._log_prob(noise, scale_log)
     squash_correction = self.actor_network._squash_correction(action)
     if SummaryWriterContext._global_step % 1000 == 0:
         histograms = (
             ("actor/forward/loc", loc),
             ("actor/forward/scale_log", scale_log),
             ("actor/forward/log_prob", log_prob),
             ("actor/forward/squash_correction", squash_correction),
         )
         for tag, tensor in histograms:
             SummaryWriterContext.add_histogram(tag, tensor.detach().cpu())
     summed_log_prob = torch.sum(log_prob - squash_correction, dim=1)
     return action, summed_log_prob.reshape(-1, 1)
示例#21
0
 def test_global_step(self):
     """Scalars are tagged with the global step current at the time of the call."""
     with TemporaryDirectory() as log_dir:
         summary_writer = SummaryWriter(log_dir)
         summary_writer.add_scalar = MagicMock()
         with summary_writer_context(summary_writer):
             SummaryWriterContext.add_scalar("test", torch.ones(1))
             SummaryWriterContext.increase_global_step()
             SummaryWriterContext.add_scalar("test", torch.zeros(1))
         expected_calls = [
             call("test", torch.ones(1), global_step=0),
             call("test", torch.zeros(1), global_step=1),
         ]
         summary_writer.add_scalar.assert_has_calls(expected_calls)
         # assert_has_calls tolerates extra calls, so pin the total as well.
         self.assertEqual(2, len(summary_writer.add_scalar.mock_calls))
示例#22
0
 def _log_prob(self, loc: torch.Tensor, scale_log: torch.Tensor,
               squashed_action: torch.Tensor):
     """Return the log-probability of a tanh-squashed action as a column.

     The action is un-squashed with the actor network's ``_atanh``, scored
     under ``Normal(loc, exp(scale_log))``, reduced by the squash correction
     and summed over dim 1. Histograms of the intermediate tensors are
     logged whenever the global step is a multiple of 1000.
     """
     # This is not getting exported; we can use it
     gaussian = torch.distributions.Normal(loc, scale_log.exp())
     pre_squash_action = self.actor_network._atanh(squashed_action)
     log_prob = gaussian.log_prob(pre_squash_action)
     squash_correction = self.actor_network._squash_correction(squashed_action)
     if SummaryWriterContext._global_step % 1000 == 0:
         histograms = (
             ("actor/get_log_prob/loc", loc),
             ("actor/get_log_prob/scale_log", scale_log),
             ("actor/get_log_prob/log_prob", log_prob),
             ("actor/get_log_prob/squash_correction", squash_correction),
         )
         for tag, tensor in histograms:
             SummaryWriterContext.add_histogram(tag, tensor.detach().cpu())
     corrected = log_prob - squash_correction
     return torch.sum(corrected, dim=1).reshape(-1, 1)
示例#23
0
 def test_global_step(self):
     """``increase_global_step`` changes the step attached to later scalars."""
     with TemporaryDirectory() as log_dir:
         active_writer = SummaryWriter(log_dir)
         scalar_mock = MagicMock()
         active_writer.add_scalar = scalar_mock
         with summary_writer_context(active_writer):
             SummaryWriterContext.add_scalar("test", torch.ones(1))
             SummaryWriterContext.increase_global_step()
             SummaryWriterContext.add_scalar("test", torch.zeros(1))
         first_call = call("test", torch.ones(1), global_step=0)
         second_call = call("test", torch.zeros(1), global_step=1)
         scalar_mock.assert_has_calls([first_call, second_call])
         # Exactly two calls in total: nothing else was written.
         self.assertEqual(2, len(scalar_mock.mock_calls))
示例#24
0
 def tearDown(self):
     """Reset the ``SummaryWriterContext`` globals after each test."""
     # Presumably clears the shared global step / writer stack so tests do not
     # influence each other — TODO confirm against SummaryWriterContext.
     SummaryWriterContext._reset_globals()
示例#25
0
    def write_summary(self, actions: List[str]):
        """Log the collected statistics of this object to the summary writer.

        Attributes that are ``None`` are skipped. Per-action counts are
        written as scalars (only when ``actions`` is non-empty); column-shaped
        tensors (1-D or ``(N, 1)``) are written as a histogram plus a
        ``.../mean`` scalar; model outputs are written either as one column
        (when there are no ``actions``) or as one column per action.

        Raises:
            AssertionError: a loss/logged tensor is not column-shaped.
            ValueError: a model output matches neither accepted shape.
        """
        if actions:
            action_count_fields = (
                ("logged_actions", "actions/logged"),
                ("model_action_idxs", "actions/model"),
            )
            for attr_name, tag in action_count_fields:
                chosen = getattr(self, attr_name)
                if chosen is None:
                    continue
                for idx, action_name in enumerate(actions):
                    # Number of entries equal to this action's index.
                    count = (chosen == idx).sum().item()
                    SummaryWriterContext.add_scalar(
                        "{}/{}".format(tag, action_name), count
                    )

        column_fields = (
            ("td_loss", "td_loss"),
            ("reward_loss", "reward_loss"),
            ("logged_propensities", "propensities/logged"),
            ("logged_rewards", "reward/logged"),
            ("logged_values", "value/logged"),
            ("model_values_on_logged_actions", "value/model_logged_action"),
        )
        for attr_name, tag in column_fields:
            values = getattr(self, attr_name)
            if values is None:
                continue
            rank = len(values.shape)
            is_column = rank == 1 or (rank == 2 and values.shape[1] == 1)
            assert is_column, "Unexpected shape for {}: {}".format(
                attr_name, values.shape
            )
            SummaryWriterContext.add_histogram(tag, values)
            SummaryWriterContext.add_scalar("{}/mean".format(tag), values.mean())

        model_fields = (
            ("model_propensities", "propensities/model"),
            ("model_rewards", "reward/model"),
            ("model_values", "value/model"),
        )
        for attr_name, tag in model_fields:
            values = getattr(self, attr_name)
            if values is None:
                continue
            rank = len(values.shape)
            is_column = rank == 1 or (rank == 2 and values.shape[1] == 1)
            if is_column and not actions:
                SummaryWriterContext.add_histogram(tag, values)
                SummaryWriterContext.add_scalar("{}/mean".format(tag), values.mean())
            elif rank == 2 and values.shape[1] == len(actions):
                # One column per action: log each column under its own key.
                for idx, action_name in enumerate(actions):
                    SummaryWriterContext.add_histogram(
                        "{}/{}".format(tag, action_name), values[:, idx]
                    )
                    SummaryWriterContext.add_scalar(
                        "{}/{}/mean".format(tag, action_name), values[:, idx].mean()
                    )
            else:
                raise ValueError(
                    "Unexpected shape for {}: {}; actions: {}".format(
                        attr_name, values.shape, actions
                    )
                )
示例#26
0
    def train(self, training_batch, evaluator=None) -> None:
        """
        Run one training step on ``training_batch``: update the value network,
        the Q network(s) and the actor, then soft-update the target value
        network. Optionally logs histograms and reports stats to ``evaluator``.

        IMPORTANT: the input action here is assumed to be preprocessed to match the
        range of the output of the actor.

        Args:
            training_batch: batch exposing ``training_input`` (or convertible
                via ``as_parametric_sarsa_training_batch``).
            evaluator: optional object whose ``report`` method receives a
                ``BatchStatsForCPE`` built from this step.
        """
        # Convert the batch first when it provides the converter.
        if hasattr(training_batch, "as_parametric_sarsa_training_batch"):
            training_batch = training_batch.as_parametric_sarsa_training_batch(
            )

        learning_input = training_batch.training_input
        self.minibatch += 1

        state = learning_input.state
        action = learning_input.action
        reward = learning_input.reward
        # Tensor shaped like ``reward`` with every entry set to gamma.
        discount = torch.full_like(reward, self.gamma)
        not_done_mask = learning_input.not_terminal

        # Map logged actions from the serving range to the training range.
        if self._should_scale_action_in_train():
            action = rlt.FeatureVector(
                rescale_torch_tensor(
                    action.float_features,
                    new_min=self.min_action_range_tensor_training,
                    new_max=self.max_action_range_tensor_training,
                    prev_min=self.min_action_range_tensor_serving,
                    prev_max=self.max_action_range_tensor_serving,
                ))

        current_state_action = rlt.StateAction(state=state, action=action)

        q1_value = self.q1_network(current_state_action).q_value
        min_q_value = q1_value

        # The second Q network is optional; when present use the elementwise
        # minimum of both estimates.
        if self.q2_network:
            q2_value = self.q2_network(current_state_action).q_value
            min_q_value = torch.min(q1_value, q2_value)

        # Use the minimum as target, ensure no gradient going through
        min_q_value = min_q_value.detach()

        #
        # First, optimize value network; minimizing MSE between
        # V(s) & Q(s, a) - log(pi(a|s))
        #

        state_value = self.value_network(state.float_features)  # .q_value

        if self.logged_action_uniform_prior:
            # With the uniform-prior flag set the log-prob term is zero, so the
            # target is just the (detached) minimum Q value.
            log_prob_a = torch.zeros_like(min_q_value)
            target_value = min_q_value
        else:
            with torch.no_grad():
                log_prob_a = self.actor_network.get_log_prob(
                    state, action.float_features)
                # Bound the log-probability to [-20, 20] before using it.
                log_prob_a = log_prob_a.clamp(-20.0, 20.0)
                target_value = min_q_value - self.entropy_temperature * log_prob_a

        value_loss = F.mse_loss(state_value, target_value)
        self.value_network_optimizer.zero_grad()
        value_loss.backward()
        self.value_network_optimizer.step()

        #
        # Second, optimize Q networks; minimizing MSE between
        # Q(s, a) & r + discount * V'(next_s)
        #

        with torch.no_grad():
            # Multiplying by not_done_mask zeroes the value where it is 0.
            next_state_value = (self.value_network_target(
                learning_input.next_state.float_features) * not_done_mask)

            # During reward burn-in the target is the reward alone.
            if self.minibatch < self.reward_burnin:
                target_q_value = reward
            else:
                target_q_value = reward + discount * next_state_value

        q1_loss = F.mse_loss(q1_value, target_q_value)
        self.q1_network_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_network_optimizer.step()
        if self.q2_network:
            q2_loss = F.mse_loss(q2_value, target_q_value)
            self.q2_network_optimizer.zero_grad()
            q2_loss.backward()
            self.q2_network_optimizer.step()

        #
        # Lastly, optimize the actor; minimizing KL-divergence between action propensity
        # & softmax of value. Due to reparameterization trick, it ends up being
        # log_prob(actor_action) - Q(s, actor_action)
        #

        actor_output = self.actor_network(rlt.StateInput(state=state))

        # Score the actor's own action with the Q network(s).
        state_actor_action = rlt.StateAction(
            state=state,
            action=rlt.FeatureVector(float_features=actor_output.action))
        q1_actor_value = self.q1_network(state_actor_action).q_value
        min_q_actor_value = q1_actor_value
        if self.q2_network:
            q2_actor_value = self.q2_network(state_actor_action).q_value
            min_q_actor_value = torch.min(q1_actor_value, q2_actor_value)

        actor_loss = (self.entropy_temperature * actor_output.log_prob -
                      min_q_actor_value)
        # Do this in 2 steps so we can log histogram of actor loss
        actor_loss_mean = actor_loss.mean()
        self.actor_network_optimizer.zero_grad()
        actor_loss_mean.backward()
        self.actor_network_optimizer.step()

        if self.minibatch < self.reward_burnin:
            # Reward burnin: force target network
            self._soft_update(self.value_network, self.value_network_target,
                              1.0)
        else:
            # Use the soft update rule to update both target networks
            self._soft_update(self.value_network, self.value_network_target,
                              self.tau)

        # Logging at the end to schedule all the cuda operations first
        if (self.tensorboard_logging_freq is not None
                and self.minibatch % self.tensorboard_logging_freq == 0):
            SummaryWriterContext.add_histogram("q1/logged_state_value",
                                               q1_value)
            if self.q2_network:
                SummaryWriterContext.add_histogram("q2/logged_state_value",
                                                   q2_value)

            SummaryWriterContext.add_histogram("log_prob_a", log_prob_a)
            SummaryWriterContext.add_histogram("value_network/target",
                                               target_value)
            SummaryWriterContext.add_histogram("q_network/next_state_value",
                                               next_state_value)
            SummaryWriterContext.add_histogram("q_network/target_q_value",
                                               target_q_value)
            SummaryWriterContext.add_histogram("actor/min_q_actor_value",
                                               min_q_actor_value)
            SummaryWriterContext.add_histogram("actor/action_log_prob",
                                               actor_output.log_prob)
            SummaryWriterContext.add_histogram("actor/loss", actor_loss)

        # Hand detached NumPy copies of this step's statistics to the evaluator.
        if evaluator is not None:
            cpe_stats = BatchStatsForCPE(
                td_loss=q1_loss.detach().cpu().numpy(),
                logged_rewards=reward.detach().cpu().numpy(),
                model_values_on_logged_actions=q1_value.detach().cpu().numpy(),
                model_propensities=actor_output.log_prob.exp().detach().cpu().
                numpy(),
                model_values=min_q_actor_value.detach().cpu().numpy(),
            )
            evaluator.report(cpe_stats)
示例#27
0
    def train(self, training_batch) -> None:
        """
        Run one optimization step over a training batch.

        In order, this updates the value network, the Q network(s) and the
        actor network, then updates the target value network, optionally logs
        histograms through ``SummaryWriterContext``, and reports losses via
        ``self.loss_reporter``.

        IMPORTANT: the input action here is assumed to be preprocessed to match the
        range of the output of the actor.

        :param training_batch: batch exposing ``training_input`` (read here for
            ``state``, ``action``, ``reward``, ``next_state`` and
            ``not_terminal``). Batches that offer
            ``as_parametric_sarsa_training_batch`` are converted first.
        """
        # Convert to the parametric SARSA batch layout when the batch supports it.
        if hasattr(training_batch, "as_parametric_sarsa_training_batch"):
            training_batch = training_batch.as_parametric_sarsa_training_batch()

        learning_input = training_batch.training_input
        self.minibatch += 1

        state = learning_input.state
        action = learning_input.action
        reward = learning_input.reward
        # Discount tensor with the same shape as reward, filled with gamma.
        discount = torch.full_like(reward, self.gamma)
        not_done_mask = learning_input.not_terminal

        # Rescale logged actions from the serving range to the training range.
        if self._should_scale_action_in_train():
            action = rlt.FeatureVector(
                rescale_torch_tensor(
                    action.float_features,
                    new_min=self.min_action_range_tensor_training,
                    new_max=self.max_action_range_tensor_training,
                    prev_min=self.min_action_range_tensor_serving,
                    prev_max=self.max_action_range_tensor_serving,
                )
            )

        current_state_action = rlt.StateAction(state=state, action=action)

        q1_value = self.q1_network(current_state_action).q_value
        min_q_value = q1_value

        # The second Q network is optional; when present the element-wise
        # minimum of both estimates is used.
        if self.q2_network:
            q2_value = self.q2_network(current_state_action).q_value
            min_q_value = torch.min(q1_value, q2_value)

        # Use the minimum as target, ensure no gradient going through
        min_q_value = min_q_value.detach()

        #
        # First, optimize value network; minimizing MSE between
        # V(s) & Q(s, a) - log(pi(a|s))
        #

        state_value = self.value_network(state.float_features)  # .q_value

        if self.logged_action_uniform_prior:
            # With a uniform prior the log-probability term is taken as zero,
            # so the value target is just the (detached) minimum Q value.
            log_prob_a = torch.zeros_like(min_q_value)
            target_value = min_q_value
        else:
            with torch.no_grad():
                log_prob_a = self.actor_network.get_log_prob(
                    state, action.float_features
                )
                # Clamp to keep the entropy term bounded.
                log_prob_a = log_prob_a.clamp(-20.0, 20.0)
                target_value = min_q_value - self.entropy_temperature * log_prob_a

        value_loss = F.mse_loss(state_value, target_value)
        self.value_network_optimizer.zero_grad()
        value_loss.backward()
        self.value_network_optimizer.step()

        #
        # Second, optimize Q networks; minimizing MSE between
        # Q(s, a) & r + discount * V'(next_s)
        #

        with torch.no_grad():
            # The target value network's next-state estimate is multiplied by
            # the not_terminal mask.
            next_state_value = (
                self.value_network_target(learning_input.next_state.float_features)
                * not_done_mask.float()
            )

            # While minibatch < reward_burnin the Q target is the reward alone.
            if self.minibatch < self.reward_burnin:
                target_q_value = reward
            else:
                target_q_value = reward + discount * next_state_value

        q1_loss = F.mse_loss(q1_value, target_q_value)
        self.q1_network_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_network_optimizer.step()
        if self.q2_network:
            q2_loss = F.mse_loss(q2_value, target_q_value)
            self.q2_network_optimizer.zero_grad()
            q2_loss.backward()
            self.q2_network_optimizer.step()

        #
        # Lastly, optimize the actor; minimizing KL-divergence between action propensity
        # & softmax of value. Due to reparameterization trick, it ends up being
        # log_prob(actor_action) - Q(s, actor_action)
        #

        actor_output = self.actor_network(rlt.StateInput(state=state))

        # Evaluate the Q network(s) at the action produced by the actor.
        state_actor_action = rlt.StateAction(
            state=state, action=rlt.FeatureVector(float_features=actor_output.action)
        )
        q1_actor_value = self.q1_network(state_actor_action).q_value
        min_q_actor_value = q1_actor_value
        if self.q2_network:
            q2_actor_value = self.q2_network(state_actor_action).q_value
            min_q_actor_value = torch.min(q1_actor_value, q2_actor_value)

        actor_loss = (
            self.entropy_temperature * actor_output.log_prob - min_q_actor_value
        )
        # Do this in 2 steps so we can log histogram of actor loss
        actor_loss_mean = actor_loss.mean()
        self.actor_network_optimizer.zero_grad()
        actor_loss_mean.backward()
        self.actor_network_optimizer.step()

        if self.minibatch < self.reward_burnin:
            # Reward burnin: force target network
            self._soft_update(self.value_network, self.value_network_target, 1.0)
        else:
            # Use the soft update rule to update both target networks
            self._soft_update(self.value_network, self.value_network_target, self.tau)

        # Logging at the end to schedule all the cuda operations first
        if (
            self.tensorboard_logging_freq is not None
            and self.minibatch % self.tensorboard_logging_freq == 0
        ):
            SummaryWriterContext.add_histogram("q1/logged_state_value", q1_value)
            if self.q2_network:
                SummaryWriterContext.add_histogram("q2/logged_state_value", q2_value)

            SummaryWriterContext.add_histogram("log_prob_a", log_prob_a)
            SummaryWriterContext.add_histogram("value_network/target", target_value)
            SummaryWriterContext.add_histogram(
                "q_network/next_state_value", next_state_value
            )
            SummaryWriterContext.add_histogram(
                "q_network/target_q_value", target_q_value
            )
            SummaryWriterContext.add_histogram(
                "actor/min_q_actor_value", min_q_actor_value
            )
            SummaryWriterContext.add_histogram(
                "actor/action_log_prob", actor_output.log_prob
            )
            SummaryWriterContext.add_histogram("actor/loss", actor_loss)

        # Report the Q1 loss (as a Python float) and per-sample statistics.
        self.loss_reporter.report(
            td_loss=float(q1_loss),
            reward_loss=None,
            logged_rewards=reward,
            model_values_on_logged_actions=q1_value,
            model_propensities=actor_output.log_prob.exp(),
            model_values=min_q_actor_value,
        )
示例#28
0
 def setUp(self):
     """Set root logging to INFO, reset SummaryWriterContext globals, seed RNGs."""
     root_logger = logging.getLogger()
     root_logger.setLevel(logging.INFO)
     SummaryWriterContext._reset_globals()
     # Seed numpy, torch and the stdlib RNG with the same module-level SEED.
     for seed_fn in (np.random.seed, torch.manual_seed, random.seed):
         seed_fn(SEED)
示例#29
0
    def train(self, training_batch) -> None:
        """
        Run one training step over a batch.

        In order: optionally optimize the entropy temperature (alpha), optimize
        the Q network(s), optimize the actor, and, when a value network is
        configured, optimize the value network. Target networks are then
        updated, histograms are optionally logged through
        ``SummaryWriterContext``, and losses are reported via
        ``self.loss_reporter``.

        IMPORTANT: the input action here is assumed to be preprocessed to match the
        range of the output of the actor.

        :param training_batch: batch exposing ``training_input`` (read here for
            ``state``, ``action``, ``reward``, ``next_state`` and
            ``not_terminal``). Batches that offer
            ``as_parametric_sarsa_training_batch`` are converted first.
        """
        # Convert to the parametric SARSA batch layout when the batch supports it.
        if hasattr(training_batch, "as_parametric_sarsa_training_batch"):
            training_batch = training_batch.as_parametric_sarsa_training_batch(
            )

        learning_input = training_batch.training_input
        self.minibatch += 1

        state = learning_input.state
        action = learning_input.action
        reward = learning_input.reward
        # Discount tensor with the same shape as reward, filled with gamma.
        discount = torch.full_like(reward, self.gamma)
        not_done_mask = learning_input.not_terminal

        # Rescale logged actions from the serving range to the training range.
        if self._should_scale_action_in_train():
            action = rlt.FeatureVector(
                rescale_torch_tensor(
                    action.float_features,
                    new_min=self.min_action_range_tensor_training,
                    new_max=self.max_action_range_tensor_training,
                    prev_min=self.min_action_range_tensor_serving,
                    prev_max=self.max_action_range_tensor_serving,
                ))

        # Gradients are explicitly enabled for everything in this block.
        # NOTE(review): no zero_grad() call is visible in this method;
        # presumably _maybe_run_optimizer handles stepping and zeroing every
        # `minibatches_per_step` calls - confirm in that helper.
        with torch.enable_grad():
            #
            # First, optimize Q networks; minimizing MSE between
            # Q(s, a) & r + discount * V'(next_s)
            #

            current_state_action = rlt.StateAction(state=state, action=action)
            q1_value = self.q1_network(current_state_action).q_value
            if self.q2_network:
                q2_value = self.q2_network(current_state_action).q_value
            actor_output = self.actor_network(rlt.StateInput(state=state))

            # Optimize Alpha
            if self.alpha_optimizer is not None:
                # (log_prob + target_entropy) is detached, so this loss only
                # produces a gradient for log_alpha.
                alpha_loss = -(self.log_alpha *
                               (actor_output.log_prob +
                                self.target_entropy).detach()).mean()
                self.alpha_optimizer.zero_grad()
                alpha_loss.backward()
                self.alpha_optimizer.step()
                self.entropy_temperature = self.log_alpha.exp()

            with torch.no_grad():
                if self.value_network is not None:
                    next_state_value = self.value_network_target(
                        learning_input.next_state.float_features)
                else:
                    # Without a value network, the next-state value comes from
                    # the target Q network(s) evaluated at the actor's
                    # next-state action, minus the entropy term below.
                    next_state_actor_output = self.actor_network(
                        rlt.StateInput(state=learning_input.next_state))
                    next_state_actor_action = rlt.StateAction(
                        state=learning_input.next_state,
                        action=rlt.FeatureVector(
                            float_features=next_state_actor_output.action),
                    )
                    next_state_value = self.q1_network_target(
                        next_state_actor_action).q_value

                    if self.q2_network is not None:
                        target_q2_value = self.q2_network_target(
                            next_state_actor_action).q_value
                        next_state_value = torch.min(next_state_value,
                                                     target_q2_value)

                    log_prob_a = self.actor_network.get_log_prob(
                        learning_input.next_state,
                        next_state_actor_output.action)
                    # Clamp to keep the entropy term bounded.
                    log_prob_a = log_prob_a.clamp(-20.0, 20.0)
                    next_state_value -= self.entropy_temperature * log_prob_a

                # The bootstrapped term is multiplied by the not_terminal mask.
                target_q_value = (
                    reward +
                    discount * next_state_value * not_done_mask.float())

            q1_loss = F.mse_loss(q1_value, target_q_value)
            q1_loss.backward()
            self._maybe_run_optimizer(self.q1_network_optimizer,
                                      self.minibatches_per_step)
            if self.q2_network:
                q2_loss = F.mse_loss(q2_value, target_q_value)
                q2_loss.backward()
                self._maybe_run_optimizer(self.q2_network_optimizer,
                                          self.minibatches_per_step)

            #
            # Second, optimize the actor; minimizing KL-divergence between action propensity
            # & softmax of value. Due to reparameterization trick, it ends up being
            # log_prob(actor_action) - Q(s, actor_action)
            #

            # Evaluate the Q network(s) at the action produced by the actor.
            state_actor_action = rlt.StateAction(
                state=state,
                action=rlt.FeatureVector(float_features=actor_output.action),
            )
            q1_actor_value = self.q1_network(state_actor_action).q_value
            min_q_actor_value = q1_actor_value
            if self.q2_network:
                q2_actor_value = self.q2_network(state_actor_action).q_value
                min_q_actor_value = torch.min(q1_actor_value, q2_actor_value)

            actor_loss = (self.entropy_temperature * actor_output.log_prob -
                          min_q_actor_value)
            # Do this in 2 steps so we can log histogram of actor loss
            actor_loss_mean = actor_loss.mean()
            actor_loss_mean.backward()
            self._maybe_run_optimizer(self.actor_network_optimizer,
                                      self.minibatches_per_step)

            #
            # Lastly, if applicable, optimize value network; minimizing MSE between
            # V(s) & E_a~pi(s) [ Q(s,a) - log(pi(a|s)) ]
            #

            if self.value_network is not None:
                state_value = self.value_network(state.float_features)

                if self.logged_action_uniform_prior:
                    # With a uniform prior the log-probability term is taken
                    # as zero, so the target is the minimum actor Q value.
                    log_prob_a = torch.zeros_like(min_q_actor_value)
                    target_value = min_q_actor_value
                else:
                    with torch.no_grad():
                        log_prob_a = actor_output.log_prob
                        log_prob_a = log_prob_a.clamp(-20.0, 20.0)
                        target_value = (min_q_actor_value -
                                        self.entropy_temperature * log_prob_a)

                # The target is detached so only the value network is trained
                # by this loss.
                value_loss = F.mse_loss(state_value, target_value.detach())
                value_loss.backward()
                self._maybe_run_optimizer(self.value_network_optimizer,
                                          self.minibatches_per_step)

        # Use the soft update rule to update the target networks
        if self.value_network is not None:
            self._maybe_soft_update(
                self.value_network,
                self.value_network_target,
                self.tau,
                self.minibatches_per_step,
            )
        else:
            self._maybe_soft_update(
                self.q1_network,
                self.q1_network_target,
                self.tau,
                self.minibatches_per_step,
            )
            if self.q2_network is not None:
                self._maybe_soft_update(
                    self.q2_network,
                    self.q2_network_target,
                    self.tau,
                    self.minibatches_per_step,
                )

        # Logging at the end to schedule all the cuda operations first
        if (self.tensorboard_logging_freq is not None
                and self.minibatch % self.tensorboard_logging_freq == 0):
            SummaryWriterContext.add_histogram("q1/logged_state_value",
                                               q1_value)
            if self.q2_network:
                SummaryWriterContext.add_histogram("q2/logged_state_value",
                                                   q2_value)

            SummaryWriterContext.add_histogram("log_prob_a", log_prob_a)
            # target_value only exists when a value network is configured.
            if self.value_network:
                SummaryWriterContext.add_histogram("value_network/target",
                                                   target_value)

            SummaryWriterContext.add_histogram("q_network/next_state_value",
                                               next_state_value)
            SummaryWriterContext.add_histogram("q_network/target_q_value",
                                               target_q_value)
            SummaryWriterContext.add_histogram("actor/min_q_actor_value",
                                               min_q_actor_value)
            SummaryWriterContext.add_histogram("actor/action_log_prob",
                                               actor_output.log_prob)
            SummaryWriterContext.add_histogram("actor/loss", actor_loss)

        # Report the Q1 loss (as a Python float) and per-sample statistics.
        self.loss_reporter.report(
            td_loss=float(q1_loss),
            reward_loss=None,
            logged_rewards=reward,
            model_values_on_logged_actions=q1_value,
            model_propensities=actor_output.log_prob.exp(),
            model_values=min_q_actor_value,
        )
示例#30
0
 def setUp(self):
     """Reset SummaryWriterContext globals (via ``_reset_globals``) before the test runs."""
     SummaryWriterContext._reset_globals()
示例#31
0
    def dist(self, input: rlt.PreprocessedState):
        """
        Compute the dueling-network output over actions and atoms.

        The state features pass through the hidden layers (with optional batch
        norm), then the value head output (with an axis inserted at dim 1) is
        added to the advantage head output, which is reshaped to
        ``(-1, num_actions, num_atoms)`` and mean-centered over dim 1.

        When ``SummaryWriterContext._global_step`` is a multiple of 1000,
        histograms and means of the value, Q-value and advantage tensors are
        also logged.

        :param input: object exposing ``state.float_features``.
        :return: the ``value + advantage`` tensor.
        """
        hidden = input.state.float_features
        for idx, activation in enumerate(self.activations[:-1]):
            if self.use_batch_norm:
                hidden = self.batch_norm_ops[idx](hidden)

            hidden = self.layers[idx](hidden)
            # "linear" means no non-linearity after the layer.
            if activation != "linear":
                if activation == "tanh":
                    hidden = torch.tanh(hidden)
                else:
                    hidden = getattr(F, activation)(hidden)

        value = self.value(hidden).unsqueeze(dim=1)
        raw_advantage = self.advantage(hidden).reshape(
            -1, self.num_actions, self.num_atoms
        )
        advantage = raw_advantage - raw_advantage.mean(dim=1, keepdim=True)
        q_value = value + advantage

        # Only log on every 1000th global step.
        if SummaryWriterContext._global_step % 1000 != 0:
            return q_value

        logged_tensors = (
            ("dueling_network/{}/value", "dueling_network/{}/mean_value", value),
            (
                "dueling_network/{}/q_value",
                "dueling_network/{}/mean_q_value",
                q_value,
            ),
            (
                "dueling_network/{}/raw_advantage",
                "dueling_network/{}/mean_raw_advantage",
                raw_advantage,
            ),
        )
        for histogram_tag, scalar_tag, tensor in logged_tensors:
            detached = tensor.detach()
            SummaryWriterContext.add_histogram(
                histogram_tag.format(self._name), detached.mean(dim=2).cpu()
            )
            SummaryWriterContext.add_scalar(
                scalar_tag.format(self._name), detached.mean().cpu()
            )

        # Per-action statistics, averaged over the last axis.
        for action_idx in range(advantage.shape[1]):
            per_action = advantage.detach()[:, action_idx, :].mean(dim=1)
            SummaryWriterContext.add_histogram(
                "dueling_network/{}/advantage/{}".format(self._name, action_idx),
                per_action.cpu(),
            )
            SummaryWriterContext.add_scalar(
                "dueling_network/{}/mean_advantage/{}".format(
                    self._name, action_idx),
                per_action.mean().cpu(),
            )

        return q_value
示例#32
0
 def test_with_none(self):
     """``add_scalar`` must return None inside a ``summary_writer_context(None)``."""
     with summary_writer_context(None):
         outcome = SummaryWriterContext.add_scalar("test", torch.ones(1))
         self.assertIsNone(outcome)
示例#33
0
    def forward(self,
                input) -> Union[NamedTuple, torch.FloatTensor]:  # type: ignore
        """
        Compute dueling-network Q-values for ``input``.

        The state features pass through the hidden layers (with optional batch
        norm). The value head is applied to the hidden features; the advantage
        head is applied to the hidden features concatenated (along dim 1) with
        the action features when ``self.parametric_action`` is set. In the
        non-parametric case the advantage is mean-centered over dim 1 before
        being added to the value.

        When ``SummaryWriterContext._global_step`` is a multiple of 1000,
        histograms and means of the value, Q-value and advantage tensors are
        also logged.

        :param input: object exposing ``state.float_features`` and, when
            ``self.parametric_action`` is set, ``action.float_features``.
        :return: ``rlt.SingleQValue`` when ``self.parametric_action`` is set,
            otherwise ``rlt.AllActionQValues``.
        """
        state = input.state.float_features
        # Action features are only read for the parametric-action variant.
        action = input.action.float_features if self.parametric_action else None

        x = state
        for i, activation in enumerate(self.activations[:-1]):
            if self.use_batch_norm:
                x = self.batch_norm_ops[i](x)

            x = self.layers[i](x)
            if activation == "linear":
                # No non-linearity after this layer.
                continue
            elif activation == "tanh":
                activation_func = torch.tanh
            else:
                activation_func = getattr(F, activation)
            x = activation_func(x)

        # The value head sees only the state-derived features.
        value = self.value(x)
        if action is not None:
            x = torch.cat((x, action), dim=1)
        raw_advantage = self.advantage(x)
        if self.parametric_action:
            advantage = raw_advantage
        else:
            advantage = raw_advantage - raw_advantage.mean(dim=1, keepdim=True)

        q_value = value + advantage

        if SummaryWriterContext._global_step % 1000 == 0:
            SummaryWriterContext.add_histogram(
                "dueling_network/{}/value".format(self._name),
                value.detach().cpu())
            SummaryWriterContext.add_scalar(
                "dueling_network/{}/mean_value".format(self._name),
                value.detach().mean().cpu(),
            )
            SummaryWriterContext.add_histogram(
                "dueling_network/{}/q_value".format(self._name),
                q_value.detach().cpu())
            SummaryWriterContext.add_scalar(
                "dueling_network/{}/mean_q_value".format(self._name),
                q_value.detach().mean().cpu(),
            )
            SummaryWriterContext.add_histogram(
                "dueling_network/{}/raw_advantage".format(self._name),
                raw_advantage.detach().cpu(),
            )
            SummaryWriterContext.add_scalar(
                "dueling_network/{}/mean_raw_advantage".format(self._name),
                raw_advantage.detach().mean().cpu(),
            )
            if not self.parametric_action:
                # Per-index statistics along dim 1 of the centered advantage.
                for i in range(advantage.shape[1]):
                    a = advantage.detach()[:, i]
                    SummaryWriterContext.add_histogram(
                        "dueling_network/{}/advantage/{}".format(
                            self._name, i), a.cpu())
                    SummaryWriterContext.add_scalar(
                        "dueling_network/{}/mean_advantage/{}".format(
                            self._name, i),
                        a.mean().cpu(),
                    )

        # The former `output_tensor` flag was hard-coded to False, so the raw
        # tensor return path was unreachable and has been removed.
        if self.parametric_action:
            return rlt.SingleQValue(q_value=q_value)  # type: ignore
        return rlt.AllActionQValues(q_values=q_value)  # type: ignore
示例#34
0
    def train(self, training_batch) -> None:
        """
        Run one training step over a batch.

        The Q network(s) are trained on every call against a target built from
        the target actor (with clipped noise added to its action) and the
        minimum of the target Q networks. The actor and all target networks
        are only updated when ``self.minibatch`` is a multiple of
        ``self.delayed_policy_update``.

        The actor loss therefore only exists on minibatches where the delayed
        update fires; the ``actor/loss`` histogram is skipped on the others
        instead of raising ``UnboundLocalError``.

        IMPORTANT: the input action here is assumed to be preprocessed to match the
        range of the output of the actor.

        :param training_batch: batch exposing ``training_input`` (read here for
            ``state``, ``action``, ``next_state``, ``reward`` and
            ``not_terminal``). Batches that offer
            ``as_parametric_sarsa_training_batch`` are converted first.
        """
        if hasattr(training_batch, "as_parametric_sarsa_training_batch"):
            training_batch = training_batch.as_parametric_sarsa_training_batch(
            )

        learning_input = training_batch.training_input
        self.minibatch += 1

        state = learning_input.state
        action = learning_input.action
        next_state = learning_input.next_state
        reward = learning_input.reward
        not_done_mask = learning_input.not_terminal

        action = self._maybe_scale_action_in_train(action)

        # Compute current value estimates
        current_state_action = rlt.StateAction(state=state, action=action)
        q1_value = self.q1_network(current_state_action).q_value
        if self.q2_network:
            q2_value = self.q2_network(current_state_action).q_value
        actor_action = self.actor_network(rlt.StateInput(state=state)).action

        # Generate target = r + y * min (Q1(s',pi(s')), Q2(s',pi(s')))
        with torch.no_grad():
            next_actor = self.actor_network_target(
                rlt.StateInput(state=next_state)).action
            # Add clipped Gaussian noise to the target action.
            next_actor += (torch.randn_like(next_actor) *
                           self.target_policy_smoothing).clamp(
                               -self.noise_clip, self.noise_clip)
            # Keep the noisy action inside the training action range.
            next_actor = torch.max(
                torch.min(next_actor, self.max_action_range_tensor_training),
                self.min_action_range_tensor_training,
            )
            next_state_actor = rlt.StateAction(
                state=next_state,
                action=rlt.FeatureVector(float_features=next_actor))
            next_state_value = self.q1_network_target(next_state_actor).q_value

            if self.q2_network is not None:
                next_state_value = torch.min(
                    next_state_value,
                    self.q2_network_target(next_state_actor).q_value)

            # The bootstrapped term is multiplied by the not_terminal mask.
            target_q_value = (
                reward + self.gamma * next_state_value * not_done_mask.float())

        # Optimize Q1 and Q2
        q1_loss = F.mse_loss(q1_value, target_q_value)
        q1_loss.backward()
        self._maybe_run_optimizer(self.q1_network_optimizer,
                                  self.minibatches_per_step)
        if self.q2_network:
            q2_loss = F.mse_loss(q2_value, target_q_value)
            q2_loss.backward()
            self._maybe_run_optimizer(self.q2_network_optimizer,
                                      self.minibatches_per_step)

        # Stays None on minibatches where the delayed policy update is skipped,
        # so the logging block below can tell whether there is anything to log.
        actor_loss = None

        # Only update actor and target networks after a fixed number of Q updates
        if self.minibatch % self.delayed_policy_update == 0:
            actor_loss = -self.q1_network(
                rlt.StateAction(
                    state=state,
                    action=rlt.FeatureVector(
                        float_features=actor_action))).q_value.mean()
            actor_loss.backward()
            self._maybe_run_optimizer(self.actor_network_optimizer,
                                      self.minibatches_per_step)

            # Use the soft update rule to update the target networks
            self._maybe_soft_update(
                self.q1_network,
                self.q1_network_target,
                self.tau,
                self.minibatches_per_step,
            )
            self._maybe_soft_update(
                self.actor_network,
                self.actor_network_target,
                self.tau,
                self.minibatches_per_step,
            )
            if self.q2_network is not None:
                self._maybe_soft_update(
                    self.q2_network,
                    self.q2_network_target,
                    self.tau,
                    self.minibatches_per_step,
                )

        # Logging at the end to schedule all the cuda operations first
        if (self.tensorboard_logging_freq is not None
                and self.minibatch % self.tensorboard_logging_freq == 0):
            SummaryWriterContext.add_histogram("q1/logged_state_value",
                                               q1_value)
            if self.q2_network:
                SummaryWriterContext.add_histogram("q2/logged_state_value",
                                                   q2_value)

            SummaryWriterContext.add_histogram("q_network/next_state_value",
                                               next_state_value)
            SummaryWriterContext.add_histogram("q_network/target_q_value",
                                               target_q_value)
            # The actor loss is only available on delayed-update minibatches.
            if actor_loss is not None:
                SummaryWriterContext.add_histogram("actor/loss", actor_loss)

        # Report the Q1 loss (as a Python float) and per-sample statistics.
        self.loss_reporter.report(
            td_loss=float(q1_loss),
            reward_loss=None,
            logged_rewards=reward,
            model_values_on_logged_actions=q1_value,
        )
示例#35
0
 def handle(self, tdp: TrainingDataPage) -> None:
     """Advance the SummaryWriterContext global step, then train ``self.trainer`` on ``tdp``."""
     SummaryWriterContext.increase_global_step()
     self.trainer.train(tdp)
示例#36
0
 def test_with_none(self):
     """With ``None`` as the writer, ``add_scalar`` is expected to return None."""
     with summary_writer_context(None):
         scalar_result = SummaryWriterContext.add_scalar("test", torch.ones(1))
         self.assertIsNone(scalar_result)
示例#37
0
    def evaluate_batch(self):
        """
        Merge the accumulated batches and log summary statistics.

        Each entry of ``self.all_batches`` is stacked with ``np.vstack`` (or
        replaced by ``None`` when empty). Histograms and means of the available
        quantities are sent to ``SummaryWriterContext``, the mean TD loss is
        appended to ``self.td_loss``, action-count bookkeeping on ``self`` is
        updated, and a textual summary is written line by line to ``logger``.
        """
        # One stacked array per tracked field; None where nothing was collected.
        merged_inputs = [
            np.vstack(batch) if len(batch) > 0 else None
            for batch in self.all_batches
        ]

        (
            td_loss,
            logged_actions,
            logged_propensities,
            logged_rewards,
            logged_values,
            model_propensities,
            model_rewards,
            model_values,
            model_values_on_logged_actions,
            model_action_idxs,
        ) = merged_inputs

        logger.info("Evaluating on {} batches".format(len(
            self.td_loss_batches)))
        # Pieces of the textual report; joined and split on newlines at the end.
        detail_parts = ["Evaluator:\n"]
        if td_loss is not None:
            SummaryWriterContext.add_histogram("td_loss", td_loss)
            td_loss_mean = float(np.mean(td_loss))
            SummaryWriterContext.add_scalar("td_loss/mean", td_loss_mean)
            self.td_loss.append(td_loss_mean)
            detail_parts.append("TD LOSS: {0:.3f}\n".format(td_loss_mean))

        # Histogram + mean for each optional reward/value array, in fixed order.
        histogram_and_mean = (
            ("reward/logged", "reward/logged/mean", logged_rewards),
            ("reward/model", "reward/model/mean", model_rewards),
            ("value/logged", "value/logged/mean", logged_values),
            ("value/model", "value/model/mean", model_values),
        )
        for histogram_tag, mean_tag, values in histogram_and_mean:
            if values is None:
                continue
            SummaryWriterContext.add_histogram(histogram_tag, values)
            SummaryWriterContext.add_scalar(mean_tag, values.mean())

        if model_values_on_logged_actions is not None:
            SummaryWriterContext.add_histogram("value/model_logged_action",
                                               model_values_on_logged_actions)

        # TODO: log summary of logged propensities

        if model_propensities is not None and self.action_names:
            rank = len(model_propensities.shape)
            if rank == 1:
                SummaryWriterContext.add_histogram("propensities/model",
                                                   model_propensities)
                SummaryWriterContext.add_scalar("propensities/model/mean",
                                                model_propensities.mean())
            elif rank == 2:
                # One column per action name.
                for column_idx, action_name in enumerate(self.action_names):
                    column = model_propensities[:, column_idx]
                    SummaryWriterContext.add_histogram(
                        "propensities/model/{}".format(action_name), column
                    )
                    SummaryWriterContext.add_scalar(
                        "propensities/model/{}/mean".format(action_name),
                        column.mean(),
                    )

        if logged_actions is not None and model_action_idxs is not None:
            _, logged_action_counts = self._get_batch_logged_actions(
                [logged_actions])
            model_action_distr, model_action_counts = self._get_batch_model_actions(
                [model_action_idxs])
            detail_parts.append(
                "The distribution of logged actions : {}\n".format(
                    logged_action_counts))
            detail_parts.append(
                "The distribution of model actions : {}\n".format(
                    model_action_counts))

            # Update the running bookkeeping kept on the evaluator.
            for action, count in logged_action_counts.items():
                self.logged_action_counts[action] += count

            for action, count in model_action_counts.items():
                self.model_action_counts[action].append(count)
                self.model_action_counts_cumulative[action] += count

            for action, val in model_action_distr.items():
                self.model_action_distr[action].append(val)

            # Log to tensorboard
            for action_name, count in logged_action_counts.items():
                SummaryWriterContext.add_scalar(
                    "actions/logged/{}".format(action_name), count)
            for action_name, count in model_action_counts.items():
                SummaryWriterContext.add_scalar(
                    "actions/model/{}".format(action_name), count)

        detail_parts.append("Batch Evaluator Finished")
        for detail_line in "".join(detail_parts).split("\n"):
            logger.info(detail_line)
示例#38
0
 def test_noop(self):
     """``add_scalar`` is expected to return None; no writer context is entered here."""
     outcome = SummaryWriterContext.add_scalar("test", torch.ones(1))
     self.assertIsNone(outcome)
示例#39
0
 def test_noop(self):
     """Calling ``add_scalar`` without entering a writer context must return None."""
     scalar_result = SummaryWriterContext.add_scalar("test", torch.ones(1))
     self.assertIsNone(scalar_result)
示例#40
0
 def tearDown(self):
     """Reset SummaryWriterContext globals (via ``_reset_globals``) after the test."""
     SummaryWriterContext._reset_globals()
示例#41
0
 def handle(self, tdp: PreprocessedTrainingBatch) -> None:
     """Advance the SummaryWriterContext global step, then call ``train`` on ``self.trainer_or_evaluator`` with ``tdp``."""
     SummaryWriterContext.increase_global_step()
     self.trainer_or_evaluator.train(tdp)
示例#42
0
 def setUp(self):
     """Reset SummaryWriterContext globals (via ``_reset_globals``) before the test runs."""
     SummaryWriterContext._reset_globals()
示例#43
0
 def aggregate(self, values):
     """Log, for each entry of ``self.actions``, how many elements of ``values`` equal its index."""
     for idx, action in enumerate(self.actions):
         # Number of elements of `values` equal to this action's position.
         occurrences = (values == idx).sum().item()
         SummaryWriterContext.add_scalar(f"{self.log_key}/{action}", occurrences)
示例#44
0
    def forward(self, input) -> torch.FloatTensor:
        """Compute Q-values as the sum of a value stream and an advantage stream.

        ``input`` is split column-wise: the first
        ``self.layers[0].in_features`` columns are used as the state and the
        remaining columns as the action. The state is run through the hidden
        layers (all but the last entry of ``self.activations``);
        ``self.value`` is applied to the hidden output, and
        ``self.advantage`` is applied to the hidden output concatenated with
        the action columns along dim 1. Unless ``self.parametric_action`` is
        truthy, the advantage is centred by subtracting its mean over dim 1.

        Whenever ``SummaryWriterContext._global_step`` is a multiple of 1000,
        histograms and means of the value, Q-value and raw advantage (and,
        for the non-parametric case, of each advantage column) are emitted
        through ``SummaryWriterContext``.

        Args:
            input: Tensor indexed as ``input[:, cols]`` with state columns
                followed by action columns (assumed 2-D -- TODO confirm).

        Returns:
            ``value + advantage``.
        """
        state_dim = self.layers[0].in_features
        state, action = input[:, :state_dim], input[:, state_dim:]

        hidden = state
        for idx, act_name in enumerate(self.activations[:-1]):
            if self.use_batch_norm:
                hidden = self.batch_norm_ops[idx](hidden)
            # Resolve the activation before running the layer, even for
            # "linear", where the resolved function is not applied.
            act_fn = getattr(F, act_name)
            hidden = self.layers[idx](hidden)
            if act_name != "linear":
                hidden = act_fn(hidden)

        value = self.value(hidden)
        raw_advantage = self.advantage(torch.cat((hidden, action), dim=1))
        advantage = (
            raw_advantage
            if self.parametric_action
            else raw_advantage - raw_advantage.mean(dim=1, keepdim=True)
        )
        q_value = value + advantage

        # Only emit summaries on every 1000th global step.
        if SummaryWriterContext._global_step % 1000 != 0:
            return q_value

        # (histogram tag template, mean tag template, tensor), in emit order.
        summaries = (
            ("dueling_network/{}/value", "dueling_network/{}/mean_value", value),
            (
                "dueling_network/{}/q_value",
                "dueling_network/{}/mean_q_value",
                q_value,
            ),
            (
                "dueling_network/{}/raw_advantage",
                "dueling_network/{}/mean_raw_advantage",
                raw_advantage,
            ),
        )
        for histogram_tag, mean_tag, tensor in summaries:
            detached = tensor.detach()
            SummaryWriterContext.add_histogram(
                histogram_tag.format(self._name), detached.cpu()
            )
            SummaryWriterContext.add_scalar(
                mean_tag.format(self._name), detached.mean().cpu()
            )

        if not self.parametric_action:
            detached_advantage = advantage.detach()
            for col in range(detached_advantage.shape[1]):
                column = detached_advantage[:, col]
                # NOTE: the "advatage" spelling is kept as-is so the emitted
                # tag names stay unchanged.
                SummaryWriterContext.add_histogram(
                    "dueling_network/{}/advatage/{}".format(self._name, col),
                    column.cpu(),
                )
                SummaryWriterContext.add_scalar(
                    "dueling_network/{}/mean_advatage/{}".format(self._name, col),
                    column.mean().cpu(),
                )

        return q_value