Example 1
    def __init__(
        self,
        q_network,
        q_network_target,
        reward_network,
        parameters: ContinuousActionModelParameters,
    ) -> None:
        self.double_q_learning = parameters.rainbow.double_q_learning
        self.minibatch_size = parameters.training.minibatch_size

        DQNTrainerBase.__init__(
            self,
            parameters,
            use_gpu=False,
            additional_feature_types=None,
            gradient_handler=None,
        )

        self.q_network = q_network
        self.q_network_target = q_network_target
        self._set_optimizer(parameters.training.optimizer)
        self.q_network_optimizer = self.optimizer_func(
            self.q_network.parameters(),
            lr=parameters.training.learning_rate,
            weight_decay=parameters.training.l2_decay,
        )

        self.reward_network = reward_network
        self.reward_network_optimizer = self.optimizer_func(
            self.reward_network.parameters(),
            lr=parameters.training.learning_rate,
        )
Example 2
    def __init__(
        self,
        q_network,
        q_network_target,
        reward_network,
        parameters: ContinuousActionModelParameters,
        use_gpu: bool = False,
    ) -> None:
        DQNTrainerBase.__init__(self, parameters, use_gpu=use_gpu)

        self.double_q_learning = parameters.rainbow.double_q_learning
        self.minibatch_size = parameters.training.minibatch_size
        self.minibatches_per_step = parameters.training.minibatches_per_step or 1

        self.q_network = q_network
        self.q_network_target = q_network_target
        self._set_optimizer(parameters.training.optimizer)
        self.q_network_optimizer = self.optimizer_func(
            self.q_network.parameters(),
            lr=parameters.training.learning_rate,
            weight_decay=parameters.training.l2_decay,
        )

        self.reward_network = reward_network
        self.reward_network_optimizer = self.optimizer_func(
            self.reward_network.parameters(),
            lr=parameters.training.learning_rate,
            weight_decay=parameters.training.l2_decay,
        )
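
Examples 1 and 2 share the same construction pattern; Example 2 additionally forwards a use_gpu flag to the base class and reads minibatches_per_step. A minimal instantiation sketch for this constructor signature, assuming simple feed-forward networks; the trainer class name is not shown in these excerpts, and types.SimpleNamespace stands in for the real ContinuousActionModelParameters object, populated only with the fields read above:

import copy
import types

import torch.nn as nn

# Hypothetical stand-in for ContinuousActionModelParameters, populated only
# with the fields the __init__ bodies above read; the real base class may
# read additional fields (e.g. parameters.rl).
parameters = types.SimpleNamespace(
    rainbow=types.SimpleNamespace(double_q_learning=True),
    training=types.SimpleNamespace(
        minibatch_size=128,
        minibatches_per_step=1,
        optimizer="ADAM",
        learning_rate=1e-3,
        l2_decay=1e-4,
    ),
)

q_network = nn.Sequential(nn.Linear(10, 64), nn.ReLU(), nn.Linear(64, 1))
q_network_target = copy.deepcopy(q_network)  # target starts as a copy
reward_network = copy.deepcopy(q_network)

# trainer = ParametricDQNTrainer(  # class name is hypothetical
#     q_network, q_network_target, reward_network, parameters, use_gpu=False
# )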
Example 3
    def __init__(
        self,
        q_network,
        q_network_target,
        reward_network,
        parameters: ContinuousActionModelParameters,
    ) -> None:
        self.double_q_learning = parameters.rainbow.double_q_learning
        self.minibatch_size = parameters.training.minibatch_size

        DQNTrainerBase.__init__(
            self,
            parameters,
            use_gpu=False,
            additional_feature_types=None,
            gradient_handler=None,
        )

        self.q_network = q_network
        self.q_network_target = q_network_target
        self._set_optimizer(parameters.training.optimizer)
        self.q_network_optimizer = self.optimizer_func(
            self.q_network.parameters(),
            lr=parameters.training.learning_rate,
            weight_decay=parameters.training.l2_decay,
        )

        self.reward_network = reward_network
        self.reward_network_optimizer = self.optimizer_func(
            self.reward_network.parameters(), lr=parameters.training.learning_rate
        )
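
All three continuous-action variants keep a q_network_target alongside the online q_network. These excerpts do not show how the target is kept in sync; a generic soft-update sketch of the usual DQN technique (the function name and tau value are illustrative, not from this codebase):

import torch

def soft_update(target_net, online_net, tau=0.005):
    # Polyak-average the online weights into the target network:
    # target <- (1 - tau) * target + tau * online
    with torch.no_grad():
        for t, o in zip(target_net.parameters(), online_net.parameters()):
            t.mul_(1.0 - tau).add_(o, alpha=tau)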
Example 4
    def __init__(
        self,
        q_network,
        q_network_target,
        reward_network,
        parameters: DiscreteActionModelParameters,
        use_gpu=False,
        q_network_cpe=None,
        q_network_cpe_target=None,
        metrics_to_score=None,
    ) -> None:
        self.double_q_learning = parameters.rainbow.double_q_learning
        self.minibatch_size = parameters.training.minibatch_size
        self._actions = parameters.actions if parameters.actions is not None else []

        DQNTrainerBase.__init__(
            self,
            parameters,
            use_gpu=use_gpu,
            metrics_to_score=metrics_to_score,
            gradient_handler=None,
            actions=parameters.actions,
        )

        self.q_network = q_network
        self.q_network_target = q_network_target
        self._set_optimizer(parameters.training.optimizer)
        self.q_network_optimizer = self.optimizer_func(
            self.q_network.parameters(),
            lr=parameters.training.learning_rate,
            weight_decay=parameters.training.l2_decay,
        )

        if self.calc_cpe_in_training:
            assert reward_network is not None, "reward_network is required for CPE"
            self.reward_network = reward_network
            self.reward_network_optimizer = self.optimizer_func(
                self.reward_network.parameters(),
                lr=parameters.training.learning_rate,
            )
            assert (
                q_network_cpe is not None and q_network_cpe_target is not None
            ), "q_network_cpe and q_network_cpe_target are required for CPE"
            self.q_network_cpe = q_network_cpe
            self.q_network_cpe_target = q_network_cpe_target
            self.q_network_cpe_optimizer = self.optimizer_func(
                self.q_network_cpe.parameters(),
                lr=parameters.training.learning_rate,
            )
            # One output node per (metric, action) pair; each metric's block
            # of actions starts at a multiple of num_actions.
            num_output_nodes = len(self.metrics_to_score) * self.num_actions
            self.reward_idx_offsets = torch.arange(
                0, num_output_nodes, self.num_actions
            ).type(self.dtypelong)

        # Map the {action_name: boost} config into a 1 x num_actions tensor row.
        self.reward_boosts = torch.zeros([1, len(self._actions)]).type(self.dtype)
        if parameters.rl.reward_boost is not None:
            for action, boost in parameters.rl.reward_boost.items():
                i = self._actions.index(action)
                self.reward_boosts[0, i] = boost
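
When calc_cpe_in_training is set, the CPE Q-network scores every (metric, action) pair through one flat output layer, and reward_idx_offsets records where each metric's block of actions begins. A standalone illustration of that indexing, with made-up metric names and sizes:

import torch

metrics_to_score = ["reward", "metric_a", "metric_b"]   # hypothetical metrics
num_actions = 2
num_output_nodes = len(metrics_to_score) * num_actions  # 6 output nodes

reward_idx_offsets = torch.arange(0, num_output_nodes, num_actions)
print(reward_idx_offsets)  # tensor([0, 2, 4]); metric i owns columns
                           # [offset_i, offset_i + num_actions)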
Example 5
    def __init__(
        self,
        q_network,
        q_network_target,
        reward_network,
        parameters: DiscreteActionModelParameters,
        use_gpu=False,
    ) -> None:
        self.double_q_learning = parameters.rainbow.double_q_learning
        self.minibatch_size = parameters.training.minibatch_size
        self._actions = parameters.actions if parameters.actions is not None else []

        DQNTrainerBase.__init__(
            self,
            parameters,
            use_gpu=use_gpu,
            additional_feature_types=None,
            gradient_handler=None,
        )

        self.q_network = q_network
        self.q_network_target = q_network_target
        self._set_optimizer(parameters.training.optimizer)
        self.q_network_optimizer = self.optimizer_func(
            self.q_network.parameters(),
            lr=parameters.training.learning_rate,
            weight_decay=parameters.training.l2_decay,
        )

        self.reward_network = reward_network
        self.reward_network_optimizer = self.optimizer_func(
            self.reward_network.parameters(),
            lr=parameters.training.learning_rate,
        )

        # Map the {action_name: boost} config into a 1 x num_actions tensor row.
        self.reward_boosts = torch.zeros([1, len(self._actions)]).type(self.dtype)
        if parameters.rl.reward_boost is not None:
            for action, boost in parameters.rl.reward_boost.items():
                i = self._actions.index(action)
                self.reward_boosts[0, i] = boost
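
The reward-boost loop densifies the {action_name: boost} mapping into a 1 x num_actions tensor so boosts can later be added to a reward batch with one broadcasted addition. A standalone illustration with made-up action names and values:

import torch

actions = ["up", "down", "left"]            # hypothetical action names
reward_boost = {"up": 0.5, "left": -1.0}    # hypothetical boost config

reward_boosts = torch.zeros(1, len(actions))
for action, boost in reward_boost.items():
    reward_boosts[0, actions.index(action)] = boost
print(reward_boosts)  # tensor([[ 0.5000,  0.0000, -1.0000]])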
Example 6
    def __init__(
        self,
        q_network,
        q_network_target,
        reward_network,
        parameters: DiscreteActionModelParameters,
        use_gpu=False,
        q_network_cpe=None,
        q_network_cpe_target=None,
        metrics_to_score=None,
        imitator=None,
    ) -> None:
        DQNTrainerBase.__init__(
            self,
            parameters,
            use_gpu=use_gpu,
            metrics_to_score=metrics_to_score,
            actions=parameters.actions,
        )
        self.double_q_learning = parameters.rainbow.double_q_learning
        self.minibatch_size = parameters.training.minibatch_size
        self.minibatches_per_step = parameters.training.minibatches_per_step or 1
        self._actions = parameters.actions if parameters.actions is not None else []

        self.q_network = q_network
        self.q_network_target = q_network_target
        self._set_optimizer(parameters.training.optimizer)
        self.q_network_optimizer = self.optimizer_func(
            self.q_network.parameters(),
            lr=parameters.training.learning_rate,
            weight_decay=parameters.training.l2_decay,
        )

        if self.calc_cpe_in_training:
            assert reward_network is not None, "reward_network is required for CPE"
            self.reward_network = reward_network
            self.reward_network_optimizer = self.optimizer_func(
                self.reward_network.parameters(),
                lr=parameters.training.learning_rate,
            )
            assert (
                q_network_cpe is not None and q_network_cpe_target is not None
            ), "q_network_cpe and q_network_cpe_target are required for CPE"
            self.q_network_cpe = q_network_cpe
            self.q_network_cpe_target = q_network_cpe_target
            self.q_network_cpe_optimizer = self.optimizer_func(
                self.q_network_cpe.parameters(),
                lr=parameters.training.learning_rate,
            )
            num_output_nodes = len(self.metrics_to_score) * self.num_actions
            self.reward_idx_offsets = torch.arange(
                0,
                num_output_nodes,
                self.num_actions,
                device=self.device,
                dtype=torch.long,
            )
        else:
            self.reward_network = None

        # Map the {action_name: boost} config into a 1 x num_actions tensor row.
        self.reward_boosts = torch.zeros([1, len(self._actions)], device=self.device)
        if parameters.rl.reward_boost is not None:
            for action, boost in parameters.rl.reward_boost.items():
                i = self._actions.index(action)
                self.reward_boosts[0, i] = boost

        # Batch-constrained Q-learning (BCQ): store the imitator and the
        # probability threshold below which actions are dropped.
        self.bcq = parameters.rainbow.bcq
        if self.bcq:
            self.bcq_drop_threshold = parameters.rainbow.bcq_drop_threshold
            self.bcq_imitator = imitator
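
The constructor only stores the BCQ flag, drop threshold, and imitator; the action filtering itself happens later in training and is not part of this excerpt. A hedged sketch of how discrete batch-constrained Q-learning typically masks actions, assuming the imitator yields per-action probabilities:

import torch

def bcq_action_mask(q_values, imitator_probs, drop_threshold):
    # Keep an action only if its imitator probability, relative to the most
    # likely action in that state, clears the threshold; masked actions get
    # -inf so they can never be selected by a subsequent max/argmax.
    ratio = imitator_probs / imitator_probs.max(dim=1, keepdim=True).values
    return torch.where(
        ratio >= drop_threshold,
        q_values,
        torch.full_like(q_values, float("-inf")),
    )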