def __init__(
    self,
    q_network,
    q_network_target,
    reward_network,
    parameters: ContinuousActionModelParameters,
) -> None:
    self.double_q_learning = parameters.rainbow.double_q_learning
    self.minibatch_size = parameters.training.minibatch_size
    DQNTrainerBase.__init__(
        self,
        parameters,
        use_gpu=False,
        additional_feature_types=None,
        gradient_handler=None,
    )
    self.q_network = q_network
    self.q_network_target = q_network_target
    self._set_optimizer(parameters.training.optimizer)
    self.q_network_optimizer = self.optimizer_func(
        self.q_network.parameters(),
        lr=parameters.training.learning_rate,
        weight_decay=parameters.training.l2_decay,
    )
    self.reward_network = reward_network
    self.reward_network_optimizer = self.optimizer_func(
        self.reward_network.parameters(), lr=parameters.training.learning_rate
    )
def __init__(
    self,
    q_network,
    q_network_target,
    reward_network,
    parameters: ContinuousActionModelParameters,
    use_gpu: bool = False,
) -> None:
    DQNTrainerBase.__init__(self, parameters, use_gpu=use_gpu)
    self.double_q_learning = parameters.rainbow.double_q_learning
    self.minibatch_size = parameters.training.minibatch_size
    self.minibatches_per_step = parameters.training.minibatches_per_step or 1
    self.q_network = q_network
    self.q_network_target = q_network_target
    self._set_optimizer(parameters.training.optimizer)
    self.q_network_optimizer = self.optimizer_func(
        self.q_network.parameters(),
        lr=parameters.training.learning_rate,
        weight_decay=parameters.training.l2_decay,
    )
    self.reward_network = reward_network
    self.reward_network_optimizer = self.optimizer_func(
        self.reward_network.parameters(),
        lr=parameters.training.learning_rate,
        weight_decay=parameters.training.l2_decay,
    )
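For orientation, here is a minimal construction sketch for the variant above. Everything prefixed with an underscore (`_Training`, `_Rainbow`, `_Params`, `_mlp`) is a hypothetical stand-in that mirrors only the fields this `__init__` reads; the real `ContinuousActionModelParameters` and network builders come from the library, and the trainer class name is an assumption, so its construction is left commented out.

# Hypothetical stand-ins mirroring only the fields the __init__ above reads.
from dataclasses import dataclass, field
from typing import Optional

import torch.nn as nn


@dataclass
class _Training:
    minibatch_size: int = 1024
    minibatches_per_step: Optional[int] = None
    learning_rate: float = 1e-3
    l2_decay: float = 1e-2
    optimizer: str = "ADAM"


@dataclass
class _Rainbow:
    double_q_learning: bool = True


@dataclass
class _Params:
    training: _Training = field(default_factory=_Training)
    rainbow: _Rainbow = field(default_factory=_Rainbow)


def _mlp(in_dim: int, out_dim: int) -> nn.Module:
    # Two-layer network standing in for the real model builders.
    return nn.Sequential(nn.Linear(in_dim, 64), nn.ReLU(), nn.Linear(64, out_dim))


params = _Params()
q_network = _mlp(10, 1)         # Q(s, a) head for parametric actions
q_network_target = _mlp(10, 1)  # periodically synced copy of q_network
reward_network = _mlp(10, 1)    # reward model used for counterfactual evaluation
# trainer = ParametricDQNTrainer(  # class name assumed
#     q_network, q_network_target, reward_network, params, use_gpu=False
# )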
def __init__(
    self,
    q_network,
    q_network_target,
    reward_network,
    parameters: DiscreteActionModelParameters,
    use_gpu=False,
    q_network_cpe=None,
    q_network_cpe_target=None,
    metrics_to_score=None,
) -> None:
    self.double_q_learning = parameters.rainbow.double_q_learning
    self.minibatch_size = parameters.training.minibatch_size
    self._actions = parameters.actions if parameters.actions is not None else []
    DQNTrainerBase.__init__(
        self,
        parameters,
        use_gpu=use_gpu,
        metrics_to_score=metrics_to_score,
        gradient_handler=None,
        actions=parameters.actions,
    )
    self.q_network = q_network
    self.q_network_target = q_network_target
    self._set_optimizer(parameters.training.optimizer)
    self.q_network_optimizer = self.optimizer_func(
        self.q_network.parameters(),
        lr=parameters.training.learning_rate,
        weight_decay=parameters.training.l2_decay,
    )
    # The CPE (counterfactual policy evaluation) networks are only built
    # when CPE is enabled for training.
    if self.calc_cpe_in_training:
        assert reward_network is not None, "reward_network is required for CPE"
        self.reward_network = reward_network
        self.reward_network_optimizer = self.optimizer_func(
            self.reward_network.parameters(), lr=parameters.training.learning_rate
        )
        assert (
            q_network_cpe is not None and q_network_cpe_target is not None
        ), "q_network_cpe and q_network_cpe_target are required for CPE"
        self.q_network_cpe = q_network_cpe
        self.q_network_cpe_target = q_network_cpe_target
        self.q_network_cpe_optimizer = self.optimizer_func(
            self.q_network_cpe.parameters(), lr=parameters.training.learning_rate
        )
        # The CPE network emits one block of num_actions outputs per metric;
        # the offsets mark where each metric's block starts.
        num_output_nodes = len(self.metrics_to_score) * self.num_actions
        self.reward_idx_offsets = torch.arange(
            0, num_output_nodes, self.num_actions
        ).type(self.dtypelong)
    # Optional per-action additive reward boosts.
    self.reward_boosts = torch.zeros([1, len(self._actions)]).type(self.dtype)
    if parameters.rl.reward_boost is not None:
        for k in parameters.rl.reward_boost.keys():
            i = self._actions.index(k)
            self.reward_boosts[0, i] = parameters.rl.reward_boost[k]
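The `reward_idx_offsets` arithmetic is easy to miss in the flattened form: with the output laid out as one block of `num_actions` values per metric, the offsets mark each metric's starting column, so `offset + action_index` selects that metric's value for the action taken. A small sketch; the gather step is an assumed usage pattern, not taken from this snippet:

import torch

num_actions = 2
metrics = ["reward", "metric_a", "metric_b"]

num_output_nodes = len(metrics) * num_actions           # 6 outputs total
reward_idx_offsets = torch.arange(0, num_output_nodes, num_actions)
print(reward_idx_offsets)  # tensor([0, 2, 4]) -> start of each metric's block

# Assumed usage: pick each metric's value for the action actually taken.
cpe_output = torch.arange(6.0).unsqueeze(0)             # fake batch of 1: [[0..5]]
action_idx = torch.tensor([1])                          # action 1 was taken
cols = reward_idx_offsets + action_idx.unsqueeze(1)     # [[1, 3, 5]]
per_metric = cpe_output.gather(1, cols)
print(per_metric)  # tensor([[1., 3., 5.]])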
def __init__(
    self,
    q_network,
    q_network_target,
    reward_network,
    parameters: DiscreteActionModelParameters,
    use_gpu=False,
) -> None:
    self.double_q_learning = parameters.rainbow.double_q_learning
    self.minibatch_size = parameters.training.minibatch_size
    self._actions = parameters.actions if parameters.actions is not None else []
    DQNTrainerBase.__init__(
        self,
        parameters,
        use_gpu=use_gpu,
        additional_feature_types=None,
        gradient_handler=None,
    )
    self.q_network = q_network
    self.q_network_target = q_network_target
    self._set_optimizer(parameters.training.optimizer)
    self.q_network_optimizer = self.optimizer_func(
        self.q_network.parameters(),
        lr=parameters.training.learning_rate,
        weight_decay=parameters.training.l2_decay,
    )
    self.reward_network = reward_network
    self.reward_network_optimizer = self.optimizer_func(
        self.reward_network.parameters(), lr=parameters.training.learning_rate
    )
    self.reward_boosts = torch.zeros([1, len(self._actions)]).type(self.dtype)
    if parameters.rl.reward_boost is not None:
        for k in parameters.rl.reward_boost.keys():
            i = self._actions.index(k)
            self.reward_boosts[0, i] = parameters.rl.reward_boost[k]
def __init__(
    self,
    q_network,
    q_network_target,
    reward_network,
    parameters: DiscreteActionModelParameters,
    use_gpu=False,
    q_network_cpe=None,
    q_network_cpe_target=None,
    metrics_to_score=None,
    imitator=None,
) -> None:
    DQNTrainerBase.__init__(
        self,
        parameters,
        use_gpu=use_gpu,
        metrics_to_score=metrics_to_score,
        actions=parameters.actions,
    )
    self.double_q_learning = parameters.rainbow.double_q_learning
    self.minibatch_size = parameters.training.minibatch_size
    self.minibatches_per_step = parameters.training.minibatches_per_step or 1
    self._actions = parameters.actions if parameters.actions is not None else []
    self.q_network = q_network
    self.q_network_target = q_network_target
    self._set_optimizer(parameters.training.optimizer)
    self.q_network_optimizer = self.optimizer_func(
        self.q_network.parameters(),
        lr=parameters.training.learning_rate,
        weight_decay=parameters.training.l2_decay,
    )
    # The CPE (counterfactual policy evaluation) networks are only built
    # when CPE is enabled for training.
    if self.calc_cpe_in_training:
        assert reward_network is not None, "reward_network is required for CPE"
        self.reward_network = reward_network
        self.reward_network_optimizer = self.optimizer_func(
            self.reward_network.parameters(), lr=parameters.training.learning_rate
        )
        assert (
            q_network_cpe is not None and q_network_cpe_target is not None
        ), "q_network_cpe and q_network_cpe_target are required for CPE"
        self.q_network_cpe = q_network_cpe
        self.q_network_cpe_target = q_network_cpe_target
        self.q_network_cpe_optimizer = self.optimizer_func(
            self.q_network_cpe.parameters(), lr=parameters.training.learning_rate
        )
        # The CPE network emits one block of num_actions outputs per metric;
        # the offsets mark where each metric's block starts.
        num_output_nodes = len(self.metrics_to_score) * self.num_actions
        self.reward_idx_offsets = torch.arange(
            0,
            num_output_nodes,
            self.num_actions,
            device=self.device,
            dtype=torch.long,
        )
    else:
        self.reward_network = None
    # Optional per-action additive reward boosts.
    self.reward_boosts = torch.zeros([1, len(self._actions)], device=self.device)
    if parameters.rl.reward_boost is not None:
        for k in parameters.rl.reward_boost.keys():
            i = self._actions.index(k)
            self.reward_boosts[0, i] = parameters.rl.reward_boost[k]
    # Batch constrained q-learning
    self.bcq = parameters.rainbow.bcq
    if self.bcq:
        self.bcq_drop_threshold = parameters.rainbow.bcq_drop_threshold
        self.bcq_imitator = imitator
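To illustrate what the `reward_boosts` tensor built above is for, here is a self-contained sketch. The construction mirrors the loop in the code; the application step (adding the boost of the taken action to the observed reward) is an assumption about how the trainer consumes the tensor, not shown in the snippet itself:

import torch

actions_list = ["L", "R"]
reward_boost = {"L": 0.5}

# Mirrors the construction above: one boost column per discrete action.
reward_boosts = torch.zeros(1, len(actions_list))
for k, v in reward_boost.items():
    reward_boosts[0, actions_list.index(k)] = v

# Assumed application: one-hot actions select the per-action boost,
# which is added to the observed reward.
actions = torch.tensor([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0]])  # one-hot, batch of 3
rewards = torch.tensor([[1.0], [2.0], [0.0]])
boosted = rewards + (actions * reward_boosts).sum(dim=1, keepdim=True)
print(boosted)  # tensor([[1.5], [2.0], [0.5]])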