def configure_optimizers(self):
    optimizers = []
    optimizers.append(
        self.q_network_optimizer.make_optimizer_scheduler(
            self.q_network.parameters()
        )
    )
    if self.calc_cpe_in_training:
        optimizers.append(
            self.reward_network_optimizer.make_optimizer_scheduler(
                self.reward_network.parameters()
            )
        )
        optimizers.append(
            self.q_network_cpe_optimizer.make_optimizer_scheduler(
                self.q_network_cpe.parameters()
            )
        )
    # soft-update
    target_params = list(self.q_network_target.parameters())
    source_params = list(self.q_network.parameters())
    if self.calc_cpe_in_training:
        target_params += list(self.q_network_cpe_target.parameters())
        source_params += list(self.q_network_cpe.parameters())
    optimizers.append(
        SoftUpdate.make_optimizer_scheduler(
            target_params, source_params, tau=self.tau
        )
    )
    return optimizers
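# A minimal sketch of the soft-update rule the SoftUpdate optimizer above is
# assumed to apply on each step (an illustration, not the library's actual
# implementation): Polyak-average every target parameter toward its paired
# source parameter, target <- tau * source + (1 - tau) * target.
import torch


def soft_update_step(target_params, source_params, tau: float) -> None:
    """Hypothetical helper illustrating one soft-update step."""
    with torch.no_grad():
        for t, s in zip(target_params, source_params):
            t.mul_(1.0 - tau).add_(s, alpha=tau)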
def configure_optimizers(self):
    optimizers = []
    optimizers.append(
        self.q_network_optimizer.make_optimizer(self.q1_network.parameters())
    )
    if self.q2_network:
        optimizers.append(
            self.q_network_optimizer.make_optimizer(self.q2_network.parameters())
        )
    optimizers.append(
        self.actor_network_optimizer.make_optimizer(self.actor_network.parameters())
    )
    if self.alpha_optimizer is not None:
        optimizers.append(self.alpha_optimizer.make_optimizer([self.log_alpha]))
    if self.value_network:
        optimizers.append(
            self.value_network_optimizer.make_optimizer(
                self.value_network.parameters()
            )
        )
    # soft-update
    if self.value_network:
        target_params = list(self.value_network_target.parameters())
        source_params = list(self.value_network.parameters())
    else:
        target_params = list(self.q1_network_target.parameters())
        source_params = list(self.q1_network.parameters())
        if self.q2_network:
            target_params += list(self.q2_network_target.parameters())
            source_params += list(self.q2_network.parameters())
    optimizers.append(SoftUpdate(target_params, source_params, tau=self.tau))
    return optimizers
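# This variant appears to be SAC-style: log_alpha is a single learnable tensor
# (the log of the entropy temperature), which is why it is passed to its
# optimizer wrapped in a list rather than via .parameters(). A minimal sketch
# of the temperature loss such a tensor is typically trained with;
# target_entropy and log_pi are hypothetical names, not taken from this code:
import torch

log_alpha = torch.zeros(1, requires_grad=True)
target_entropy = -6.0  # commonly set to -action_dim
log_pi = torch.tensor([-3.2, -1.7])  # log-probs of sampled actions (illustrative)
alpha_loss = -(log_alpha * (log_pi + target_entropy).detach()).mean()
alpha_loss.backward()  # gradient flows only into log_alpha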
def configure_optimizers(self):
    optimizers = []
    target_params = list(self.q_network_target.parameters())
    source_params = list(self.q_network.parameters())
    optimizers.append(
        self.q_network_optimizer.make_optimizer_scheduler(
            self.q_network.parameters()
        )
    )
    if self.calc_cpe_in_training:
        (
            cpe_target_params,
            cpe_source_params,
            cpe_optimizers,
        ) = self._configure_cpe_optimizers()
        target_params += cpe_target_params
        source_params += cpe_source_params
        optimizers += cpe_optimizers
    # soft-update
    optimizers.append(
        SoftUpdate.make_optimizer_scheduler(
            target_params, source_params, tau=self.tau
        )
    )
    return optimizers
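# _configure_cpe_optimizers() is not shown in this section. Judging from the
# inline CPE setup in the first variant above, a plausible sketch (an
# assumption, not the verbatim helper) builds the reward- and CPE-network
# optimizers and returns the CPE target/source parameters so they join the
# shared soft-update:
def _configure_cpe_optimizers(self):
    target_params = list(self.q_network_cpe_target.parameters())
    source_params = list(self.q_network_cpe.parameters())
    optimizers = [
        self.reward_network_optimizer.make_optimizer_scheduler(
            self.reward_network.parameters()
        ),
        self.q_network_cpe_optimizer.make_optimizer_scheduler(
            self.q_network_cpe.parameters()
        ),
    ]
    return target_params, source_params, optimizers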
def configure_optimizers(self):
    optimizers = []
    optimizers.append(
        self.q_network_optimizer.make_optimizer(self.q1_network.parameters())
    )
    if self.q2_network:
        optimizers.append(
            self.q_network_optimizer.make_optimizer(self.q2_network.parameters())
        )
    optimizers.append(
        self.actor_network_optimizer.make_optimizer(self.actor_network.parameters())
    )
    # soft-update
    target_params = list(self.q1_network_target.parameters())
    source_params = list(self.q1_network.parameters())
    if self.q2_network:
        target_params += list(self.q2_network_target.parameters())
        source_params += list(self.q2_network.parameters())
    target_params += list(self.actor_network_target.parameters())
    source_params += list(self.actor_network.parameters())
    optimizers.append(SoftUpdate(target_params, source_params, tau=self.tau))
    return optimizers
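# The single SoftUpdate above relies on positional pairing: target_params[i]
# must line up with source_params[i], which holds here because each target
# network's parameters are appended in the same order as its source's. A
# small hypothetical sanity check for that contract:
def check_soft_update_pairs(target_params, source_params):
    assert len(target_params) == len(source_params), "unpaired parameters"
    for t, s in zip(target_params, source_params):
        assert t.shape == s.shape, "mismatched target/source parameter pair"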
def configure_optimizers(self):
    optimizers = [
        self.q_network_optimizer.make_optimizer(self.q_network.parameters())
    ]
    # soft-update
    target_params = list(self.q_network_target.parameters())
    source_params = list(self.q_network.parameters())
    optimizers.append(SoftUpdate(target_params, source_params, tau=self.tau))
    return optimizers
def configure_optimizers(self):
    optimizers = []
    optimizers.append(
        self.q_network_optimizer.make_optimizer_scheduler(
            self.q_network.parameters()
        )
    )
    # soft-update
    target_params = list(self.q_network_target.parameters())
    source_params = list(self.q_network.parameters())
    optimizers.append(
        SoftUpdate.make_optimizer_scheduler(
            target_params, source_params, tau=self.tau
        )
    )
    return optimizers
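# Two conventions appear across these variants: make_optimizer(...) returns a
# bare optimizer, while make_optimizer_scheduler(...) presumably bundles the
# optimizer with its LR scheduler, e.g. the {"optimizer": ..., "lr_scheduler":
# ...} dict that PyTorch Lightning's configure_optimizers accepts. A minimal
# sketch of such a factory under that assumption (names are illustrative):
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR


def make_optimizer_scheduler(params):
    optimizer = Adam(params, lr=1e-3)
    scheduler = StepLR(optimizer, step_size=1000, gamma=0.9)
    return {"optimizer": optimizer, "lr_scheduler": scheduler}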
def __init__(
    self,
    q_network,
    q_network_target,
    reward_network,
    q_network_cpe=None,
    q_network_cpe_target=None,
    metrics_to_score=None,
    imitator=None,
    loss_reporter=None,
    use_gpu: bool = False,
    actions: List[str] = field(default_factory=list),  # noqa: B008
    rl: RLParameters = field(default_factory=RLParameters),  # noqa: B008
    double_q_learning: bool = True,
    bcq: Optional[BCQConfig] = None,
    minibatch_size: int = 1024,
    minibatches_per_step: int = 1,
    optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    evaluation: EvaluationParameters = field(  # noqa: B008
        default_factory=EvaluationParameters
    ),
) -> None:
    super().__init__(
        rl,
        use_gpu=use_gpu,
        metrics_to_score=metrics_to_score,
        actions=actions,
        evaluation_parameters=evaluation,
        loss_reporter=loss_reporter,
    )
    assert self._actions is not None, "Discrete-action DQN needs action names"
    self.double_q_learning = double_q_learning
    self.minibatch_size = minibatch_size
    self.minibatches_per_step = minibatches_per_step or 1
    self.q_network = q_network
    self.q_network_target = q_network_target
    self.q_network_optimizer = optimizer.make_optimizer(q_network.parameters())
    self.q_network_soft_update = SoftUpdate(
        self.q_network_target.parameters(), self.q_network.parameters(), self.tau
    )
    self._initialize_cpe(
        reward_network, q_network_cpe, q_network_cpe_target, optimizer=optimizer
    )
    # pyre-fixme[6]: Expected `Sized` for 1st param but got `Optional[List[str]]`.
    self.reward_boosts = torch.zeros([1, len(self._actions)], device=self.device)
    if rl.reward_boost is not None:
        # pyre-fixme[16]: `Optional` has no attribute `keys`.
        for k in rl.reward_boost.keys():
            # pyre-fixme[16]: `Optional` has no attribute `index`.
            i = self._actions.index(k)
            # pyre-fixme[16]: `Optional` has no attribute `__getitem__`.
            self.reward_boosts[0, i] = rl.reward_boost[k]
    # Batch constrained q-learning
    self.bcq = bcq is not None
    if self.bcq:
        assert bcq is not None
        self.bcq_drop_threshold = bcq.drop_threshold
        self.bcq_imitator = imitator
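# reward_boosts is a 1 x num_actions row vector. A sketch of how it is
# presumably applied during training: with one-hot actions, a matmul selects
# each sampled action's boost and adds it to the observed reward (hypothetical
# usage, assuming `action` is a one-hot float tensor):
import torch

reward_boosts = torch.tensor([[0.0, 0.5, -1.0]])  # 1 x num_actions
action = torch.tensor([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0]])  # batch of one-hots
reward = torch.tensor([[1.0], [2.0]])
boosted_reward = reward + action @ reward_boosts.t()  # adds 0.5, then 0.0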