def _initialize_cpe(
    self,
    reward_network,
    q_network_cpe,
    q_network_cpe_target,
    optimizer: Optimizer__Union,
) -> None:
    """Set up the reward network and CPE Q-networks (plus their optimizers)
    used for counterfactual policy evaluation (CPE) during training."""
    if self.calc_cpe_in_training:
        assert reward_network is not None, "reward_network is required for CPE"
        # pyre-fixme[16]: `RLTrainer` has no attribute `reward_network`.
        self.reward_network = reward_network
        # pyre-fixme[16]: `RLTrainer` has no attribute `reward_network_optimizer`.
        self.reward_network_optimizer = optimizer.make_optimizer_scheduler(
            self.reward_network.parameters()
        )
        assert (
            q_network_cpe is not None and q_network_cpe_target is not None
        ), "q_network_cpe and q_network_cpe_target are required for CPE"
        # pyre-fixme[16]: `RLTrainer` has no attribute `q_network_cpe`.
        self.q_network_cpe = q_network_cpe
        # pyre-fixme[16]: `RLTrainer` has no attribute `q_network_cpe_target`.
        self.q_network_cpe_target = q_network_cpe_target
        # pyre-fixme[16]: `RLTrainer` has no attribute `q_network_cpe_optimizer`.
        self.q_network_cpe_optimizer = optimizer.make_optimizer_scheduler(
            self.q_network_cpe.parameters()
        )
        # One block of `num_actions` outputs per metric being scored.
        num_output_nodes = len(self.metrics_to_score) * self.num_actions
        # pyre-fixme[16]: `RLTrainer` has no attribute `reward_idx_offsets`.
        self.reward_idx_offsets = torch.arange(
            0,
            num_output_nodes,
            self.num_actions,
            device=self.device,
            dtype=torch.long,
        )
    else:
        self.reward_network = None

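# A minimal sketch of the output layout that `reward_idx_offsets` indexes:
# the CPE network emits one contiguous block of `num_actions` values per
# metric in `metrics_to_score`. The sizes and the slicing below are
# illustrative assumptions, not taken from the library.
import torch

num_metrics, num_actions = 3, 4  # hypothetical sizes
num_output_nodes = num_metrics * num_actions

reward_idx_offsets = torch.arange(0, num_output_nodes, num_actions, dtype=torch.long)
print(reward_idx_offsets)  # tensor([0, 4, 8])

# Per-metric action values can be recovered by slicing the flat output.
flat_output = torch.randn(2, num_output_nodes)  # batch of 2
start = int(reward_idx_offsets[1])  # offset of the second metric's block
per_metric = flat_output[:, start : start + num_actions]
print(per_metric.shape)  # torch.Size([2, 4])
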
def __init__(
    self,
    seq2slate_net: Seq2SlateTransformerNet,
    minibatch_size: int = 1024,
    parameters: Seq2SlateParameters = field(  # noqa: B008
        default_factory=Seq2SlateParameters
    ),
    baseline_net: Optional[BaselineNet] = None,
    baseline_warmup_num_batches: int = 0,
    use_gpu: bool = False,
    policy_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    baseline_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    policy_gradient_interval: int = 1,
    print_interval: int = 100,
) -> None:
    self.seq2slate_net = seq2slate_net
    self.parameters = parameters
    self.use_gpu = use_gpu
    self.policy_gradient_interval = policy_gradient_interval
    self.print_interval = print_interval
    self.minibatch_size = minibatch_size
    self.minibatch = 0
    self.baseline_net = baseline_net
    self.baseline_warmup_num_batches = baseline_warmup_num_batches
    self.rl_opt = policy_optimizer.make_optimizer_scheduler(
        self.seq2slate_net.parameters()
    )["optimizer"]
    # Start with clean gradients so the first accumulation window is well defined.
    self.rl_opt.zero_grad()
    if self.baseline_net:
        self.baseline_opt = baseline_optimizer.make_optimizer_scheduler(
            # pyre-fixme[16]: `Optional` has no attribute `parameters`.
            self.baseline_net.parameters()
        )["optimizer"]

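# A minimal sketch of the gradient-accumulation pattern that
# `policy_gradient_interval` and the eager `zero_grad()` above suggest:
# accumulate gradients over N minibatches, then step and reset. The toy
# network, optimizer, and loop below are assumptions for illustration only.
import torch
import torch.nn as nn

net = nn.Linear(8, 1)  # stand-in for seq2slate_net
opt = torch.optim.Adam(net.parameters())
policy_gradient_interval = 4

opt.zero_grad()  # mirrors self.rl_opt.zero_grad() in __init__
for minibatch, batch in enumerate(torch.randn(16, 3, 8), start=1):
    loss = net(batch).mean()
    (loss / policy_gradient_interval).backward()  # accumulate scaled gradients
    if minibatch % policy_gradient_interval == 0:
        opt.step()       # apply the accumulated update
        opt.zero_grad()  # start the next accumulation window
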
def __init__(
    self,
    imitator,
    use_gpu: bool = False,
    rl: RLParameters = field(default_factory=RLParameters),  # noqa: B008
    minibatch_size: int = 1024,
    minibatches_per_step: int = 1,
    optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
) -> None:
    super().__init__(rl, use_gpu=use_gpu)
    self.minibatch_size = minibatch_size
    # Falsy values (0 or None) fall back to one minibatch per step.
    self.minibatches_per_step = minibatches_per_step or 1
    self.imitator = imitator
    self.imitator_optimizer = optimizer.make_optimizer_scheduler(
        imitator.parameters()
    )

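# Note that this trainer keeps the whole object returned by
# make_optimizer_scheduler, while the other trainers in this section index
# it with ["optimizer"]. A minimal sketch of the contract inferred from that
# indexing; the actual return type in the library may differ.
import torch
import torch.nn as nn

def make_optimizer_scheduler(params):
    optimizer = torch.optim.Adam(params)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=100)
    return {"optimizer": optimizer, "lr_scheduler": scheduler}

net = nn.Linear(4, 2)
opt_and_scheduler = make_optimizer_scheduler(net.parameters())
opt = opt_and_scheduler["optimizer"]  # the piece the other trainers keep
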
def __init__(
    self,
    seq2slate_net: Seq2SlateTransformerNet,
    parameters: Seq2SlateParameters,
    minibatch_size: int,
    use_gpu: bool = False,
    policy_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    print_interval: int = 100,
) -> None:
    self.parameters = parameters
    self.use_gpu = use_gpu
    self.print_interval = print_interval
    self.seq2slate_net = seq2slate_net
    self.minibatch_size = minibatch_size
    self.minibatch = 0
    self.optimizer = policy_optimizer.make_optimizer_scheduler(
        self.seq2slate_net.parameters()
    )["optimizer"]
    # TODO: T62269969 add baseline_net in training
    # Per-element KL terms; reduction is left to the caller.
    self.kl_div_loss = nn.KLDivLoss(reduction="none")

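# nn.KLDivLoss expects log-probabilities as input and probabilities as
# target; with reduction="none" it returns one KL term per element, e.g. so
# the caller can mask or reweight positions before reducing. A small
# self-contained example (shapes are illustrative):
import torch
import torch.nn as nn

kl_div_loss = nn.KLDivLoss(reduction="none")
log_probs = torch.log_softmax(torch.randn(2, 5), dim=1)
target = torch.softmax(torch.randn(2, 5), dim=1)
per_element = kl_div_loss(log_probs, target)
print(per_element.shape)  # torch.Size([2, 5]); reduce however the loss needs
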
def __init__(
    self,
    seq2slate_net: Seq2SlateTransformerNet,
    minibatch_size: int = 1024,
    loss_reporter=None,
    use_gpu: bool = False,
    policy_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
) -> None:
    self.loss_reporter = loss_reporter
    self.use_gpu = use_gpu
    self.seq2slate_net = seq2slate_net
    self.minibatch_size = minibatch_size
    self.minibatch = 0
    self.optimizer = policy_optimizer.make_optimizer_scheduler(
        self.seq2slate_net.parameters()
    )["optimizer"]
    self.log_softmax = nn.LogSoftmax(dim=1)
    self.kl_loss = nn.KLDivLoss(reduction="batchmean")
    if self.loss_reporter is None:
        self.loss_reporter = NoOpLossReporter()

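# Here the KL divergence is reduced with "batchmean", which sums the
# pointwise terms and divides by the batch size; per the PyTorch docs this
# matches the mathematical definition of KL divergence per sample. A small
# example wiring it to the LogSoftmax above (scores and targets are
# illustrative):
import torch
import torch.nn as nn

log_softmax = nn.LogSoftmax(dim=1)
kl_loss = nn.KLDivLoss(reduction="batchmean")

scores = torch.randn(4, 10)  # e.g. raw per-item scores for 4 slates
target = torch.softmax(torch.randn(4, 10), dim=1)
loss = kl_loss(log_softmax(scores), target)
print(loss)  # a single scalar
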
def __init__(
    self,
    reward_net: ModelBase,
    optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    loss_type: LossFunction = LossFunction.MSE,
    reward_ignore_threshold: Optional[float] = None,
    weighted_by_inverse_propensity: bool = False,
) -> None:
    self.reward_net = reward_net
    self.minibatch = 0
    self.opt = optimizer.make_optimizer_scheduler(self.reward_net.parameters())[
        "optimizer"
    ]
    self.loss_type = loss_type
    self.reward_ignore_threshold = reward_ignore_threshold
    self.weighted_by_inverse_propensity = weighted_by_inverse_propensity
    self.loss_fn = _get_loss_function(
        loss_type, reward_ignore_threshold, weighted_by_inverse_propensity
    )

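# A minimal sketch of what an inverse-propensity-weighted MSE could look
# like when weighted_by_inverse_propensity is set: each sample's squared
# error is scaled by 1 / propensity of the logged action. The actual loss
# is built by _get_loss_function, whose internals are not shown here, so
# this is an assumption for illustration only.
import torch

def ips_weighted_mse(pred, target, propensity):
    weight = 1.0 / propensity
    return (weight * (pred - target) ** 2).mean()

pred = torch.tensor([0.2, 0.8])
target = torch.tensor([0.0, 1.0])
propensity = torch.tensor([0.5, 0.25])  # logged action probabilities
print(ips_weighted_mse(pred, target, propensity))  # tensor(0.1200)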