def build_trainer(self) -> QRDQNTrainer:
    net_builder = self.net_builder.value
    q_network = net_builder.build_q_network(
        self.state_normalization_data,
        len(self.action_names),
        # pyre-fixme[16]: `QRDQNTrainerParameters` has no attribute `num_atoms`.
        num_atoms=self.trainer_param.num_atoms,
    )

    if self.use_gpu:
        q_network = q_network.cuda()

    q_network_target = q_network.get_target_network()

    reward_network, q_network_cpe, q_network_cpe_target = None, None, None
    # pyre-fixme[16]: `QRDQNTrainerParameters` has no attribute `evaluation`.
    if self.trainer_param.evaluation.calc_cpe_in_training:
        # Metrics + reward
        num_output_nodes = (len(self.metrics_to_score) + 1) * len(
            # pyre-fixme[16]: `QRDQNTrainerParameters` has no attribute `actions`.
            self.trainer_param.actions
        )

        cpe_net_builder = self.cpe_net_builder.value
        reward_network = cpe_net_builder.build_q_network(
            self.state_feature_config,
            self.state_normalization_data,
            num_output_nodes,
        )
        q_network_cpe = cpe_net_builder.build_q_network(
            self.state_feature_config,
            self.state_normalization_data,
            num_output_nodes,
        )

        if self.use_gpu:
            reward_network.cuda()
            q_network_cpe.cuda()

        q_network_cpe_target = q_network_cpe.get_target_network()

    # pyre-fixme[16]: `DiscreteQRDQN` has no attribute `_q_network`.
    self._q_network = q_network
    trainer = QRDQNTrainer(
        q_network=q_network,
        q_network_target=q_network_target,
        reward_network=reward_network,
        q_network_cpe=q_network_cpe,
        q_network_cpe_target=q_network_cpe_target,
        metrics_to_score=self.metrics_to_score,
        loss_reporter=NoOpLossReporter(),
        use_gpu=self.use_gpu,
        # pyre-fixme[16]: `QRDQNTrainerParameters` has no attribute `asdict`.
        **self.trainer_param.asdict(),
    )
    return trainer
def build_trainer(self) -> C51Trainer:
    net_builder = self.net_builder.value
    q_network = net_builder.build_q_network(
        state_normalization_data=self.state_normalization_data,
        output_dim=len(self.action_names),
        # pyre-fixme[16]: `C51TrainerParameters` has no attribute `num_atoms`.
        num_atoms=self.trainer_param.num_atoms,
        # pyre-fixme[16]: `C51TrainerParameters` has no attribute `qmin`.
        qmin=self.trainer_param.qmin,
        # pyre-fixme[16]: `C51TrainerParameters` has no attribute `qmax`.
        qmax=self.trainer_param.qmax,
    )

    if self.use_gpu:
        q_network = q_network.cuda()

    q_network_target = q_network.get_target_network()

    # pyre-fixme[16]: `DiscreteC51DQN` has no attribute `_q_network`.
    self._q_network = q_network
    return C51Trainer(
        q_network=q_network,
        q_network_target=q_network_target,
        metrics_to_score=self.metrics_to_score,
        loss_reporter=NoOpLossReporter(),
        use_gpu=self.use_gpu,
        # pyre-fixme[16]: `C51TrainerParameters` has no attribute `asdict`.
        **self.trainer_param.asdict(),
    )
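# The build_trainer methods above pair each online Q-network with a copy from
# get_target_network(). As a hedged sketch (not ReAgent's actual
# implementation), a target copy like this is commonly kept in sync with a
# soft (Polyak) update; the `soft_update` helper and `tau` value below are
# illustrative assumptions.
import copy

import torch
import torch.nn as nn


def soft_update(online: nn.Module, target: nn.Module, tau: float = 0.005) -> None:
    # target <- tau * online + (1 - tau) * target, parameter by parameter.
    with torch.no_grad():
        for p_online, p_target in zip(online.parameters(), target.parameters()):
            p_target.mul_(1.0 - tau).add_(p_online, alpha=tau)


# Usage: the target starts as an exact copy and is nudged after each update.
q_net = nn.Linear(4, 2)
q_net_target = copy.deepcopy(q_net)
soft_update(q_net, q_net_target, tau=0.005)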
def __init__(
    self,
    seq2reward_network: Seq2RewardNetwork,
    params: Seq2RewardTrainerParameters,
):
    self.seq2reward_network = seq2reward_network
    self.params = params
    self.optimizer = torch.optim.Adam(
        self.seq2reward_network.parameters(), lr=params.learning_rate
    )
    self.minibatch_size = self.params.batch_size
    self.loss_reporter = NoOpLossReporter()
    # PageHandler must use this to activate evaluator:
    self.calc_cpe_in_training = self.params.calc_cpe_in_training
def build_trainer(self) -> DQNTrainer:
    net_builder = self.net_builder.value
    q_network = net_builder.build_q_network(
        self.state_feature_config,
        self.state_normalization_parameters,
        len(self.action_names),
    )

    if self.use_gpu:
        q_network = q_network.cuda()

    q_network_target = q_network.get_target_network()

    reward_network, q_network_cpe, q_network_cpe_target = None, None, None
    if self.trainer_param.evaluation.calc_cpe_in_training:
        # Metrics + reward
        num_output_nodes = (len(self.metrics_to_score) + 1) * len(
            self.trainer_param.actions
        )

        cpe_net_builder = self.cpe_net_builder.value
        reward_network = cpe_net_builder.build_q_network(
            self.state_feature_config,
            self.state_normalization_parameters,
            num_output_nodes,
        )
        q_network_cpe = cpe_net_builder.build_q_network(
            self.state_feature_config,
            self.state_normalization_parameters,
            num_output_nodes,
        )

        if self.use_gpu:
            reward_network.cuda()
            q_network_cpe.cuda()

        q_network_cpe_target = q_network_cpe.get_target_network()

    self._q_network = q_network
    trainer = DQNTrainer(
        q_network,
        q_network_target,
        reward_network,
        self.trainer_param,
        self.use_gpu,
        q_network_cpe=q_network_cpe,
        q_network_cpe_target=q_network_cpe_target,
        metrics_to_score=self.metrics_to_score,
        loss_reporter=NoOpLossReporter(),
    )
    return trainer
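# Hedged sketch of the CPE head sizing used in the build_trainer methods
# above: the evaluation network emits one output per (metric-or-reward,
# action) pair, hence (len(metrics_to_score) + 1) * len(actions). The
# concrete names and counts below are illustrative assumptions.
metrics_to_score = ["metric_a", "metric_b"]
actions = ["up", "down"]
num_output_nodes = (len(metrics_to_score) + 1) * len(actions)  # (2 metrics + reward) * 2 actions
assert num_output_nodes == 6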
def __init__(
    self,
    seq2reward_network: Seq2RewardNetwork,
    params: Seq2RewardTrainerParameters,
):
    self.seq2reward_network = seq2reward_network
    self.params = params
    self.optimizer = torch.optim.Adam(
        self.seq2reward_network.parameters(), lr=params.learning_rate
    )
    self.minibatch_size = self.params.batch_size
    self.loss_reporter = NoOpLossReporter()
    # PageHandler must use this to activate evaluator:
    self.calc_cpe_in_training = True
    # Whether to output Q values during training:
    self.view_q_value = params.view_q_value
    # permutations used to do planning
    device = get_device(self.seq2reward_network)
    self.all_permut = gen_permutations(
        params.multi_steps, len(self.params.action_names)
    ).to(device)
def __init__(
    self,
    seq2slate_net: Seq2SlateTransformerNet,
    minibatch_size: int = 1024,
    loss_reporter=None,
    use_gpu: bool = False,
    policy_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
) -> None:
    self.loss_reporter = loss_reporter
    self.use_gpu = use_gpu
    self.seq2slate_net = seq2slate_net
    self.minibatch_size = minibatch_size
    self.minibatch = 0
    self.optimizer = policy_optimizer.make_optimizer_scheduler(
        self.seq2slate_net.parameters()
    )["optimizer"]
    self.log_softmax = nn.LogSoftmax(dim=1)
    self.kl_loss = nn.KLDivLoss(reduction="batchmean")
    if self.loss_reporter is None:
        self.loss_reporter = NoOpLossReporter()
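# Hedged sketch of how the LogSoftmax + KLDivLoss pair constructed above is
# typically applied in PyTorch; the tensor names and shapes here are
# assumptions for illustration, not taken from the Seq2Slate trainer itself.
import torch
import torch.nn as nn

log_softmax = nn.LogSoftmax(dim=1)
kl_loss = nn.KLDivLoss(reduction="batchmean")

scores = torch.randn(4, 10, requires_grad=True)  # raw per-item model scores
target_probs = torch.softmax(torch.randn(4, 10), dim=1)  # reference distribution

# KLDivLoss expects log-probabilities as input and probabilities as target;
# "batchmean" divides the summed divergence by the batch size.
loss = kl_loss(log_softmax(scores), target_probs)
loss.backward()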
def build_trainer(self) -> C51Trainer:
    net_builder = self.net_builder.value
    q_network = net_builder.build_q_network(
        self.state_normalization_parameters, len(self.action_names)
    )

    if self.use_gpu:
        q_network = q_network.cuda()

    q_network_target = q_network.get_target_network()
    self._q_network = q_network
    return C51Trainer(
        q_network,
        q_network_target,
        self.trainer_param,
        self.use_gpu,
        metrics_to_score=self.metrics_to_score,
        loss_reporter=NoOpLossReporter(),
    )
def __init__(
    self,
    compress_model_network: FullyConnectedNetwork,
    seq2reward_network: Seq2RewardNetwork,
    params: Seq2RewardTrainerParameters,
):
    self.compress_model_network = compress_model_network
    self.seq2reward_network = seq2reward_network
    self.params = params
    self.optimizer = torch.optim.Adam(
        self.compress_model_network.parameters(),
        lr=params.compress_model_learning_rate,
    )
    self.minibatch_size = self.params.compress_model_batch_size
    self.loss_reporter = NoOpLossReporter()
    # PageHandler must use this to activate evaluator:
    self.calc_cpe_in_training = True
    # permutations used to do planning
    device = get_device(self.compress_model_network)
    self.all_permut = gen_permutations(
        params.multi_steps, len(self.params.action_names)
    ).to(device)
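# Hedged sketch: gen_permutations(multi_steps, num_action) above is a ReAgent
# helper; this stand-in is an illustrative assumption, not the real
# implementation. It enumerates every one-hot action sequence of length
# multi_steps, which is the set an exhaustive multi-step planner iterates over.
import torch
import torch.nn.functional as F


def gen_permutations_sketch(multi_steps: int, num_action: int) -> torch.Tensor:
    # All index sequences, e.g. multi_steps=2, num_action=2 -> 4 sequences.
    idx = torch.cartesian_prod(*[torch.arange(num_action)] * multi_steps)
    idx = idx.reshape(-1, multi_steps)  # cartesian_prod squeezes when multi_steps == 1
    # Shape: (multi_steps, num_action ** multi_steps, num_action), one-hot actions.
    return F.one_hot(idx, num_action).transpose(0, 1).float()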
def __init__(
    self,
    seq2slate_net: Seq2SlateTransformerNet,
    parameters: TransformerParameters,
    minibatch_size: int,
    loss_reporter=None,
    use_gpu: bool = False,
) -> None:
    self.parameters = parameters
    self.loss_reporter = loss_reporter
    self.use_gpu = use_gpu
    self.seq2slate_net = seq2slate_net
    self.minibatch_size = minibatch_size
    self.minibatch = 0
    self.optimizer = torch.optim.Adam(
        self.seq2slate_net.parameters(),
        lr=self.parameters.learning_rate,
        amsgrad=True,
    )
    self.log_softmax = nn.LogSoftmax(dim=1)
    self.kl_loss = nn.KLDivLoss(reduction="batchmean")
    if self.loss_reporter is None:
        self.loss_reporter = NoOpLossReporter()
def __init__(
    self, seq2reward_network: Seq2RewardNetwork, params: Seq2RewardTrainerParameters
):
    self.seq2reward_network = seq2reward_network
    self.params = params
    self.mse_optimizer = torch.optim.Adam(
        self.seq2reward_network.parameters(), lr=params.learning_rate
    )
    self.minibatch_size = self.params.batch_size
    self.loss_reporter = NoOpLossReporter()
    # PageHandler must use this to activate evaluator:
    self.calc_cpe_in_training = True
    # Whether to output Q values during training:
    self.view_q_value = params.view_q_value
    # permutations used to do planning
    self.all_permut = gen_permutations(
        params.multi_steps, len(self.params.action_names)
    )
    self.mse_loss = nn.MSELoss(reduction="mean")
    # Predict how many steps are remaining from the current step
    self.step_predict_network = FullyConnectedNetwork(
        [
            self.seq2reward_network.state_dim,
            self.params.step_predict_net_size,
            self.params.step_predict_net_size,
            self.params.multi_steps,
        ],
        ["relu", "relu", "linear"],
        use_layer_norm=False,
    )
    self.step_loss = nn.CrossEntropyLoss(reduction="mean")
    self.step_optimizer = torch.optim.Adam(
        self.step_predict_network.parameters(), lr=params.learning_rate
    )
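# Hedged sketch of the step-prediction objective set up above: the network
# classifies how many steps remain, so CrossEntropyLoss compares its logits
# against integer step labels. The dimensions and the two-layer MLP stand-in
# are assumptions for illustration, not the trainer's real forward pass.
import torch
import torch.nn as nn

state_dim, multi_steps, batch = 8, 5, 32
step_net = nn.Sequential(
    nn.Linear(state_dim, 64), nn.ReLU(), nn.Linear(64, multi_steps)
)
step_loss = nn.CrossEntropyLoss(reduction="mean")

states = torch.randn(batch, state_dim)
steps_remaining = torch.randint(0, multi_steps, (batch,))  # class labels in [0, multi_steps)

loss = step_loss(step_net(states), steps_remaining)
loss.backward()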