def __init__(
    self,
    seq2slate_net: Seq2SlateTransformerNet,
    minibatch_size: int = 1024,
    parameters: Seq2SlateParameters = field(  # noqa: B008
        default_factory=Seq2SlateParameters
    ),
    baseline_net: Optional[BaselineNet] = None,
    baseline_warmup_num_batches: int = 0,
    use_gpu: bool = False,
    policy_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    baseline_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
) -> None:
    self.seq2slate_net = seq2slate_net
    self.parameters = parameters
    self.use_gpu = use_gpu
    self.minibatch_size = minibatch_size
    self.minibatch = 0
    self.baseline_net = baseline_net
    self.baseline_warmup_num_batches = baseline_warmup_num_batches
    self.rl_opt = policy_optimizer.make_optimizer(self.seq2slate_net.parameters())
    if self.baseline_net:
        self.baseline_opt = baseline_optimizer.make_optimizer(
            # pyre-fixme[16]: `Optional` has no attribute `parameters`.
            self.baseline_net.parameters()
        )
    assert (
        self.parameters.importance_sampling_clamp_max is None
        or not self.parameters.on_policy
    ), (
        "importance_sampling_clamp_max is not useful and should "
        "be set to None in on-policy learning"
    )
def __init__(
    self,
    q_network,
    q_network_target,
    reward_network,
    use_gpu: bool = False,
    # Start ParametricDQNTrainerParameters
    rl: rlp.RLParameters = field(default_factory=rlp.RLParameters),  # noqa: B008
    double_q_learning: bool = True,
    minibatch_size: int = 1024,
    minibatches_per_step: int = 1,
    optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
) -> None:
    super().__init__(rl, use_gpu=use_gpu)
    self.double_q_learning = double_q_learning
    self.minibatch_size = minibatch_size
    self.minibatches_per_step = minibatches_per_step or 1
    self.q_network = q_network
    self.q_network_target = q_network_target
    self.q_network_optimizer = optimizer.make_optimizer(self.q_network.parameters())
    self.reward_network = reward_network
    self.reward_network_optimizer = optimizer.make_optimizer(
        self.reward_network.parameters()
    )
def __init__(
    self,
    seq2slate_net: Seq2SlateTransformerNet,
    minibatch_size: int = 1024,
    parameters: Seq2SlateParameters = field(  # noqa: B008
        default_factory=Seq2SlateParameters
    ),
    baseline_net: Optional[BaselineNet] = None,
    baseline_warmup_num_batches: int = 0,
    use_gpu: bool = False,
    policy_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    baseline_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    print_interval: int = 100,
) -> None:
    self.seq2slate_net = seq2slate_net
    self.parameters = parameters
    self.use_gpu = use_gpu
    self.print_interval = print_interval
    self.minibatch_size = minibatch_size
    self.minibatch = 0
    self.baseline_net = baseline_net
    self.baseline_warmup_num_batches = baseline_warmup_num_batches
    self.rl_opt = policy_optimizer.make_optimizer(self.seq2slate_net.parameters())
    if self.baseline_net:
        self.baseline_opt = baseline_optimizer.make_optimizer(
            # pyre-fixme[16]: `Optional` has no attribute `parameters`.
            self.baseline_net.parameters()
        )
def _initialize_cpe(
    self,
    reward_network,
    q_network_cpe,
    q_network_cpe_target,
    optimizer: Optimizer__Union,
) -> None:
    if self.calc_cpe_in_training:
        assert reward_network is not None, "reward_network is required for CPE"
        # pyre-fixme[16]: `RLTrainer` has no attribute `reward_network`.
        self.reward_network = reward_network
        # pyre-fixme[16]: `RLTrainer` has no attribute `reward_network_optimizer`.
        self.reward_network_optimizer = optimizer.make_optimizer_scheduler(
            self.reward_network.parameters()
        )
        assert (
            q_network_cpe is not None and q_network_cpe_target is not None
        ), "q_network_cpe and q_network_cpe_target are required for CPE"
        # pyre-fixme[16]: `RLTrainer` has no attribute `q_network_cpe`.
        self.q_network_cpe = q_network_cpe
        # pyre-fixme[16]: `RLTrainer` has no attribute `q_network_cpe_target`.
        self.q_network_cpe_target = q_network_cpe_target
        # pyre-fixme[16]: `RLTrainer` has no attribute `q_network_cpe_optimizer`.
        self.q_network_cpe_optimizer = optimizer.make_optimizer_scheduler(
            self.q_network_cpe.parameters()
        )
        num_output_nodes = len(self.metrics_to_score) * self.num_actions
        # pyre-fixme[16]: `RLTrainer` has no attribute `reward_idx_offsets`.
        self.reward_idx_offsets = torch.arange(
            0,
            num_output_nodes,
            self.num_actions,
            device=self.device,
            dtype=torch.long,
        )
    else:
        self.reward_network = None
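The reward_idx_offsets built above are simply multiples of num_actions, one per metric; a small worked example of that arithmetic (the metric names are hypothetical):

import torch

num_actions = 2
metrics_to_score = ["reward", "metric_a", "metric_b"]  # hypothetical metric names
num_output_nodes = len(metrics_to_score) * num_actions  # 3 * 2 = 6
reward_idx_offsets = torch.arange(0, num_output_nodes, num_actions, dtype=torch.long)
# tensor([0, 2, 4]) -- the starting index of each metric's block of
# num_actions outputs in the CPE networks' flattened output.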
def __init__(
    self,
    policy: Policy,
    gamma: float = 0.0,
    optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    optimizer_value_net: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    off_policy: bool = False,
    reward_clip: float = 1e6,
    normalize: bool = True,
    subtract_mean: bool = True,
    offset_clamp_min: bool = False,
    update_freq: int = 100,  # how many env steps between updates
    update_epochs: int = 5,  # how many epochs to run when updating (for PPO)
    ppo_batch_size: int = 10,  # batch size (number of trajectories) used for PPO updates
    ppo_epsilon: float = 0.2,  # clamp importance weights between 1-epsilon and 1+epsilon
    entropy_weight: float = 0.0,  # weight of the entropy term in the PPO loss
    value_net: Optional[ModelBase] = None,
):
    self.scorer = policy.scorer
    self.sampler = policy.sampler
    self.gamma = gamma
    self.optimizer_value_net = optimizer_value_net
    self.off_policy = off_policy
    self.reward_clip = reward_clip
    self.normalize = normalize
    self.subtract_mean = subtract_mean
    self.offset_clamp_min = offset_clamp_min
    self.update_freq = update_freq
    self.update_epochs = update_epochs
    self.ppo_batch_size = ppo_batch_size
    self.ppo_epsilon = ppo_epsilon
    self.entropy_weight = entropy_weight
    self.optimizer = optimizer.make_optimizer(self.scorer.parameters())
    if value_net is not None:
        self.value_net = value_net
        self.value_net_optimizer = optimizer_value_net.make_optimizer(
            self.value_net.parameters()
        )
        self.value_loss_fn = torch.nn.MSELoss(reduction="mean")
    else:
        self.value_net = None
        self.value_net_optimizer = None
    assert (ppo_epsilon >= 0) and (
        ppo_epsilon <= 1
    ), "ppo_epsilon has to be in [0, 1]"
    self.step = 0
    self.traj_buffer = []
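The ppo_epsilon and entropy_weight arguments above feed the standard clipped PPO surrogate. A minimal illustrative sketch of such a loss (assumed here for clarity; not taken from this trainer's update code):

import torch

def clipped_ppo_loss(new_log_prob, old_log_prob, advantage, ppo_epsilon, entropy_weight, entropy):
    # Importance weight of the current policy relative to the behavior policy.
    ratio = torch.exp(new_log_prob - old_log_prob)
    # Clamp importance weights between 1-epsilon and 1+epsilon (see the
    # ppo_epsilon comment in __init__ above).
    clipped_ratio = torch.clamp(ratio, 1.0 - ppo_epsilon, 1.0 + ppo_epsilon)
    # Pessimistic (min) surrogate, negated for gradient descent, minus the
    # entropy bonus weighted by entropy_weight.
    surrogate = torch.min(ratio * advantage, clipped_ratio * advantage).mean()
    return -(surrogate + entropy_weight * entropy.mean())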
def test_linear_reward_parametric_reward(self):
    """
    Reward at each step is a linear function of state and action.
    However, we can only observe aggregated reward at the last step
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 10000
    sizes = [256, 128]
    activations = ["relu", "relu"]
    last_layer_activation = "linear"
    reward_net = SingleStepSyntheticRewardNet(
        state_dim=state_dim,
        action_dim=action_dim,
        sizes=sizes,
        activations=activations,
        last_layer_activation=last_layer_activation,
    )
    optimizer = Optimizer__Union(SGD=classes["SGD"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(
            trainer.loss_type,
            str(reward_net),
        )
    )
    weight, data = create_data(
        state_dim, action_dim, seq_len, batch_size, num_batches
    )
    threshold = 0.1
    avg_eval_loss = train_and_eval(trainer, data)
    assert avg_eval_loss < threshold
def test_ngram_fc_parametric_reward(self):
    """
    Reward at each step is a linear function of states and actions
    in a context window around the step.
    However, we can only observe aggregated reward at the last step
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 5000
    sizes = [256, 128]
    activations = ["relu", "relu"]
    last_layer_activation = "linear"
    reward_net = synthetic_reward.NGramSyntheticRewardNet(
        state_dim=state_dim,
        action_dim=action_dim,
        sizes=sizes,
        activations=activations,
        last_layer_activation=last_layer_activation,
        context_size=3,
    )
    optimizer = Optimizer__Union(Adam=classes["Adam"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(
            trainer.loss_type,
            str(reward_net),
        )
    )
    weight, data = create_data(
        state_dim, action_dim, seq_len, batch_size, num_batches
    )
    threshold = 0.2
    avg_eval_loss = train_and_eval(trainer, data)
    assert avg_eval_loss < threshold
def test_linear_reward_parametric_reward(self):
    """
    Reward at each step is a linear function of state and action.
    However, we can only observe aggregated reward at the last step
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 10000
    sizes = [256, 128]
    activations = ["relu", "relu"]
    last_layer_activation = "linear"
    reward_net = SingleStepSyntheticRewardNet(
        state_dim=state_dim,
        action_dim=action_dim,
        sizes=sizes,
        activations=activations,
        last_layer_activation=last_layer_activation,
    )
    optimizer = Optimizer__Union(SGD=classes["SGD"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    weight, data_generator = create_data(
        state_dim, action_dim, seq_len, batch_size, num_batches
    )
    threshold = 0.1
    reach_threshold = False
    for batch in data_generator():
        loss = trainer.train(batch)
        if loss < threshold:
            reach_threshold = True
            break
    assert reach_threshold, f"last loss={loss}"
def create_trainer(seq2slate_net, learning_method, batch_size, learning_rate, device):
    use_gpu = device != torch.device("cpu")
    if learning_method == ON_POLICY:
        seq2slate_params = Seq2SlateParameters(
            on_policy=True,
            learning_method=LearningMethod.REINFORCEMENT_LEARNING,
        )
        trainer_cls = Seq2SlateTrainer
    elif learning_method == SIMULATION:
        temp_reward_model_path = tempfile.mkstemp(suffix=".pt")[1]
        reward_model = torch.jit.script(TSPRewardModel())
        torch.jit.save(reward_model, temp_reward_model_path)
        seq2slate_params = Seq2SlateParameters(
            on_policy=True,
            learning_method=LearningMethod.SIMULATION,
            simulation=SimulationParameters(
                reward_name_weight={"tour_length": 1.0},
                reward_name_path={"tour_length": temp_reward_model_path},
            ),
        )
        trainer_cls = Seq2SlateSimulationTrainer
    param_dict = {
        "seq2slate_net": seq2slate_net,
        "minibatch_size": batch_size,
        "parameters": seq2slate_params,
        "policy_optimizer": Optimizer__Union.default(lr=learning_rate),
        "use_gpu": use_gpu,
        "print_interval": 100,
    }
    return trainer_cls(**param_dict)
def __init__(
    self,
    q_network,
    q_network_target,
    metrics_to_score=None,
    reward_network=None,
    q_network_cpe=None,
    q_network_cpe_target=None,
    loss_reporter=None,
    use_gpu: bool = False,
    actions: List[str] = field(default_factory=list),  # noqa: B008
    rl: RLParameters = field(default_factory=RLParameters),  # noqa: B008
    double_q_learning: bool = True,
    num_atoms: int = 51,
    minibatch_size: int = 1024,
    minibatches_per_step: int = 1,
    optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    cpe_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    evaluation: EvaluationParameters = field(  # noqa: B008
        default_factory=EvaluationParameters
    ),
) -> None:
    super().__init__(
        rl,
        use_gpu=use_gpu,
        metrics_to_score=metrics_to_score,
        actions=actions,
        evaluation_parameters=evaluation,
        loss_reporter=loss_reporter,
    )
    self.double_q_learning = double_q_learning
    self.minibatch_size = minibatch_size
    self.minibatches_per_step = minibatches_per_step
    self._actions = actions
    self.q_network = q_network
    self.q_network_target = q_network_target
    self.q_network_optimizer = optimizer.make_optimizer(self.q_network.parameters())
    self.num_atoms = num_atoms
    self.quantiles = (
        (0.5 + torch.arange(self.num_atoms, device=self.device).float())
        / float(self.num_atoms)
    ).view(1, -1)
    self._initialize_cpe(
        reward_network,
        q_network_cpe,
        q_network_cpe_target,
        optimizer=cpe_optimizer,
    )
    self.reward_boosts = torch.zeros([1, len(self._actions)], device=self.device)
    if rl.reward_boost is not None:
        # pyre-fixme[16]: Optional type has no attribute `keys`.
        for k in rl.reward_boost.keys():
            i = self._actions.index(k)
            # pyre-fixme[16]: Optional type has no attribute `__getitem__`.
            self.reward_boosts[0, i] = rl.reward_boost[k]
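For reference, the quantiles tensor built above holds the quantile midpoints (2i + 1) / (2 * num_atoms); a quick illustrative check of that arithmetic with the default num_atoms:

import torch

num_atoms = 51
quantiles = ((0.5 + torch.arange(num_atoms).float()) / float(num_atoms)).view(1, -1)
# Shape (1, 51); first midpoint 0.5/51 ~= 0.0098, last midpoint 50.5/51 ~= 0.9902,
# i.e. evenly spaced quantile midpoints in (0, 1), one per atom.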
def __init__(
    self,
    q_network,
    q_network_target,
    use_gpu: bool = False,
    # Start SlateQTrainerParameters
    rl: rlp.RLParameters = field(  # noqa: B008
        default_factory=lambda: rlp.RLParameters(maxq_learning=False)
    ),
    optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    single_selection: bool = True,
    minibatch_size: int = 1024,
    evaluation: rlp.EvaluationParameters = field(  # noqa: B008
        default_factory=lambda: rlp.EvaluationParameters(calc_cpe_in_training=False)
    ),
) -> None:
    super().__init__(rl, use_gpu=use_gpu)
    self.minibatches_per_step = 1
    self.minibatch_size = minibatch_size
    self.single_selection = single_selection
    self.q_network = q_network
    self.q_network_target = q_network_target
    self.q_network_optimizer = optimizer.make_optimizer(self.q_network.parameters())
def test_lstm_parametric_reward(self):
    """
    Reward at each step is a linear function of states and actions
    in a context window around the step.
    However, we can only observe aggregated reward at the last step
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 5000
    last_layer_activation = "linear"
    reward_net = synthetic_reward.SequenceSyntheticRewardNet(
        state_dim=state_dim,
        action_dim=action_dim,
        lstm_hidden_size=128,
        lstm_num_layers=2,
        lstm_bidirectional=True,
        last_layer_activation=last_layer_activation,
    )
    optimizer = Optimizer__Union(Adam=classes["Adam"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(
            trainer.loss_type,
            str(reward_net),
        )
    )
    weight, data = create_sequence_data(
        state_dim, action_dim, seq_len, batch_size, num_batches
    )
    threshold = 0.2
    avg_eval_loss = train_and_eval(trainer, data)
    assert avg_eval_loss < threshold
def __init__(
    self,
    actor_network,
    q1_network,
    q2_network=None,
    use_gpu: bool = False,
    # Start TD3TrainerParameters
    rl: RLParameters = field(default_factory=RLParameters),  # noqa: B008
    q_network_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    actor_network_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    minibatch_size: int = 64,
    use_2_q_functions: bool = True,
    noise_variance: float = 0.2,
    noise_clip: float = 0.5,
    delayed_policy_update: int = 2,
    minibatches_per_step: int = 1,
) -> None:
    """
    Args: TODO: fill in
    """
    super().__init__(rl, use_gpu=use_gpu)
    self.minibatch_size = minibatch_size
    self.minibatches_per_step = minibatches_per_step or 1
    self.q1_network = q1_network
    self.q1_network_target = copy.deepcopy(self.q1_network)
    self.q1_network_optimizer = q_network_optimizer.make_optimizer(
        q1_network.parameters()
    )
    self.q2_network = q2_network
    if self.q2_network is not None:
        self.q2_network_target = copy.deepcopy(self.q2_network)
        self.q2_network_optimizer = q_network_optimizer.make_optimizer(
            q2_network.parameters()
        )
    self.actor_network = actor_network
    self.actor_network_target = copy.deepcopy(self.actor_network)
    self.actor_network_optimizer = actor_network_optimizer.make_optimizer(
        actor_network.parameters()
    )
    self.noise_variance = noise_variance
    self.noise_clip_range = (-noise_clip, noise_clip)
    self.delayed_policy_update = delayed_policy_update
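noise_variance, noise_clip_range, and delayed_policy_update above parameterize TD3-style target policy smoothing and delayed actor updates. A minimal sketch of the smoothing step, assuming noise_variance is used directly as the noise scale (an assumption for illustration; not this trainer's actual update code):

import torch

def smooth_target_action(next_action, noise_variance, noise_clip_range):
    # Clipped Gaussian noise added to the target actor's action, as in TD3
    # target policy smoothing. noise_clip_range is the (-noise_clip, noise_clip)
    # tuple built in __init__ above.
    noise = (torch.randn_like(next_action) * noise_variance).clamp(*noise_clip_range)
    return next_action + noise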
def create_trainer(seq2slate_net, batch_size, learning_rate, device, on_policy):
    use_gpu = device != torch.device("cpu")
    return Seq2SlateTrainer(
        seq2slate_net=seq2slate_net,
        minibatch_size=batch_size,
        parameters=Seq2SlateParameters(on_policy=on_policy),
        policy_optimizer=Optimizer__Union.default(lr=learning_rate),
        use_gpu=use_gpu,
        print_interval=100,
    )
def __init__(
    self,
    q_network,
    q_network_target,
    metrics_to_score=None,
    loss_reporter=None,
    use_gpu: bool = False,
    actions: List[str] = field(default_factory=list),  # noqa: B008
    rl: RLParameters = field(default_factory=RLParameters),  # noqa: B008
    double_q_learning: bool = True,
    minibatch_size: int = 1024,
    minibatches_per_step: int = 1,
    num_atoms: int = 51,
    qmin: float = -100,
    qmax: float = 200,
    optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    evaluation: EvaluationParameters = field(  # noqa: B008
        default_factory=EvaluationParameters
    ),
) -> None:
    RLTrainer.__init__(
        self,
        rl,
        use_gpu=use_gpu,
        metrics_to_score=metrics_to_score,
        actions=actions,
        loss_reporter=loss_reporter,
    )
    self.double_q_learning = double_q_learning
    self.minibatch_size = minibatch_size
    self.minibatches_per_step = minibatches_per_step
    self._actions = actions
    self.q_network = q_network
    self.q_network_target = q_network_target
    self.q_network_optimizer = optimizer.make_optimizer(q_network.parameters())
    self.qmin = qmin
    self.qmax = qmax
    self.num_atoms = num_atoms
    self.support = torch.linspace(
        self.qmin, self.qmax, self.num_atoms, device=self.device
    )
    self.scale_support = (self.qmax - self.qmin) / (self.num_atoms - 1.0)
    self.reward_boosts = torch.zeros([1, len(self._actions)], device=self.device)
    if rl.reward_boost is not None:
        # pyre-fixme[16]: Optional type has no attribute `keys`.
        for k in rl.reward_boost.keys():
            i = self._actions.index(k)
            # pyre-fixme[16]: Optional type has no attribute `__getitem__`.
            self.reward_boosts[0, i] = rl.reward_boost[k]
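With the defaults above (qmin=-100, qmax=200, num_atoms=51), the categorical support is a grid of 51 evenly spaced atoms and scale_support is their spacing; a quick numeric check of that construction:

import torch

qmin, qmax, num_atoms = -100.0, 200.0, 51
support = torch.linspace(qmin, qmax, num_atoms)      # -100, -94, -88, ..., 200
scale_support = (qmax - qmin) / (num_atoms - 1.0)    # 300 / 50 = 6.0
# The spacing between consecutive atoms equals scale_support.
assert torch.allclose(support[1] - support[0], torch.tensor(scale_support))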
def __init__(
    self,
    reward_net: ModelBase,
    use_gpu: bool = False,
    minibatch_size: int = 1024,
    optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
) -> None:
    self.reward_net = reward_net
    self.use_gpu = use_gpu
    self.minibatch_size = minibatch_size
    self.minibatch = 0
    self.loss_fn = torch.nn.MSELoss(reduction="mean")
    self.opt = optimizer.make_optimizer(self.reward_net.parameters())
def create_trainer(
    seq2slate_net,
    learning_rate,
    seq2slate_params,
    policy_gradient_interval,
):
    return Seq2SlateTrainer(
        seq2slate_net=seq2slate_net,
        params=seq2slate_params,
        policy_optimizer=Optimizer__Union(SGD=classes["SGD"](lr=learning_rate)),
        policy_gradient_interval=policy_gradient_interval,
        print_interval=1,
    )
def test_ngram_conv_net_parametric_reward(self):
    """
    Reward at each step is a linear function of states and actions
    in a context window around the step.
    However, we can only observe aggregated reward at the last step
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 5000
    sizes = [128, 64]
    activations = ["relu", "relu"]
    last_layer_activation = "linear"
    conv_net_params = rlp.ConvNetParameters(
        conv_dims=[128],
        conv_height_kernels=[1],
        pool_types=["max"],
        pool_kernel_sizes=[1],
    )
    conv_net = synthetic_reward.NGramConvolutionalNetwork(
        state_dim=state_dim,
        action_dim=action_dim,
        sizes=sizes,
        activations=activations,
        last_layer_activation=last_layer_activation,
        context_size=3,
        conv_net_params=conv_net_params,
    )
    reward_net = synthetic_reward.NGramSyntheticRewardNet(
        state_dim=state_dim,
        action_dim=action_dim,
        context_size=3,
        net=conv_net,
    )
    optimizer = Optimizer__Union(Adam=classes["Adam"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(
            trainer.loss_type,
            str(reward_net),
        )
    )
    weight, data = create_sequence_data(
        state_dim, action_dim, seq_len, batch_size, num_batches
    )
    threshold = 0.2
    avg_eval_loss = train_and_eval(trainer, data)
    assert avg_eval_loss < threshold
def __init__(
    self,
    imitator,
    use_gpu: bool = False,
    rl: RLParameters = field(default_factory=RLParameters),  # noqa: B008
    minibatch_size: int = 1024,
    minibatches_per_step: int = 1,
    optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
) -> None:
    super().__init__(rl, use_gpu=use_gpu)
    self.minibatch_size = minibatch_size
    self.minibatches_per_step = minibatches_per_step or 1
    self.imitator = imitator
    self.imitator_optimizer = optimizer.make_optimizer(imitator.parameters())
def __init__(
    self,
    reward_net: ModelBase,
    use_gpu: bool = False,
    minibatch_size: int = 1024,
    optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    loss_type: LossFunction = LossFunction.MSE,
) -> None:
    self.reward_net = reward_net
    self.use_gpu = use_gpu
    self.minibatch_size = minibatch_size
    self.minibatch = 0
    self.opt = optimizer.make_optimizer(self.reward_net.parameters())
    self.loss_type = loss_type
    self.loss_fn = _get_loss_function(loss_type)
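A minimal usage sketch of this trainer, mirroring the reward-network tests in this file (reward_net and data are assumed to come from the same helpers those tests use):

# Build the optimizer config and trainer exactly as the tests above do.
optimizer = Optimizer__Union(Adam=classes["Adam"]())
trainer = RewardNetTrainer(reward_net, optimizer)
trainer.set_reporter(RewardNetworkReporter(trainer.loss_type, str(reward_net)))
# data comes from create_data(...) / create_sequence_data(...) as in the tests.
avg_eval_loss = train_and_eval(trainer, data)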
def test_transformer_parametric_reward(self):
    """
    Reward at each step is a linear function of states and actions
    in a context window around the step.
    However, we can only observe aggregated reward at the last step
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 10000
    d_model = 64
    nhead = 8
    num_encoder_layers = 1
    dim_feedforward = 64
    last_layer_activation = "linear"
    max_len = seq_len + 1
    reward_net = SyntheticRewardNet(
        TransformerSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=0.0,
            activation="relu",
            last_layer_activation=last_layer_activation,
            layer_norm_eps=1e-5,
            max_len=max_len,
        )
    )
    optimizer = Optimizer__Union(Adam=classes["Adam"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(
            trainer.loss_type,
            str(reward_net),
        )
    )
    weight, data = create_sequence_data(
        state_dim, action_dim, seq_len, batch_size, num_batches
    )
    threshold = 0.25
    avg_eval_loss = train_and_eval(trainer, data)
    assert avg_eval_loss < threshold, "loss = {:.4f} larger than threshold {}".format(
        avg_eval_loss, threshold
    )
def create_trainer(
    seq2slate_net,
    batch_size,
    learning_rate,
    device,
    seq2slate_params,
    policy_gradient_interval,
):
    use_gpu = device != torch.device("cpu")
    return Seq2SlateTrainer(
        seq2slate_net=seq2slate_net,
        minibatch_size=batch_size,
        parameters=seq2slate_params,
        policy_optimizer=Optimizer__Union(SGD=classes["SGD"](lr=learning_rate)),
        use_gpu=use_gpu,
        policy_gradient_interval=policy_gradient_interval,
        print_interval=1,
    )
def __init__(
    self,
    seq2slate_net: Seq2SlateTransformerNet,
    parameters: Seq2SlateParameters,
    minibatch_size: int,
    use_gpu: bool = False,
    policy_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
) -> None:
    self.parameters = parameters
    self.use_gpu = use_gpu
    self.seq2slate_net = seq2slate_net
    self.minibatch_size = minibatch_size
    self.minibatch = 0
    self.optimizer = policy_optimizer.make_optimizer(self.seq2slate_net.parameters())
    self.kl_div_loss = nn.KLDivLoss(reduction="batchmean")
def __init__(
    self,
    reward_net: ModelBase,
    use_gpu: bool = False,
    optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    loss_type: LossFunction = LossFunction.MSE,
    reward_ignore_threshold: Optional[float] = None,
    weighted_by_inverse_propensity: bool = False,
) -> None:
    self.reward_net = reward_net
    self.use_gpu = use_gpu
    self.minibatch = 0
    self.opt = optimizer.make_optimizer(self.reward_net.parameters())
    self.loss_type = loss_type
    self.reward_ignore_threshold = reward_ignore_threshold
    self.weighted_by_inverse_propensity = weighted_by_inverse_propensity
    self.loss_fn = _get_loss_function(
        loss_type, reward_ignore_threshold, weighted_by_inverse_propensity
    )
def __init__(
    self,
    seq2slate_net: Seq2SlateTransformerNet,
    minibatch_size: int = 1024,
    loss_reporter=None,
    use_gpu: bool = False,
    policy_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
) -> None:
    self.loss_reporter = loss_reporter
    self.use_gpu = use_gpu
    self.seq2slate_net = seq2slate_net
    self.minibatch_size = minibatch_size
    self.minibatch = 0
    self.optimizer = policy_optimizer.make_optimizer_scheduler(
        self.seq2slate_net.parameters()
    )["optimizer"]
    self.log_softmax = nn.LogSoftmax(dim=1)
    self.kl_loss = nn.KLDivLoss(reduction="batchmean")
    if self.loss_reporter is None:
        self.loss_reporter = NoOpLossReporter()
def __init__(
    self,
    seq2slate_net: Seq2SlateTransformerNet,
    parameters: Seq2SlateParameters,
    minibatch_size: int,
    use_gpu: bool = False,
    policy_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    print_interval: int = 100,
) -> None:
    self.parameters = parameters
    self.use_gpu = use_gpu
    self.print_interval = print_interval
    self.seq2slate_net = seq2slate_net
    self.minibatch_size = minibatch_size
    self.minibatch = 0
    self.optimizer = policy_optimizer.make_optimizer(self.seq2slate_net.parameters())
    # TODO: T62269969 add baseline_net in training
    self.kl_div_loss = nn.KLDivLoss(reduction="none")
def _test_linear_reward_parametric_reward(
    self, ground_truth_reward_from_multiple_steps=False
):
    """
    Reward at each step is a linear function of present state and action.
    However, we can only observe aggregated reward at the last step.

    This model will fail to learn when ground-truth reward is a function
    of multiple steps' states and actions.
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 5000
    sizes = [256, 128]
    activations = ["relu", "relu"]
    last_layer_activation = "linear"
    reward_net = SyntheticRewardNet(
        SingleStepSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=sizes,
            activations=activations,
            last_layer_activation=last_layer_activation,
        )
    )
    optimizer = Optimizer__Union(Adam=classes["Adam"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(
            trainer.loss_type,
            str(reward_net),
        )
    )
    if ground_truth_reward_from_multiple_steps:
        weight, data = create_sequence_data(
            state_dim, action_dim, seq_len, batch_size, num_batches
        )
    else:
        weight, data = create_data(
            state_dim, action_dim, seq_len, batch_size, num_batches
        )
    avg_eval_loss = train_and_eval(trainer, data)
    return avg_eval_loss
def __init__(
    self,
    actor_network,
    q1_network,
    q2_network=None,
    value_network=None,
    use_gpu: bool = False,
    # Start SACTrainerParameters
    rl: RLParameters = field(default_factory=RLParameters),  # noqa: B008
    q_network_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    value_network_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    actor_network_optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    alpha_optimizer: Optional[Optimizer__Union] = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    minibatch_size: int = 1024,
    entropy_temperature: float = 0.01,
    logged_action_uniform_prior: bool = True,
    target_entropy: float = -1.0,
    action_embedding_kld_weight: Optional[float] = None,
    apply_kld_on_mean: bool = False,
    action_embedding_mean: Optional[List[float]] = None,
    action_embedding_variance: Optional[List[float]] = None,
) -> None:
    """
    Args:
        actor_network: states -> actions, trained to maximize soft value,
            which is value + policy entropy.
        q1_network: states, action -> q-value
        q2_network (optional): double q-learning to stabilize training
            from overestimation bias
        value_network (optional): states -> value of state under actor
        # alpha in the paper; controlling explore & exploit
        # TODO: finish
    """
    super().__init__(rl, use_gpu=use_gpu)
    self.minibatch_size = minibatch_size
    self.minibatches_per_step = 1
    self.q1_network = q1_network
    self.q1_network_optimizer = q_network_optimizer.make_optimizer(
        q1_network.parameters()
    )
    self.q2_network = q2_network
    if self.q2_network is not None:
        self.q2_network_optimizer = q_network_optimizer.make_optimizer(
            q2_network.parameters()
        )
    self.value_network = value_network
    if self.value_network is not None:
        self.value_network_optimizer = value_network_optimizer.make_optimizer(
            value_network.parameters()
        )
        self.value_network_target = copy.deepcopy(self.value_network)
    else:
        self.q1_network_target = copy.deepcopy(self.q1_network)
        self.q2_network_target = copy.deepcopy(self.q2_network)
    self.actor_network = actor_network
    self.actor_network_optimizer = actor_network_optimizer.make_optimizer(
        actor_network.parameters()
    )
    self.entropy_temperature = entropy_temperature
    self.alpha_optimizer = None
    device = "cuda" if use_gpu else "cpu"
    if alpha_optimizer is not None:
        self.target_entropy = target_entropy
        self.log_alpha = torch.tensor(
            [np.log(self.entropy_temperature)], requires_grad=True, device=device
        )
        self.alpha_optimizer = alpha_optimizer.make_optimizer([self.log_alpha])
    self.logged_action_uniform_prior = logged_action_uniform_prior
    self.add_kld_to_loss = bool(action_embedding_kld_weight)
    self.apply_kld_on_mean = apply_kld_on_mean
    if self.add_kld_to_loss:
        self.kld_weight = action_embedding_kld_weight
        self.action_emb_mean = torch.tensor(action_embedding_mean, device=device)
        self.action_emb_variance = torch.tensor(
            action_embedding_variance, device=device
        )
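When alpha_optimizer is given, the log_alpha created above is typically tuned so that the policy's entropy tracks target_entropy. A sketch of the standard SAC temperature update (illustrative only; not taken from this trainer):

import torch

def alpha_update_step(log_alpha, alpha_optimizer, log_prob, target_entropy):
    # Standard SAC temperature loss: increase alpha when policy entropy is
    # below target_entropy, decrease it otherwise.
    alpha_loss = -(log_alpha * (log_prob + target_entropy).detach()).mean()
    alpha_optimizer.zero_grad()
    alpha_loss.backward()
    alpha_optimizer.step()
    # The new entropy temperature is exp(log_alpha).
    return log_alpha.exp().item()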
def __init__(
    self,
    q_network,
    q_network_target,
    reward_network,
    q_network_cpe=None,
    q_network_cpe_target=None,
    metrics_to_score=None,
    imitator=None,
    loss_reporter=None,
    use_gpu: bool = False,
    actions: List[str] = field(default_factory=list),  # noqa: B008
    rl: RLParameters = field(default_factory=RLParameters),  # noqa: B008
    double_q_learning: bool = True,
    bcq: Optional[BCQConfig] = None,
    minibatch_size: int = 1024,
    minibatches_per_step: int = 1,
    optimizer: Optimizer__Union = field(  # noqa: B008
        default_factory=Optimizer__Union.default
    ),
    evaluation: EvaluationParameters = field(  # noqa: B008
        default_factory=EvaluationParameters
    ),
) -> None:
    super().__init__(
        rl,
        use_gpu=use_gpu,
        metrics_to_score=metrics_to_score,
        actions=actions,
        evaluation_parameters=evaluation,
        loss_reporter=loss_reporter,
    )
    assert self._actions is not None, "Discrete-action DQN needs action names"
    self.double_q_learning = double_q_learning
    self.minibatch_size = minibatch_size
    self.minibatches_per_step = minibatches_per_step or 1
    self.q_network = q_network
    self.q_network_target = q_network_target
    self.q_network_optimizer = optimizer.make_optimizer(q_network.parameters())
    self._initialize_cpe(
        reward_network, q_network_cpe, q_network_cpe_target, optimizer=optimizer
    )
    # pyre-fixme[6]: Expected `Sized` for 1st param but got `Optional[List[str]]`.
    self.reward_boosts = torch.zeros([1, len(self._actions)], device=self.device)
    if rl.reward_boost is not None:
        # pyre-fixme[16]: `Optional` has no attribute `keys`.
        for k in rl.reward_boost.keys():
            # pyre-fixme[16]: `Optional` has no attribute `index`.
            i = self._actions.index(k)
            # pyre-fixme[16]: `Optional` has no attribute `__getitem__`.
            self.reward_boosts[0, i] = rl.reward_boost[k]
    # Batch constrained q-learning
    self.bcq = bcq is not None
    if self.bcq:
        assert bcq is not None
        self.bcq_drop_threshold = bcq.drop_threshold
        self.bcq_imitator = imitator