Example #1
    def __init__(
        self,
        seq2slate_net: Seq2SlateTransformerNet,
        minibatch_size: int = 1024,
        parameters: Seq2SlateParameters = field(  # noqa: B008
            default_factory=Seq2SlateParameters),
        baseline_net: Optional[BaselineNet] = None,
        baseline_warmup_num_batches: int = 0,
        use_gpu: bool = False,
        policy_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        baseline_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
    ) -> None:
        self.seq2slate_net = seq2slate_net
        self.parameters = parameters
        self.use_gpu = use_gpu

        self.minibatch_size = minibatch_size
        self.minibatch = 0

        self.baseline_net = baseline_net
        self.baseline_warmup_num_batches = baseline_warmup_num_batches

        self.rl_opt = policy_optimizer.make_optimizer(
            self.seq2slate_net.parameters())
        if self.baseline_net:
            self.baseline_opt = baseline_optimizer.make_optimizer(
                # pyre-fixme[16]: `Optional` has no attribute `parameters`.
                self.baseline_net.parameters())

        assert (self.parameters.importance_sampling_clamp_max is None
                or not self.parameters.on_policy), (
                    "importance_sampling_clamp_max is not useful and should "
                    "be set to None in on-policy learning")
Example #2
    def __init__(
        self,
        q_network,
        q_network_target,
        reward_network,
        use_gpu: bool = False,
        # Start ParametricDQNTrainerParameters
        rl: rlp.RLParameters = field(
            default_factory=rlp.RLParameters),  # noqa: B008
        double_q_learning: bool = True,
        minibatch_size: int = 1024,
        minibatches_per_step: int = 1,
        optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
    ) -> None:
        super().__init__(rl, use_gpu=use_gpu)

        self.double_q_learning = double_q_learning
        self.minibatch_size = minibatch_size
        self.minibatches_per_step = minibatches_per_step or 1

        self.q_network = q_network
        self.q_network_target = q_network_target
        self.q_network_optimizer = optimizer.make_optimizer(
            self.q_network.parameters())

        self.reward_network = reward_network
        self.reward_network_optimizer = optimizer.make_optimizer(
            self.reward_network.parameters())
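
# A generic sketch (plain PyTorch, not ReAgent's API) of the double Q-learning target
# the double_q_learning flag above typically selects: the online network picks the
# next action, the target network evaluates it.
import torch

def td_target(reward, not_terminal, gamma, next_q_online, next_q_target, double_q=True):
    # next_q_online / next_q_target: [batch, num_actions] Q-values at the next state
    if double_q:
        next_action = next_q_online.argmax(dim=1, keepdim=True)
        next_q = next_q_target.gather(1, next_action).squeeze(1)
    else:
        next_q = next_q_target.max(dim=1).values
    return reward + not_terminal * gamma * next_q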
Example #3
    def __init__(
        self,
        seq2slate_net: Seq2SlateTransformerNet,
        minibatch_size: int = 1024,
        parameters: Seq2SlateParameters = field(  # noqa: B008
            default_factory=Seq2SlateParameters),
        baseline_net: Optional[BaselineNet] = None,
        baseline_warmup_num_batches: int = 0,
        use_gpu: bool = False,
        policy_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        baseline_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        print_interval: int = 100,
    ) -> None:
        self.seq2slate_net = seq2slate_net
        self.parameters = parameters
        self.use_gpu = use_gpu
        self.print_interval = print_interval

        self.minibatch_size = minibatch_size
        self.minibatch = 0

        self.baseline_net = baseline_net
        self.baseline_warmup_num_batches = baseline_warmup_num_batches

        self.rl_opt = policy_optimizer.make_optimizer(
            self.seq2slate_net.parameters())
        if self.baseline_net:
            self.baseline_opt = baseline_optimizer.make_optimizer(
                # pyre-fixme[16]: `Optional` has no attribute `parameters`.
                self.baseline_net.parameters())
Example #4
 def _initialize_cpe(
     self,
     reward_network,
     q_network_cpe,
     q_network_cpe_target,
     optimizer: Optimizer__Union,
 ) -> None:
     if self.calc_cpe_in_training:
         assert reward_network is not None, "reward_network is required for CPE"
         # pyre-fixme[16]: `RLTrainer` has no attribute `reward_network`.
         self.reward_network = reward_network
         # pyre-fixme[16]: `RLTrainer` has no attribute `reward_network_optimizer`.
         self.reward_network_optimizer = optimizer.make_optimizer_scheduler(
             self.reward_network.parameters())
         assert (
             q_network_cpe is not None and q_network_cpe_target is not None
         ), "q_network_cpe and q_network_cpe_target are required for CPE"
         # pyre-fixme[16]: `RLTrainer` has no attribute `q_network_cpe`.
         self.q_network_cpe = q_network_cpe
         # pyre-fixme[16]: `RLTrainer` has no attribute `q_network_cpe_target`.
         self.q_network_cpe_target = q_network_cpe_target
         # pyre-fixme[16]: `RLTrainer` has no attribute `q_network_cpe_optimizer`.
         self.q_network_cpe_optimizer = optimizer.make_optimizer_scheduler(
             self.q_network_cpe.parameters())
         num_output_nodes = len(self.metrics_to_score) * self.num_actions
         # pyre-fixme[16]: `RLTrainer` has no attribute `reward_idx_offsets`.
         self.reward_idx_offsets = torch.arange(
             0,
             num_output_nodes,
             self.num_actions,
             device=self.device,
             dtype=torch.long,
         )
     else:
         self.reward_network = None
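
# A small illustration (assumed semantics) of reward_idx_offsets above: the CPE
# networks emit len(metrics_to_score) * num_actions outputs, and the offsets mark
# where each metric's block of per-action values starts.
import torch

num_actions = 3
metrics_to_score = ["reward", "metric_a"]
num_output_nodes = len(metrics_to_score) * num_actions
reward_idx_offsets = torch.arange(0, num_output_nodes, num_actions, dtype=torch.long)
# tensor([0, 3]): columns 0-2 belong to "reward", columns 3-5 to "metric_a"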
Example #5
    def __init__(
        self,
        policy: Policy,
        gamma: float = 0.0,
        optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default
        ),
        optimizer_value_net: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default
        ),
        off_policy: bool = False,
        reward_clip: float = 1e6,
        normalize: bool = True,
        subtract_mean: bool = True,
        offset_clamp_min: bool = False,
        update_freq: int = 100,  # how many env steps between updates
        update_epochs: int = 5,  # how many epochs to run when updating (for PPO)
        ppo_batch_size: int = 10,  # batch size (number of trajectories) used for PPO updates
        ppo_epsilon: float = 0.2,  # clamp importance weights between 1-epsilon and 1+epsilon
        entropy_weight: float = 0.0,  # weight of the entropy term in the PPO loss
        value_net: Optional[ModelBase] = None,
    ):
        self.scorer = policy.scorer
        self.sampler = policy.sampler
        self.gamma = gamma
        self.optimizer_value_net = optimizer_value_net
        self.off_policy = off_policy
        self.reward_clip = reward_clip
        self.normalize = normalize
        self.subtract_mean = subtract_mean
        self.offset_clamp_min = offset_clamp_min
        self.update_freq = update_freq
        self.update_epochs = update_epochs
        self.ppo_batch_size = ppo_batch_size
        self.ppo_epsilon = ppo_epsilon
        self.entropy_weight = entropy_weight

        self.optimizer = optimizer.make_optimizer(self.scorer.parameters())
        if value_net is not None:
            self.value_net = value_net
            self.value_net_optimizer = optimizer_value_net.make_optimizer(
                self.value_net.parameters()
            )
            self.value_loss_fn = torch.nn.MSELoss(reduction="mean")
        else:
            self.value_net = None
            self.value_net_optimizer = None
        assert (ppo_epsilon >= 0) and (
            ppo_epsilon <= 1
        ), "ppo_epsilon has to be in [0, 1]"
        self.step = 0
        self.traj_buffer = []
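
# A minimal sketch (plain PyTorch, not ReAgent's API) of the clipped surrogate
# objective that ppo_epsilon and entropy_weight above parameterize.
import torch

def ppo_loss(new_log_probs, old_log_probs, advantages, entropy,
             ppo_epsilon=0.2, entropy_weight=0.0):
    ratio = torch.exp(new_log_probs - old_log_probs.detach())
    clipped = ratio.clamp(1.0 - ppo_epsilon, 1.0 + ppo_epsilon)
    surrogate = torch.min(ratio * advantages, clipped * advantages)
    return -(surrogate.mean() + entropy_weight * entropy.mean())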
Example #6
 def test_linear_reward_parametric_reward(self):
     """
     Reward at each step is a linear function of state and action.
     However, we can only observe aggregated reward at the last step
     """
     state_dim = 10
     action_dim = 2
     seq_len = 5
     batch_size = 512
     num_batches = 10000
     sizes = [256, 128]
     activations = ["relu", "relu"]
     last_layer_activation = "linear"
     reward_net = SingleStepSyntheticRewardNet(
         state_dim=state_dim,
         action_dim=action_dim,
         sizes=sizes,
         activations=activations,
         last_layer_activation=last_layer_activation,
     )
     optimizer = Optimizer__Union(SGD=classes["SGD"]())
     trainer = RewardNetTrainer(reward_net, optimizer)
     trainer.set_reporter(
         RewardNetworkReporter(
             trainer.loss_type,
             str(reward_net),
         )
     )
     weight, data = create_data(
         state_dim, action_dim, seq_len, batch_size, num_batches
     )
     threshold = 0.1
     avg_eval_loss = train_and_eval(trainer, data)
     assert avg_eval_loss < threshold
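
# A hedged sketch of the data-generating process the test describes (create_data
# itself is not shown here): per-step reward is linear in [state; action], but only
# the sum over the sequence is observed.
import torch

def make_linear_reward_batch(state_dim, action_dim, seq_len, batch_size, weight):
    states = torch.randn(seq_len, batch_size, state_dim)
    actions = torch.randn(seq_len, batch_size, action_dim)
    step_reward = torch.cat((states, actions), dim=2) @ weight  # [seq_len, batch_size]
    observed_return = step_reward.sum(dim=0)  # only the aggregate is visible
    return states, actions, observed_return

linear_weight = torch.randn(10 + 2)  # state_dim + action_dim, matching the test above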
Example #7
    def test_ngram_fc_parametric_reward(self):
        """
        Reward at each step is a linear function of states and actions in a
        context window around the step.

        However, we can only observe aggregated reward at the last step
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 5000
        sizes = [256, 128]
        activations = ["relu", "relu"]
        last_layer_activation = "linear"
        reward_net = synthetic_reward.NGramSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=sizes,
            activations=activations,
            last_layer_activation=last_layer_activation,
            context_size=3,
        )
        optimizer = Optimizer__Union(Adam=classes["Adam"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            ))
        weight, data = create_data(state_dim, action_dim, seq_len, batch_size,
                                   num_batches)
        threshold = 0.2
        avg_eval_loss = train_and_eval(trainer, data)
        assert avg_eval_loss < threshold
Example #8
    def test_linear_reward_parametric_reward(self):
        """
        Reward at each step is a linear function of state and action.
        However, we can only observe aggregated reward at the last step
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 10000
        sizes = [256, 128]
        activations = ["relu", "relu"]
        last_layer_activation = "linear"
        reward_net = SingleStepSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=sizes,
            activations=activations,
            last_layer_activation=last_layer_activation,
        )
        optimizer = Optimizer__Union(SGD=classes["SGD"]())
        trainer = RewardNetTrainer(reward_net, optimizer)

        weight, data_generator = create_data(state_dim, action_dim, seq_len,
                                             batch_size, num_batches)
        threshold = 0.1
        reach_threshold = False
        for batch in data_generator():
            loss = trainer.train(batch)
            if loss < threshold:
                reach_threshold = True
                break

        assert reach_threshold, f"last loss={loss}"
Example #9
def create_trainer(seq2slate_net, learning_method, batch_size, learning_rate,
                   device):
    use_gpu = device != torch.device("cpu")
    if learning_method == ON_POLICY:
        seq2slate_params = Seq2SlateParameters(
            on_policy=True,
            learning_method=LearningMethod.REINFORCEMENT_LEARNING)
        trainer_cls = Seq2SlateTrainer
    elif learning_method == SIMULATION:
        temp_reward_model_path = tempfile.mkstemp(suffix=".pt")[1]
        reward_model = torch.jit.script(TSPRewardModel())
        torch.jit.save(reward_model, temp_reward_model_path)
        seq2slate_params = Seq2SlateParameters(
            on_policy=True,
            learning_method=LearningMethod.SIMULATION,
            simulation=SimulationParameters(
                reward_name_weight={"tour_length": 1.0},
                reward_name_path={"tour_length": temp_reward_model_path},
            ),
        )
        trainer_cls = Seq2SlateSimulationTrainer
    else:
        raise NotImplementedError(f"Unknown learning method: {learning_method}")

    param_dict = {
        "seq2slate_net": seq2slate_net,
        "minibatch_size": batch_size,
        "parameters": seq2slate_params,
        "policy_optimizer": Optimizer__Union.default(lr=learning_rate),
        "use_gpu": use_gpu,
        "print_interval": 100,
    }
    return trainer_cls(**param_dict)
Example #10
    def __init__(
        self,
        q_network,
        q_network_target,
        metrics_to_score=None,
        reward_network=None,
        q_network_cpe=None,
        q_network_cpe_target=None,
        loss_reporter=None,
        use_gpu: bool = False,
        actions: List[str] = field(default_factory=list),  # noqa: B008
        rl: RLParameters = field(default_factory=RLParameters),  # noqa: B008
        double_q_learning: bool = True,
        num_atoms: int = 51,
        minibatch_size: int = 1024,
        minibatches_per_step: int = 1,
        optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        cpe_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        evaluation: EvaluationParameters = field(  # noqa: B008
            default_factory=EvaluationParameters),
    ) -> None:
        super().__init__(
            rl,
            use_gpu=use_gpu,
            metrics_to_score=metrics_to_score,
            actions=actions,
            evaluation_parameters=evaluation,
            loss_reporter=loss_reporter,
        )

        self.double_q_learning = double_q_learning
        self.minibatch_size = minibatch_size
        self.minibatches_per_step = minibatches_per_step
        self._actions = actions

        self.q_network = q_network
        self.q_network_target = q_network_target
        self.q_network_optimizer = optimizer.make_optimizer(
            self.q_network.parameters())

        self.num_atoms = num_atoms
        self.quantiles = (
            (0.5 + torch.arange(self.num_atoms, device=self.device).float()) /
            float(self.num_atoms)).view(1, -1)

        self._initialize_cpe(reward_network,
                             q_network_cpe,
                             q_network_cpe_target,
                             optimizer=cpe_optimizer)

        self.reward_boosts = torch.zeros([1, len(self._actions)],
                                         device=self.device)
        if rl.reward_boost is not None:
            # pyre-fixme[16]: Optional type has no attribute `keys`.
            for k in rl.reward_boost.keys():
                i = self._actions.index(k)
                # pyre-fixme[16]: Optional type has no attribute `__getitem__`.
                self.reward_boosts[0, i] = rl.reward_boost[k]
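
# A small illustration of the quantile midpoints computed above: (0.5 + arange(N)) / N
# places one point at the center of each of the N equal-probability bins.
import torch

num_atoms = 4
quantiles = ((0.5 + torch.arange(num_atoms).float()) / float(num_atoms)).view(1, -1)
# tensor([[0.1250, 0.3750, 0.6250, 0.8750]])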
Example #11
    def __init__(
        self,
        q_network,
        q_network_target,
        use_gpu: bool = False,
        # Start SlateQTrainerParameters
        rl: rlp.RLParameters = field(  # noqa: B008
            default_factory=lambda: rlp.RLParameters(maxq_learning=False)),
        optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        single_selection: bool = True,
        minibatch_size: int = 1024,
        evaluation: rlp.EvaluationParameters = field(  # noqa: B008
            default_factory=lambda: rlp.EvaluationParameters(
                calc_cpe_in_training=False)),
    ) -> None:
        super().__init__(rl, use_gpu=use_gpu)
        self.minibatches_per_step = 1
        self.minibatch_size = minibatch_size
        self.single_selection = single_selection

        self.q_network = q_network
        self.q_network_target = q_network_target
        self.q_network_optimizer = optimizer.make_optimizer(
            self.q_network.parameters())
Example #12
    def test_lstm_parametric_reward(self):
        """
        Reward at each step is a linear function of states and actions in a
        context window around the step.

        However, we can only observe aggregated reward at the last step
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 5000
        last_layer_activation = "linear"
        reward_net = synthetic_reward.SequenceSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            lstm_hidden_size=128,
            lstm_num_layers=2,
            lstm_bidirectional=True,
            last_layer_activation=last_layer_activation,
        )
        optimizer = Optimizer__Union(Adam=classes["Adam"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            ))
        weight, data = create_sequence_data(state_dim, action_dim, seq_len,
                                            batch_size, num_batches)
        threshold = 0.2
        avg_eval_loss = train_and_eval(trainer, data)
        assert avg_eval_loss < threshold
Example #13
    def __init__(
        self,
        actor_network,
        q1_network,
        q2_network=None,
        use_gpu: bool = False,
        # Start TD3TrainerParameters
        rl: RLParameters = field(default_factory=RLParameters),  # noqa: B008
        q_network_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        actor_network_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        minibatch_size: int = 64,
        use_2_q_functions: bool = True,
        noise_variance: float = 0.2,
        noise_clip: float = 0.5,
        delayed_policy_update: int = 2,
        minibatches_per_step: int = 1,
    ) -> None:
        """
        Args: TODO: fill in
        """
        super().__init__(rl, use_gpu=use_gpu)

        self.minibatch_size = minibatch_size
        self.minibatches_per_step = minibatches_per_step or 1

        self.q1_network = q1_network
        self.q1_network_target = copy.deepcopy(self.q1_network)
        self.q1_network_optimizer = q_network_optimizer.make_optimizer(
            q1_network.parameters())

        self.q2_network = q2_network
        if self.q2_network is not None:
            self.q2_network_target = copy.deepcopy(self.q2_network)
            self.q2_network_optimizer = q_network_optimizer.make_optimizer(
                q2_network.parameters())

        self.actor_network = actor_network
        self.actor_network_target = copy.deepcopy(self.actor_network)
        self.actor_network_optimizer = actor_network_optimizer.make_optimizer(
            actor_network.parameters())

        self.noise_variance = noise_variance
        self.noise_clip_range = (-noise_clip, noise_clip)
        self.delayed_policy_update = delayed_policy_update
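
# A sketch (plain PyTorch, not ReAgent's API) of how noise_variance and
# noise_clip_range are conventionally used for TD3 target policy smoothing.
import torch

def smoothed_target_action(actor_network_target, next_state,
                           noise_variance=0.2, noise_clip_range=(-0.5, 0.5)):
    action = actor_network_target(next_state)
    noise = (torch.randn_like(action) * noise_variance).clamp(*noise_clip_range)
    return (action + noise).clamp(-1.0, 1.0)  # assumes actions normalized to [-1, 1]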
Example #14
def create_trainer(seq2slate_net, batch_size, learning_rate, device,
                   on_policy):
    use_gpu = device != torch.device("cpu")
    return Seq2SlateTrainer(
        seq2slate_net=seq2slate_net,
        minibatch_size=batch_size,
        parameters=Seq2SlateParameters(on_policy=on_policy),
        policy_optimizer=Optimizer__Union.default(lr=learning_rate),
        use_gpu=use_gpu,
        print_interval=100,
    )
Example #15
    def __init__(
        self,
        q_network,
        q_network_target,
        metrics_to_score=None,
        loss_reporter=None,
        use_gpu: bool = False,
        actions: List[str] = field(default_factory=list),  # noqa: B008
        rl: RLParameters = field(default_factory=RLParameters),  # noqa: B008
        double_q_learning: bool = True,
        minibatch_size: int = 1024,
        minibatches_per_step: int = 1,
        num_atoms: int = 51,
        qmin: float = -100,
        qmax: float = 200,
        optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        evaluation: EvaluationParameters = field(  # noqa: B008
            default_factory=EvaluationParameters),
    ) -> None:
        RLTrainer.__init__(
            self,
            rl,
            use_gpu=use_gpu,
            metrics_to_score=metrics_to_score,
            actions=actions,
            loss_reporter=loss_reporter,
        )

        self.double_q_learning = double_q_learning
        self.minibatch_size = minibatch_size
        self.minibatches_per_step = minibatches_per_step
        self._actions = actions
        self.q_network = q_network
        self.q_network_target = q_network_target
        self.q_network_optimizer = optimizer.make_optimizer(
            q_network.parameters())
        self.qmin = qmin
        self.qmax = qmax
        self.num_atoms = num_atoms
        self.support = torch.linspace(self.qmin,
                                      self.qmax,
                                      self.num_atoms,
                                      device=self.device)
        self.scale_support = (self.qmax - self.qmin) / (self.num_atoms - 1.0)

        self.reward_boosts = torch.zeros([1, len(self._actions)],
                                         device=self.device)
        if rl.reward_boost is not None:
            # pyre-fixme[16]: Optional type has no attribute `keys`.
            for k in rl.reward_boost.keys():
                i = self._actions.index(k)
                # pyre-fixme[16]: Optional type has no attribute `__getitem__`.
                self.reward_boosts[0, i] = rl.reward_boost[k]
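
# A small illustration of the categorical (C51) support built above: num_atoms evenly
# spaced values from qmin to qmax, with scale_support as the spacing between atoms.
import torch

qmin, qmax, num_atoms = -100.0, 200.0, 51
support = torch.linspace(qmin, qmax, num_atoms)    # support[0] == -100.0, support[-1] == 200.0
scale_support = (qmax - qmin) / (num_atoms - 1.0)  # 6.0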
Example #16
 def __init__(
     self,
     reward_net: ModelBase,
     use_gpu: bool = False,
     minibatch_size: int = 1024,
     optimizer: Optimizer__Union = field(  # noqa: B008
         default_factory=Optimizer__Union.default),
 ) -> None:
     self.reward_net = reward_net
     self.use_gpu = use_gpu
     self.minibatch_size = minibatch_size
     self.minibatch = 0
     self.loss_fn = torch.nn.MSELoss(reduction="mean")
     self.opt = optimizer.make_optimizer(self.reward_net.parameters())
Example #17
def create_trainer(
    seq2slate_net,
    learning_rate,
    seq2slate_params,
    policy_gradient_interval,
):
    return Seq2SlateTrainer(
        seq2slate_net=seq2slate_net,
        params=seq2slate_params,
        policy_optimizer=Optimizer__Union(SGD=classes["SGD"](
            lr=learning_rate)),
        policy_gradient_interval=policy_gradient_interval,
        print_interval=1,
    )
Example #18
    def test_ngram_conv_net_parametric_reward(self):
        """
        Reward at each step is a linear function of states and actions in a
        context window around the step.

        However, we can only observe aggregated reward at the last step
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 5000
        sizes = [128, 64]
        activations = ["relu", "relu"]
        last_layer_activation = "linear"
        conv_net_params = rlp.ConvNetParameters(
            conv_dims=[128],
            conv_height_kernels=[1],
            pool_types=["max"],
            pool_kernel_sizes=[1],
        )
        conv_net = synthetic_reward.NGramConvolutionalNetwork(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=sizes,
            activations=activations,
            last_layer_activation=last_layer_activation,
            context_size=3,
            conv_net_params=conv_net_params,
        )

        reward_net = synthetic_reward.NGramSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            context_size=3,
            net=conv_net,
        )
        optimizer = Optimizer__Union(Adam=classes["Adam"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            ))
        weight, data = create_sequence_data(state_dim, action_dim, seq_len,
                                            batch_size, num_batches)
        threshold = 0.2
        avg_eval_loss = train_and_eval(trainer, data)
        assert avg_eval_loss < threshold
Example #19
 def __init__(
     self,
     imitator,
     use_gpu: bool = False,
     rl: RLParameters = field(default_factory=RLParameters),  # noqa: B008
     minibatch_size: int = 1024,
     minibatches_per_step: int = 1,
     optimizer: Optimizer__Union = field(  # noqa: B008
         default_factory=Optimizer__Union.default),
 ) -> None:
     super().__init__(rl, use_gpu=use_gpu)
     self.minibatch_size = minibatch_size
     self.minibatches_per_step = minibatches_per_step or 1
     self.imitator = imitator
     self.imitator_optimizer = optimizer.make_optimizer(
         imitator.parameters())
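
# A minimal sketch (plain PyTorch, not ReAgent's API) of the behavior-cloning update
# such an imitator trainer typically performs: fit the imitator's action logits to
# the logged discrete actions with cross-entropy.
import torch
import torch.nn.functional as F

def imitator_step(imitator, imitator_optimizer, states, logged_action_indices):
    logits = imitator(states)  # [batch, num_actions]
    loss = F.cross_entropy(logits, logged_action_indices)
    imitator_optimizer.zero_grad()
    loss.backward()
    imitator_optimizer.step()
    return loss.detach()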
Example #20
 def __init__(
     self,
     reward_net: ModelBase,
     use_gpu: bool = False,
     minibatch_size: int = 1024,
     optimizer: Optimizer__Union = field(  # noqa: B008
         default_factory=Optimizer__Union.default),
     loss_type: LossFunction = LossFunction.MSE,
 ) -> None:
     self.reward_net = reward_net
     self.use_gpu = use_gpu
     self.minibatch_size = minibatch_size
     self.minibatch = 0
     self.opt = optimizer.make_optimizer(self.reward_net.parameters())
     self.loss_type = loss_type
     self.loss_fn = _get_loss_function(loss_type)
Example #21
    def test_transformer_parametric_reward(self):
        """
        Reward at each step is a linear function of states and actions in a
        context window around the step.

        However, we can only observe aggregated reward at the last step
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 10000
        d_model = 64
        nhead = 8
        num_encoder_layers = 1
        dim_feedforward = 64
        last_layer_activation = "linear"
        max_len = seq_len + 1
        reward_net = SyntheticRewardNet(
            TransformerSyntheticRewardNet(
                state_dim=state_dim,
                action_dim=action_dim,
                d_model=d_model,
                nhead=nhead,
                num_encoder_layers=num_encoder_layers,
                dim_feedforward=dim_feedforward,
                dropout=0.0,
                activation="relu",
                last_layer_activation=last_layer_activation,
                layer_norm_eps=1e-5,
                max_len=max_len,
            ))
        optimizer = Optimizer__Union(Adam=classes["Adam"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            ))
        weight, data = create_sequence_data(state_dim, action_dim, seq_len,
                                            batch_size, num_batches)

        threshold = 0.25
        avg_eval_loss = train_and_eval(trainer, data)
        assert (avg_eval_loss <
                threshold), "loss = {:.4f} larger than threshold {}".format(
                    avg_eval_loss, threshold)
Example #22
def create_trainer(
    seq2slate_net,
    batch_size,
    learning_rate,
    device,
    seq2slate_params,
    policy_gradient_interval,
):
    use_gpu = device != torch.device("cpu")
    return Seq2SlateTrainer(
        seq2slate_net=seq2slate_net,
        minibatch_size=batch_size,
        parameters=seq2slate_params,
        policy_optimizer=Optimizer__Union(SGD=classes["SGD"](lr=learning_rate)),
        use_gpu=use_gpu,
        policy_gradient_interval=policy_gradient_interval,
        print_interval=1,
    )
Example #23
 def __init__(
     self,
     seq2slate_net: Seq2SlateTransformerNet,
     parameters: Seq2SlateParameters,
     minibatch_size: int,
     use_gpu: bool = False,
     policy_optimizer: Optimizer__Union = field(  # noqa: B008
         default_factory=Optimizer__Union.default
     ),
 ) -> None:
     self.parameters = parameters
     self.use_gpu = use_gpu
     self.seq2slate_net = seq2slate_net
     self.minibatch_size = minibatch_size
     self.minibatch = 0
     self.optimizer = policy_optimizer.make_optimizer(
         self.seq2slate_net.parameters()
     )
     self.kl_div_loss = nn.KLDivLoss(reduction="batchmean")
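
# A usage note on nn.KLDivLoss(reduction="batchmean") as constructed above: the first
# argument must be log-probabilities and the second probabilities. A hedged sketch with
# generic ranking scores (not the Seq2Slate forward pass):
import torch
import torch.nn as nn
import torch.nn.functional as F

kl_div_loss = nn.KLDivLoss(reduction="batchmean")
model_scores = torch.randn(8, 5)  # e.g. scores over 5 candidate items
target_probs = torch.softmax(torch.randn(8, 5), dim=1)
loss = kl_div_loss(F.log_softmax(model_scores, dim=1), target_probs)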
Example #24
 def __init__(
     self,
     reward_net: ModelBase,
     use_gpu: bool = False,
     optimizer: Optimizer__Union = field(  # noqa: B008
         default_factory=Optimizer__Union.default),
     loss_type: LossFunction = LossFunction.MSE,
     reward_ignore_threshold: Optional[float] = None,
     weighted_by_inverse_propensity: bool = False,
 ) -> None:
     self.reward_net = reward_net
     self.use_gpu = use_gpu
     self.minibatch = 0
     self.opt = optimizer.make_optimizer(self.reward_net.parameters())
     self.loss_type = loss_type
     self.reward_ignore_threshold = reward_ignore_threshold
     self.weighted_by_inverse_propensity = weighted_by_inverse_propensity
     self.loss_fn = _get_loss_function(loss_type, reward_ignore_threshold,
                                       weighted_by_inverse_propensity)
Example #25
 def __init__(
     self,
     seq2slate_net: Seq2SlateTransformerNet,
     minibatch_size: int = 1024,
     loss_reporter=None,
     use_gpu: bool = False,
     policy_optimizer: Optimizer__Union = field(  # noqa: B008
         default_factory=Optimizer__Union.default),
 ) -> None:
     self.loss_reporter = loss_reporter
     self.use_gpu = use_gpu
     self.seq2slate_net = seq2slate_net
     self.minibatch_size = minibatch_size
     self.minibatch = 0
     self.optimizer = policy_optimizer.make_optimizer_scheduler(
         self.seq2slate_net.parameters())["optimizer"]
     self.log_softmax = nn.LogSoftmax(dim=1)
     self.kl_loss = nn.KLDivLoss(reduction="batchmean")
     if self.loss_reporter is None:
         self.loss_reporter = NoOpLossReporter()
Example #26
 def __init__(
     self,
     seq2slate_net: Seq2SlateTransformerNet,
     parameters: Seq2SlateParameters,
     minibatch_size: int,
     use_gpu: bool = False,
     policy_optimizer: Optimizer__Union = field(  # noqa: B008
         default_factory=Optimizer__Union.default),
     print_interval: int = 100,
 ) -> None:
     self.parameters = parameters
     self.use_gpu = use_gpu
     self.print_interval = print_interval
     self.seq2slate_net = seq2slate_net
     self.minibatch_size = minibatch_size
     self.minibatch = 0
     self.optimizer = policy_optimizer.make_optimizer(
         self.seq2slate_net.parameters())
     # TODO: T62269969 add baseline_net in training
     self.kl_div_loss = nn.KLDivLoss(reduction="none")
Example #27
    def _test_linear_reward_parametric_reward(
            self, ground_truth_reward_from_multiple_steps=False):
        """
        Reward at each step is a linear function of present state and action.
        However, we can only observe aggregated reward at the last step

        This model will fail to learn when ground-truth reward is a function of
        multiple steps' states and actions.
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 5000
        sizes = [256, 128]
        activations = ["relu", "relu"]
        last_layer_activation = "linear"
        reward_net = SyntheticRewardNet(
            SingleStepSyntheticRewardNet(
                state_dim=state_dim,
                action_dim=action_dim,
                sizes=sizes,
                activations=activations,
                last_layer_activation=last_layer_activation,
            ))
        optimizer = Optimizer__Union(Adam=classes["Adam"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            ))
        if ground_truth_reward_from_multiple_steps:
            weight, data = create_sequence_data(state_dim, action_dim, seq_len,
                                                batch_size, num_batches)
        else:
            weight, data = create_data(state_dim, action_dim, seq_len,
                                       batch_size, num_batches)
        avg_eval_loss = train_and_eval(trainer, data)
        return avg_eval_loss
Example #28
    def __init__(
        self,
        actor_network,
        q1_network,
        q2_network=None,
        value_network=None,
        use_gpu: bool = False,
        # Start SACTrainerParameters
        rl: RLParameters = field(default_factory=RLParameters),  # noqa: B008
        q_network_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        value_network_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        actor_network_optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        alpha_optimizer: Optional[Optimizer__Union] = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        minibatch_size: int = 1024,
        entropy_temperature: float = 0.01,
        logged_action_uniform_prior: bool = True,
        target_entropy: float = -1.0,
        action_embedding_kld_weight: Optional[float] = None,
        apply_kld_on_mean: bool = False,
        action_embedding_mean: Optional[List[float]] = None,
        action_embedding_variance: Optional[List[float]] = None,
    ) -> None:
        """
        Args:
            actor_network: states -> actions, trained to maximize soft value,
                which is value + policy entropy.
            q1_network: states, action -> q-value
            q2_network (optional): double q-learning to stabilize training
                from overestimation bias
            value_network (optional): states -> value of state under actor
            # alpha in the paper; controlling explore & exploit
            # TODO: finish
        """
        super().__init__(rl, use_gpu=use_gpu)

        self.minibatch_size = minibatch_size
        self.minibatches_per_step = 1

        self.q1_network = q1_network
        self.q1_network_optimizer = q_network_optimizer.make_optimizer(
            q1_network.parameters())

        self.q2_network = q2_network
        if self.q2_network is not None:
            self.q2_network_optimizer = q_network_optimizer.make_optimizer(
                q2_network.parameters())

        self.value_network = value_network
        if self.value_network is not None:
            self.value_network_optimizer = value_network_optimizer.make_optimizer(
                value_network.parameters())
            self.value_network_target = copy.deepcopy(self.value_network)
        else:
            self.q1_network_target = copy.deepcopy(self.q1_network)
            self.q2_network_target = copy.deepcopy(self.q2_network)

        self.actor_network = actor_network
        self.actor_network_optimizer = actor_network_optimizer.make_optimizer(
            actor_network.parameters())
        self.entropy_temperature = entropy_temperature

        self.alpha_optimizer = None
        device = "cuda" if use_gpu else "cpu"
        if alpha_optimizer is not None:
            self.target_entropy = target_entropy
            self.log_alpha = torch.tensor([np.log(self.entropy_temperature)],
                                          requires_grad=True,
                                          device=device)
            self.alpha_optimizer = alpha_optimizer.make_optimizer(
                [self.log_alpha])

        self.logged_action_uniform_prior = logged_action_uniform_prior

        self.add_kld_to_loss = bool(action_embedding_kld_weight)
        self.apply_kld_on_mean = apply_kld_on_mean

        if self.add_kld_to_loss:
            self.kld_weight = action_embedding_kld_weight
            self.action_emb_mean = torch.tensor(action_embedding_mean,
                                                device=device)
            self.action_emb_variance = torch.tensor(action_embedding_variance,
                                                    device=device)
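
# A sketch (plain PyTorch, not ReAgent's API) of the standard entropy-temperature
# update that log_alpha, target_entropy, and alpha_optimizer above enable.
import torch

def alpha_update(log_alpha, alpha_optimizer, action_log_prob, target_entropy):
    alpha_loss = -(log_alpha * (action_log_prob + target_entropy).detach()).mean()
    alpha_optimizer.zero_grad()
    alpha_loss.backward()
    alpha_optimizer.step()
    return log_alpha.exp().detach()  # new entropy temperature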
Example #29
    def __init__(
        self,
        q_network,
        q_network_target,
        reward_network,
        q_network_cpe=None,
        q_network_cpe_target=None,
        metrics_to_score=None,
        imitator=None,
        loss_reporter=None,
        use_gpu: bool = False,
        actions: List[str] = field(default_factory=list),  # noqa: B008
        rl: RLParameters = field(default_factory=RLParameters),  # noqa: B008
        double_q_learning: bool = True,
        bcq: Optional[BCQConfig] = None,
        minibatch_size: int = 1024,
        minibatches_per_step: int = 1,
        optimizer: Optimizer__Union = field(  # noqa: B008
            default_factory=Optimizer__Union.default),
        evaluation: EvaluationParameters = field(  # noqa: B008
            default_factory=EvaluationParameters),
    ) -> None:
        super().__init__(
            rl,
            use_gpu=use_gpu,
            metrics_to_score=metrics_to_score,
            actions=actions,
            evaluation_parameters=evaluation,
            loss_reporter=loss_reporter,
        )
        assert self._actions is not None, "Discrete-action DQN needs action names"
        self.double_q_learning = double_q_learning
        self.minibatch_size = minibatch_size
        self.minibatches_per_step = minibatches_per_step or 1

        self.q_network = q_network
        self.q_network_target = q_network_target
        self.q_network_optimizer = optimizer.make_optimizer(
            q_network.parameters())

        self._initialize_cpe(reward_network,
                             q_network_cpe,
                             q_network_cpe_target,
                             optimizer=optimizer)

        # pyre-fixme[6]: Expected `Sized` for 1st param but got `Optional[List[str]]`.
        self.reward_boosts = torch.zeros([1, len(self._actions)],
                                         device=self.device)
        if rl.reward_boost is not None:
            # pyre-fixme[16]: `Optional` has no attribute `keys`.
            for k in rl.reward_boost.keys():
                # pyre-fixme[16]: `Optional` has no attribute `index`.
                i = self._actions.index(k)
                # pyre-fixme[16]: `Optional` has no attribute `__getitem__`.
                self.reward_boosts[0, i] = rl.reward_boost[k]

        # Batch constrained q-learning
        self.bcq = bcq is not None
        if self.bcq:
            assert bcq is not None
            self.bcq_drop_threshold = bcq.drop_threshold
            self.bcq_imitator = imitator
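
# A sketch (plain PyTorch, not ReAgent's API) of how a BCQ drop threshold is
# conventionally applied: actions the imitator deems too unlikely relative to its
# most likely action are masked out before taking the argmax over Q-values.
import torch

def bcq_masked_q(q_values, imitator_logits, drop_threshold):
    probs = torch.softmax(imitator_logits, dim=1)
    mask = probs / probs.max(dim=1, keepdim=True).values >= drop_threshold
    return torch.where(mask, q_values, torch.full_like(q_values, float("-inf")))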