Example #1
    def test_linear_reward_parametric_reward(self):
        """
        Reward at each step is a linear function of state and action.
        However, we can only observe the aggregated reward at the last step
        (a data-generation sketch follows this example).
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 10000
        sizes = [256, 128]
        activations = ["relu", "relu"]
        last_layer_activation = "linear"
        reward_net = SingleStepSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=sizes,
            activations=activations,
            last_layer_activation=last_layer_activation,
        )
        optimizer = Optimizer__Union(SGD=classes["SGD"]())
        trainer = RewardNetTrainer(reward_net, optimizer)

        weight, data_generator = create_data(state_dim, action_dim, seq_len,
                                             batch_size, num_batches)
        threshold = 0.1
        reach_threshold = False
        for batch in data_generator():
            loss = trainer.train(batch)
            if loss < threshold:
                reach_threshold = True
                break

        assert reach_threshold, f"last loss={loss}"
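
The helpers create_data and train_and_eval are not shown in these snippets. As a rough illustration of the setup the docstring describes, here is a minimal sketch of single-step linear-reward data in which only the aggregated reward is observable. The function name make_synthetic_batch, the weight handling, and the batch layout are assumptions for illustration, not ReAgent's actual helpers.

import torch
import torch.nn.functional as F


def make_synthetic_batch(state_dim, action_dim, seq_len, batch_size, weight=None):
    # Hypothetical sketch: the per-step reward is linear in (state, action),
    # but only the sum over the whole sequence is exposed to the trainer.
    if weight is None:
        weight = torch.randn(state_dim + action_dim)
    states = torch.randn(seq_len, batch_size, state_dim)
    action_idx = torch.randint(action_dim, (seq_len, batch_size))
    actions = F.one_hot(action_idx, action_dim).float()
    features = torch.cat((states, actions), dim=-1)  # (seq_len, batch, state + action)
    step_rewards = features @ weight  # (seq_len, batch); never observed directly
    aggregated_reward = step_rewards.sum(dim=0).unsqueeze(1)  # (batch, 1); what training sees
    return weight, {"state": states, "action": actions, "reward": aggregated_reward}
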
Example #2
    def test_linear_reward_parametric_reward(self):
        """
        Reward at each step is a linear function of state and action.
        However, we can only observe the aggregated reward at the last step
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 10000
        sizes = [256, 128]
        activations = ["relu", "relu"]
        last_layer_activation = "linear"
        reward_net = SingleStepSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=sizes,
            activations=activations,
            last_layer_activation=last_layer_activation,
        )
        optimizer = Optimizer__Union(SGD=classes["SGD"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            )
        )
        weight, data = create_data(
            state_dim, action_dim, seq_len, batch_size, num_batches
        )
        threshold = 0.1
        avg_eval_loss = train_and_eval(trainer, data)
        assert avg_eval_loss < threshold
Example #3
    def test_lstm_parametric_reward(self):
        """
        Reward at each step is a linear function of states and actions in a
        context window around the step.

        However, we can only observe the aggregated reward at the last step
        (a sketch of such a windowed target follows this example).
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 5000
        last_layer_activation = "linear"
        reward_net = synthetic_reward.SequenceSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            lstm_hidden_size=128,
            lstm_num_layers=2,
            lstm_bidirectional=True,
            last_layer_activation=last_layer_activation,
        )
        optimizer = Optimizer__Union(Adam=classes["Adam"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            ))
        weight, data = create_sequence_data(state_dim, action_dim, seq_len,
                                            batch_size, num_batches)
        threshold = 0.2
        avg_eval_loss = train_and_eval(trainer, data)
        assert avg_eval_loss < threshold
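
The sequence tests rely on create_sequence_data, which is also not shown here. Its docstring describes a ground-truth reward that is linear in a context window of states and actions rather than in a single step; the sketch below is one way such a windowed target could be computed. The name windowed_step_rewards and the tensor shapes are assumptions and may differ from the real helper.

import torch
import torch.nn.functional as F


def windowed_step_rewards(features, weight, context_size=3):
    # features: (seq_len, batch, state_dim + action_dim)
    # weight:   (context_size * (state_dim + action_dim),)
    # Hypothetical sketch: each step's reward is linear in the features of a
    # context window centered on that step, zero-padded at the sequence edges.
    seq_len, batch_size, _ = features.shape
    half = context_size // 2
    padded = F.pad(features, (0, 0, 0, 0, half, half))
    rewards = []
    for t in range(seq_len):
        window = padded[t : t + context_size]  # (context, batch, feat)
        flat = window.permute(1, 0, 2).reshape(batch_size, -1)
        rewards.append(flat @ weight)  # (batch,)
    # Only the sum of these per-step rewards would be observable during training.
    return torch.stack(rewards)  # (seq_len, batch)
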
Example #4
    def test_ngram_fc_parametric_reward(self):
        """
        Reward at each step is a linear function of states and actions in a
        context window around the step.

        However, we can only observe the aggregated reward at the last step
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 5000
        sizes = [256, 128]
        activations = ["relu", "relu"]
        last_layer_activation = "linear"
        reward_net = synthetic_reward.NGramSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=sizes,
            activations=activations,
            last_layer_activation=last_layer_activation,
            context_size=3,
        )
        optimizer = Optimizer__Union(Adam=classes["Adam"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            ))
        weight, data = create_data(state_dim, action_dim, seq_len, batch_size,
                                   num_batches)
        threshold = 0.2
        avg_eval_loss = train_and_eval(trainer, data)
        assert avg_eval_loss < threshold
Example #5
def create_trainer(
    seq2slate_net,
    learning_rate,
    seq2slate_params,
    policy_gradient_interval,
):
    return Seq2SlateTrainer(
        seq2slate_net=seq2slate_net,
        params=seq2slate_params,
        policy_optimizer=Optimizer__Union(SGD=classes["SGD"](
            lr=learning_rate)),
        policy_gradient_interval=policy_gradient_interval,
        print_interval=1,
    )
Example #6
    def test_ngram_conv_net_parametric_reward(self):
        """
        Reward at each step is a linear function of states and actions in a
        context window around the step.

        However, we can only observe the aggregated reward at the last step
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 5000
        sizes = [128, 64]
        activations = ["relu", "relu"]
        last_layer_activation = "linear"
        conv_net_params = rlp.ConvNetParameters(
            conv_dims=[128],
            conv_height_kernels=[1],
            pool_types=["max"],
            pool_kernel_sizes=[1],
        )
        conv_net = synthetic_reward.NGramConvolutionalNetwork(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=sizes,
            activations=activations,
            last_layer_activation=last_layer_activation,
            context_size=3,
            conv_net_params=conv_net_params,
        )

        reward_net = synthetic_reward.NGramSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            context_size=3,
            net=conv_net,
        )
        optimizer = Optimizer__Union(Adam=classes["Adam"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            ))
        weight, data = create_sequence_data(state_dim, action_dim, seq_len,
                                            batch_size, num_batches)
        threshold = 0.2
        avg_eval_loss = train_and_eval(trainer, data)
        assert avg_eval_loss < threshold
Example #7
    def test_transformer_parametric_reward(self):
        """
        Reward at each step is a linear function of states and actions in a
        context window around the step.

        However, we can only observe the aggregated reward at the last step
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 10000
        d_model = 64
        nhead = 8
        num_encoder_layers = 1
        dim_feedforward = 64
        last_layer_activation = "linear"
        max_len = seq_len + 1
        reward_net = SyntheticRewardNet(
            TransformerSyntheticRewardNet(
                state_dim=state_dim,
                action_dim=action_dim,
                d_model=d_model,
                nhead=nhead,
                num_encoder_layers=num_encoder_layers,
                dim_feedforward=dim_feedforward,
                dropout=0.0,
                activation="relu",
                last_layer_activation=last_layer_activation,
                layer_norm_eps=1e-5,
                max_len=max_len,
            ))
        optimizer = Optimizer__Union(Adam=classes["Adam"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            ))
        weight, data = create_sequence_data(state_dim, action_dim, seq_len,
                                            batch_size, num_batches)

        threshold = 0.25
        avg_eval_loss = train_and_eval(trainer, data)
        assert (avg_eval_loss <
                threshold), "loss = {:.4f} larger than threshold {}".format(
                    avg_eval_loss, threshold)
Example #8
def create_trainer(
    seq2slate_net,
    batch_size,
    learning_rate,
    device,
    seq2slate_params,
    policy_gradient_interval,
):
    use_gpu = device != torch.device("cpu")
    return Seq2SlateTrainer(
        seq2slate_net=seq2slate_net,
        minibatch_size=batch_size,
        parameters=seq2slate_params,
        policy_optimizer=Optimizer__Union(SGD=classes["SGD"](lr=learning_rate)),
        use_gpu=use_gpu,
        policy_gradient_interval=policy_gradient_interval,
        print_interval=1,
    )
Example #9
    def _test_linear_reward_parametric_reward(
            self, ground_truth_reward_from_multiple_steps=False):
        """
        Reward at each step is a linear function of present state and action.
        However, we can only observe the aggregated reward at the last step.

        This model will fail to learn when the ground-truth reward is a function
        of multiple steps' states and actions (hypothetical test wrappers follow
        this example).
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 5000
        sizes = [256, 128]
        activations = ["relu", "relu"]
        last_layer_activation = "linear"
        reward_net = SyntheticRewardNet(
            SingleStepSyntheticRewardNet(
                state_dim=state_dim,
                action_dim=action_dim,
                sizes=sizes,
                activations=activations,
                last_layer_activation=last_layer_activation,
            ))
        optimizer = Optimizer__Union(Adam=classes["Adam"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            ))
        if ground_truth_reward_from_multiple_steps:
            weight, data = create_sequence_data(state_dim, action_dim, seq_len,
                                                batch_size, num_batches)
        else:
            weight, data = create_data(state_dim, action_dim, seq_len,
                                       batch_size, num_batches)
        avg_eval_loss = train_and_eval(trainer, data)
        return avg_eval_loss
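
This helper returns the evaluation loss instead of asserting on it, so the enclosing test class is presumably expected to wrap it once per mode. The wrappers below are hypothetical; the test names and the thresholds are assumptions chosen to mirror the thresholds used elsewhere in these examples.

    def test_single_step_reward_fits_single_step_ground_truth(self):
        # Ground truth depends only on the current step, so the model can fit it.
        avg_eval_loss = self._test_linear_reward_parametric_reward(
            ground_truth_reward_from_multiple_steps=False
        )
        assert avg_eval_loss < 0.1

    def test_single_step_reward_misfits_multi_step_ground_truth(self):
        # Ground truth mixes several steps' states and actions, which a
        # single-step model cannot represent, so the loss should stay high.
        avg_eval_loss = self._test_linear_reward_parametric_reward(
            ground_truth_reward_from_multiple_steps=True
        )
        assert avg_eval_loss > 0.1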