    def test_lstm_synthetic_reward(self):
        state_dim = 10
        action_dim = 2
        last_layer_activation = "leaky_relu"
        net = SequenceSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            lstm_hidden_size=128,
            lstm_num_layers=2,
            lstm_bidirectional=True,
            last_layer_activation=last_layer_activation,
        )
        reward_net = SyntheticRewardNet(net)
        lstm = reward_net.export_mlp().lstm
        assert lstm.bidirectional
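        # input_size == state_dim + action_dim (10 + 2 = 12): state and action features are concatenated at each step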
        assert lstm.input_size == 12
        assert lstm.hidden_size == 128
        assert lstm.num_layers == 2

        dnn = reward_net.export_mlp().fc_out
        assert dnn.in_features == 128 * 2
        assert dnn.out_features == 1

        output_activation = reward_net.export_mlp().output_activation
        assert output_activation._get_name() == "LeakyReLU"
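A quick sanity check with plain PyTorch (independent of the classes above) of why fc_out takes 128 * 2 inputs: a bidirectional LSTM emits hidden_size features per direction at every step.

import torch

lstm = torch.nn.LSTM(input_size=12, hidden_size=128, num_layers=2, bidirectional=True)
dummy = torch.randn(5, 3, 12)  # (seq_len, batch, state_dim + action_dim)
out, _ = lstm(dummy)
assert out.shape == (5, 3, 128 * 2)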

    def test_ngram_conv_net_synthetic_reward(self):
        state_dim = 10
        action_dim = 2
        sizes = [256, 128]
        activations = ["sigmoid", "relu"]
        last_layer_activation = "leaky_relu"
        context_size = 3

        conv_net_params = rlp.ConvNetParameters(
            conv_dims=[256, 128],
            conv_height_kernels=[1, 1],
            pool_types=["max", "max"],
            pool_kernel_sizes=[1, 1],
        )
        net = NGramConvolutionalNetwork(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=sizes,
            activations=activations,
            last_layer_activation=last_layer_activation,
            context_size=context_size,
            conv_net_params=conv_net_params,
        )

        reward_net = SyntheticRewardNet(net)
        conv_net = reward_net.export_mlp().conv_net

        assert conv_net.conv_dims == [1, 256, 128]
        assert conv_net.conv_height_kernels == [1, 1]
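        # the first convolution's width kernel spans the full feature width: state_dim + action_dim = 12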
        assert conv_net.conv_width_kernels == [12, 1]

        assert conv_net.conv_layers[0].in_channels == 1
        assert conv_net.conv_layers[0].out_channels == 256
        assert conv_net.conv_layers[0].kernel_size == (1, 12)
        assert conv_net.conv_layers[0].stride == (1, 1)
        assert conv_net.conv_layers[1].in_channels == 256
        assert conv_net.conv_layers[1].out_channels == 128
        assert conv_net.conv_layers[1].kernel_size == (1, 1)
        assert conv_net.conv_layers[1].stride == (1, 1)

        dnn = reward_net.export_mlp().conv_net.feed_forward.dnn
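        # flattened conv output feeding the MLP: conv_dims[-1] (128) * context_size (3) = 384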
        assert dnn[0].in_features == 384
        assert dnn[0].out_features == 256
        assert dnn[1]._get_name() == "Sigmoid"
        assert dnn[2].in_features == 256
        assert dnn[2].out_features == 128
        assert dnn[3]._get_name() == "ReLU"
        assert dnn[4].in_features == 128
        assert dnn[4].out_features == 1
        assert dnn[5]._get_name() == "LeakyReLU"
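The n-gram networks stack each step with its neighbors before scoring it, which is where the context_size factor in the layer sizes comes from. A minimal sketch of that context-window expansion (an illustration of the idea only, not ReAgent's actual implementation):

import torch
import torch.nn.functional as F

seq_len, batch_size, feat_dim, context_size = 5, 4, 12, 3
x = torch.randn(seq_len, batch_size, feat_dim)  # concatenated state/action features
pad = (context_size - 1) // 2
padded = F.pad(x, (0, 0, 0, 0, pad, pad))  # zero-pad along the sequence dimension
ngram = torch.cat([padded[i:i + seq_len] for i in range(context_size)], dim=2)
assert ngram.shape == (seq_len, batch_size, context_size * feat_dim)  # 3 * 12 = 36 per step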

    def build_synthetic_reward_network(
        self,
        state_normalization_data: NormalizationData,
        action_normalization_data: Optional[NormalizationData] = None,
        discrete_action_names: Optional[List[str]] = None,
    ) -> ModelBase:
        state_dim = get_num_output_features(
            state_normalization_data.dense_normalization_parameters)

        if not discrete_action_names:
            assert action_normalization_data is not None
            action_dim = get_num_output_features(
                action_normalization_data.dense_normalization_parameters)
        else:
            action_dim = len(discrete_action_names)

        net = NGramConvolutionalNetwork(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=self.sizes,
            activations=self.activations,
            last_layer_activation=self.last_layer_activation,
            context_size=self.context_size,
            conv_net_params=self.conv_net_params,
            use_layer_norm=self.use_layer_norm,
        )
        return SyntheticRewardNet(net)
Example #4
    def build_synthetic_reward_network(
        self,
        state_normalization_data: NormalizationData,
        action_normalization_data: Optional[NormalizationData] = None,
        discrete_action_names: Optional[List[str]] = None,
    ) -> ModelBase:
        state_dim = get_num_output_features(
            state_normalization_data.dense_normalization_parameters)
        if not discrete_action_names:
            assert action_normalization_data is not None
            action_dim = get_num_output_features(
                action_normalization_data.dense_normalization_parameters)
        else:
            action_dim = len(discrete_action_names)

        net = TransformerSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            d_model=self.d_model,
            nhead=self.nhead,
            num_encoder_layers=self.num_encoder_layers,
            dim_feedforward=self.dim_feedforward,
            dropout=self.dropout,
            activation=self.activation,
            last_layer_activation=self.last_layer_activation,
            layer_norm_eps=self.layer_norm_eps,
            max_len=self.max_len,
        )
        return SyntheticRewardNet(net=net)
Example #5
    def test_lstm_parametric_reward(self):
        """
        Reward at each step is a linear function of states and actions in a
        context window around the step.

        However, we can only observe the aggregated reward at the last step.
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 5000
        last_layer_activation = "linear"
        reward_net = SyntheticRewardNet(
            SequenceSyntheticRewardNet(
                state_dim=state_dim,
                action_dim=action_dim,
                lstm_hidden_size=128,
                lstm_num_layers=2,
                lstm_bidirectional=True,
                last_layer_activation=last_layer_activation,
            ))
        optimizer = Optimizer__Union(Adam=classes["Adam"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            ))
        weight, data = create_sequence_data(state_dim, action_dim, seq_len,
                                            batch_size, num_batches)
        threshold = 0.2
        avg_eval_loss = train_and_eval(trainer, data)
        assert avg_eval_loss < threshold
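A sketch of the data-generating process the docstring describes (hypothetical; create_sequence_data itself is not shown in these snippets): the per-step reward is linear in a small window of state/action features, but only the summed reward over the sequence is observed.

import torch
import torch.nn.functional as F

def make_sequence_batch(state_dim, action_dim, seq_len, batch_size, weight):
    state = torch.randn(seq_len, batch_size, state_dim)
    action = torch.randn(seq_len, batch_size, action_dim)
    per_step = torch.cat((state, action), dim=2) @ weight  # (seq_len, batch_size)
    padded = F.pad(per_step, (0, 0, 1, 1))  # pad the sequence dimension
    windowed = padded[:-2] + padded[1:-1] + padded[2:]  # size-3 window around each step
    observed = windowed.sum(dim=0)  # only the aggregated reward is observed
    return state, action, observed

weight = torch.randn(10 + 2)
state, action, reward = make_sequence_batch(10, 2, 5, 512, weight)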

    def test_transformer_synthetic_reward(self):
        state_dim = 10
        action_dim = 2
        d_model = 64
        nhead = 8
        num_encoder_layers = 2
        dim_feedforward = 64
        dropout = 0.0
        activation = "relu"
        last_layer_activation = "leaky_relu"
        layer_norm_eps = 1e-5
        max_len = 10

        net = TransformerSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            activation=activation,
            last_layer_activation=last_layer_activation,
            layer_norm_eps=layer_norm_eps,
            max_len=max_len,
        )

        reward_net = SyntheticRewardNet(net)
        export_net = reward_net.export_mlp()
        transformer = export_net.transformer
        assert export_net.state_dim == state_dim
        assert export_net.action_dim == action_dim
        assert export_net.d_model == d_model
        assert export_net.nhead == nhead
        assert export_net.dim_feedforward == dim_feedforward
        assert export_net.dropout == dropout
        assert export_net.activation == activation
        assert export_net.layer_norm_eps == layer_norm_eps

        assert transformer.num_layers == num_encoder_layers
        dnn_out = export_net.fc_out
        assert dnn_out.in_features == d_model
        assert dnn_out.out_features == 1

        output_activation = export_net.output_activation
        assert output_activation._get_name() == "LeakyReLU"

    def test_ngram_fc_synthetic_reward(self):
        state_dim = 10
        action_dim = 2
        sizes = [256, 128]
        activations = ["sigmoid", "relu"]
        last_layer_activation = "leaky_relu"
        context_size = 3

        net = NGramFullyConnectedNetwork(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=sizes,
            activations=activations,
            last_layer_activation=last_layer_activation,
            context_size=context_size,
        )
        reward_net = SyntheticRewardNet(net)

        dnn = reward_net.export_mlp().fc.dnn
        assert dnn[0].in_features == (state_dim + action_dim) * context_size
        assert dnn[0].out_features == 256
        assert dnn[1]._get_name() == "Sigmoid"
        assert dnn[2].in_features == 256
        assert dnn[2].out_features == 128
        assert dnn[3]._get_name() == "ReLU"
        assert dnn[4].in_features == 128
        assert dnn[4].out_features == 1
        assert dnn[5]._get_name() == "LeakyReLU"

        valid_step = torch.tensor([[1], [2], [3]])
        batch_size = 3
        seq_len = 4
        mask = _gen_mask(valid_step, batch_size, seq_len)
        assert torch.all(
            mask
            == torch.tensor(
                [[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 1.0], [0.0, 1.0, 1.0, 1.0]]
            )
        )

    def test_single_step_synthetic_reward(self):
        state_dim = 10
        action_dim = 2
        sizes = [256, 128]
        activations = ["sigmoid", "relu"]
        last_layer_activation = "leaky_relu"
        reward_net = SyntheticRewardNet(
            SingleStepSyntheticRewardNet(
                state_dim=state_dim,
                action_dim=action_dim,
                sizes=sizes,
                activations=activations,
                last_layer_activation=last_layer_activation,
            )
        )
        dnn = reward_net.export_mlp().dnn
        # dnn[0] is a concat layer
        assert dnn[1].in_features == state_dim + action_dim
        assert dnn[1].out_features == 256
        assert dnn[2]._get_name() == "Sigmoid"
        assert dnn[3].in_features == 256
        assert dnn[3].out_features == 128
        assert dnn[4]._get_name() == "ReLU"
        assert dnn[5].in_features == 128
        assert dnn[5].out_features == 1
        assert dnn[6]._get_name() == "LeakyReLU"

        valid_step = torch.tensor([[1], [2], [3]])
        batch_size = 3
        seq_len = 4
        mask = _gen_mask(valid_step, batch_size, seq_len)
        assert torch.all(
            mask
            == torch.tensor(
                [[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 1.0], [0.0, 1.0, 1.0, 1.0]]
            )
        )
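The expected tensors in the two mask checks above pin down what _gen_mask must return. A minimal sketch that reproduces them (an assumed reimplementation, not the library's source) marks the last valid_step positions of each row as valid:

import torch

def gen_mask_sketch(valid_step, batch_size, seq_len):
    positions = torch.arange(seq_len).repeat(batch_size, 1)  # (batch_size, seq_len)
    return (positions >= seq_len - valid_step).float()

mask = gen_mask_sketch(torch.tensor([[1], [2], [3]]), batch_size=3, seq_len=4)
# -> [[0., 0., 0., 1.], [0., 0., 1., 1.], [0., 1., 1., 1.]]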
Example #9
    def test_transformer_parametric_reward(self):
        """
        Reward at each step is a linear function of states and actions in a
        context window around the step.

        However, we can only observe the aggregated reward at the last step.
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 10000
        d_model = 64
        nhead = 8
        num_encoder_layers = 1
        dim_feedforward = 64
        last_layer_activation = "linear"
        max_len = seq_len + 1
        reward_net = SyntheticRewardNet(
            TransformerSyntheticRewardNet(
                state_dim=state_dim,
                action_dim=action_dim,
                d_model=d_model,
                nhead=nhead,
                num_encoder_layers=num_encoder_layers,
                dim_feedforward=dim_feedforward,
                dropout=0.0,
                activation="relu",
                last_layer_activation=last_layer_activation,
                layer_norm_eps=1e-5,
                max_len=max_len,
            ))
        optimizer = Optimizer__Union(Adam=classes["Adam"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            ))
        weight, data = create_sequence_data(state_dim, action_dim, seq_len,
                                            batch_size, num_batches)

        threshold = 0.25
        avg_eval_loss = train_and_eval(trainer, data)
        assert (avg_eval_loss <
                threshold), "loss = {:.4f} larger than threshold {}".format(
                    avg_eval_loss, threshold)
Example #10
    def test_ngram_conv_net_parametric_reward(self):
        """
        Reward at each step is a linear function of states and actions in a
        context window around the step.

        However, we can only observe the aggregated reward at the last step.
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 5000
        sizes = [128, 64]
        activations = ["relu", "relu"]
        last_layer_activation = "linear"
        conv_net_params = rlp.ConvNetParameters(
            conv_dims=[128],
            conv_height_kernels=[1],
            pool_types=["max"],
            pool_kernel_sizes=[1],
        )
        reward_net = SyntheticRewardNet(
            NGramConvolutionalNetwork(
                state_dim=state_dim,
                action_dim=action_dim,
                sizes=sizes,
                activations=activations,
                last_layer_activation=last_layer_activation,
                context_size=3,
                conv_net_params=conv_net_params,
            ))
        optimizer = Optimizer__Union(Adam=classes["Adam"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            ))
        weight, data = create_sequence_data(state_dim, action_dim, seq_len,
                                            batch_size, num_batches)
        threshold = 0.2
        avg_eval_loss = train_and_eval(trainer, data)
        assert avg_eval_loss < threshold, "loss = {} larger than threshold {}".format(
            avg_eval_loss, threshold)
Example #11
    def _test_linear_reward_parametric_reward(
            self, ground_truth_reward_from_multiple_steps=False):
        """
        Reward at each step is a linear function of the present state and action.
        However, we can only observe the aggregated reward at the last step.

        This model will fail to learn when ground-truth reward is a function of
        multiple steps' states and actions.
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 5000
        sizes = [256, 128]
        activations = ["relu", "relu"]
        last_layer_activation = "linear"
        reward_net = SyntheticRewardNet(
            SingleStepSyntheticRewardNet(
                state_dim=state_dim,
                action_dim=action_dim,
                sizes=sizes,
                activations=activations,
                last_layer_activation=last_layer_activation,
            ))
        optimizer = Optimizer__Union(Adam=classes["Adam"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            ))
        if ground_truth_reward_from_multiple_steps:
            weight, data = create_sequence_data(state_dim, action_dim, seq_len,
                                                batch_size, num_batches)
        else:
            weight, data = create_data(state_dim, action_dim, seq_len,
                                       batch_size, num_batches)
        avg_eval_loss = train_and_eval(trainer, data)
        return avg_eval_loss
Example #12
    def build_synthetic_reward_network(
        self,
        state_normalization_data: NormalizationData,
        action_normalization_data: Optional[NormalizationData] = None,
        discrete_action_names: Optional[List[str]] = None,
    ) -> ModelBase:
        state_dim = get_num_output_features(
            state_normalization_data.dense_normalization_parameters)
        if not discrete_action_names:
            assert action_normalization_data is not None
            action_dim = get_num_output_features(
                action_normalization_data.dense_normalization_parameters)
        else:
            action_dim = len(discrete_action_names)
        net = SequenceSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            lstm_hidden_size=self.lstm_hidden_size,
            lstm_num_layers=self.lstm_num_layers,
            lstm_bidirectional=self.lstm_bidirectional,
            last_layer_activation=self.last_layer_activation,
        )
        return SyntheticRewardNet(net=net)
Example #13
    def test_ngram_fc_parametric_reward(self):
        """
        Reward at each step is a linear function of states and actions in a
        context window around the step.

        However, we can only observe aggregated reward at the last step
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 10000
        sizes = [256, 128]
        activations = ["relu", "relu"]
        last_layer_activation = "linear"
        reward_net = SyntheticRewardNet(
            NGramFullyConnectedNetwork(
                state_dim=state_dim,
                action_dim=action_dim,
                sizes=sizes,
                activations=activations,
                last_layer_activation=last_layer_activation,
                context_size=3,
            ))
        optimizer = Optimizer__Union(Adam=classes["Adam"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            ))
        weight, data = create_sequence_data(state_dim, action_dim, seq_len,
                                            batch_size, num_batches)
        threshold = 0.2
        avg_eval_loss = train_and_eval(trainer, data)
        assert avg_eval_loss < threshold