Example #1
    def test_linear_reward_parametric_reward(self):
        """
        Reward at each step is a linear function of state and action.
        However, we can only observe aggregated reward at the last step
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 10000
        sizes = [256, 128]
        activations = ["relu", "relu"]
        last_layer_activation = "linear"
        reward_net = SingleStepSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=sizes,
            activations=activations,
            last_layer_activation=last_layer_activation,
        )
        optimizer = Optimizer__Union(SGD=classes["SGD"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            )
        )
        weight, data = create_data(
            state_dim, action_dim, seq_len, batch_size, num_batches
        )
        threshold = 0.1
        avg_eval_loss = train_and_eval(trainer, data)
        assert avg_eval_loss < threshold
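create_data and train_and_eval are helpers defined in the test module and are not shown in the snippet. The docstring pins down what they implement: each step's reward is linear in (state, action), but only the aggregate over the whole sequence is observable. A minimal sketch of such a data generator follows; the name make_linear_reward_batch and the exact shapes are hypothetical, not the actual ReAgent helper.

import torch

def make_linear_reward_batch(state_dim, action_dim, seq_len, batch_size):
    # Hypothetical sketch, not ReAgent's create_data: every step's reward is
    # linear in (state, action), but only the sum over the sequence is exposed.
    weight = torch.randn(state_dim + action_dim)
    states = torch.randn(seq_len, batch_size, state_dim)
    actions = torch.randn(seq_len, batch_size, action_dim)
    features = torch.cat((states, actions), dim=2)
    step_reward = torch.einsum("sbf,f->sb", features, weight)
    aggregated_reward = step_reward.sum(dim=0)  # the only observable target
    return weight, states, actions, aggregated_reward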
Example #2
    def test_lstm_parametric_reward(self):
        """
        Reward at each step is a linear function of states and actions in a
        context window around the step.

        However, we can only observe aggregated reward at the last step
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 5000
        last_layer_activation = "linear"
        reward_net = synthetic_reward.SequenceSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            lstm_hidden_size=128,
            lstm_num_layers=2,
            lstm_bidirectional=True,
            last_layer_activation=last_layer_activation,
        )
        optimizer = Optimizer__Union(Adam=classes["Adam"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            ))
        weight, data = create_sequence_data(state_dim, action_dim, seq_len,
                                            batch_size, num_batches)
        threshold = 0.2
        avg_eval_loss = train_and_eval(trainer, data)
        assert avg_eval_loss < threshold
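create_sequence_data is the sequence counterpart: here the ground-truth reward of a step also depends on the steps around it, which is what motivates the bidirectional LSTM. A rough sketch of that kind of ground truth, again with hypothetical names; weight is assumed to have shape (context_size, state_dim + action_dim).

import torch
import torch.nn.functional as F

def context_window_reward(states, actions, weight, context_size=3):
    # Hypothetical sketch: score each step against a window of (state, action)
    # pairs centred on it, then expose only the aggregated reward.
    feats = torch.cat((states, actions), dim=2)       # (seq_len, batch, feat)
    pad = context_size // 2
    padded = F.pad(feats, (0, 0, 0, 0, pad, pad))     # pad along the time axis
    windows = torch.stack(
        [padded[i : i + context_size] for i in range(feats.shape[0])]
    )                                                 # (seq_len, ctx, batch, feat)
    step_reward = torch.einsum("scbf,cf->sb", windows, weight)
    return step_reward.sum(dim=0)                     # aggregated observation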
Example #3
    def test_linear_reward_parametric_reward(self):
        """
        Reward at each step is a linear function of state and action.
        However, we can only observe aggregated reward at the last step
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 10000
        sizes = [256, 128]
        activations = ["relu", "relu"]
        last_layer_activation = "linear"
        reward_net = SingleStepSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=sizes,
            activations=activations,
            last_layer_activation=last_layer_activation,
        )
        optimizer = Optimizer__Union(SGD=classes["SGD"]())
        trainer = RewardNetTrainer(reward_net, optimizer)

        weight, data_generator = create_data(state_dim, action_dim, seq_len,
                                             batch_size, num_batches)
        threshold = 0.1
        reach_threshold = False
        for batch in data_generator():
            loss = trainer.train(batch)
            if loss < threshold:
                reach_threshold = True
                break

        assert reach_threshold, f"last loss={loss}"
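Unlike the other examples, this one inlines the training loop: it calls trainer.train batch by batch and passes as soon as one batch's loss drops below the threshold. The train_and_eval helper used elsewhere is not shown; one plausible shape for it is to train on most batches and average the loss over a held-out tail. The sketch below assumes a trainer.evaluate(batch) hook purely for illustration and is not the actual ReAgent helper.

def train_and_eval_sketch(trainer, batches, num_eval_batches=100):
    # Hypothetical sketch of a train-then-evaluate helper: fit on the leading
    # batches, then report the mean loss over the held-out tail.
    # `trainer.evaluate` is assumed here for illustration only.
    batches = list(batches)
    for batch in batches[:-num_eval_batches]:
        trainer.train(batch)
    eval_losses = [
        float(trainer.evaluate(batch)) for batch in batches[-num_eval_batches:]
    ]
    return sum(eval_losses) / len(eval_losses)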
Example #4
    def test_ngram_fc_parametric_reward(self):
        """
        Reward at each step is a linear function of states and actions in a
        context window around the step.

        However, we can only observe aggregated reward at the last step
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 5000
        sizes = [256, 128]
        activations = ["relu", "relu"]
        last_layer_activation = "linear"
        reward_net = synthetic_reward.NGramSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=sizes,
            activations=activations,
            last_layer_activation=last_layer_activation,
            context_size=3,
        )
        optimizer = Optimizer__Union(Adam=classes["Adam"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            ))
        weight, data = create_data(state_dim, action_dim, seq_len, batch_size,
                                   num_batches)
        threshold = 0.2
        avg_eval_loss = train_and_eval(trainer, data)
        assert avg_eval_loss < threshold
Example #5
    def test_ngram_conv_net_parametric_reward(self):
        """
        Reward at each step is a linear function of states and actions in a
        context window around the step.

        However, we can only observe aggregated reward at the last step
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 5000
        sizes = [128, 64]
        activations = ["relu", "relu"]
        last_layer_activation = "linear"
        conv_net_params = rlp.ConvNetParameters(
            conv_dims=[128],
            conv_height_kernels=[1],
            pool_types=["max"],
            pool_kernel_sizes=[1],
        )
        conv_net = synthetic_reward.NGramConvolutionalNetwork(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=sizes,
            activations=activations,
            last_layer_activation=last_layer_activation,
            context_size=3,
            conv_net_params=conv_net_params,
        )

        reward_net = synthetic_reward.NGramSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            context_size=3,
            net=conv_net,
        )
        optimizer = Optimizer__Union(Adam=classes["Adam"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            ))
        weight, data = create_sequence_data(state_dim, action_dim, seq_len,
                                            batch_size, num_batches)
        threshold = 0.2
        avg_eval_loss = train_and_eval(trainer, data)
        assert avg_eval_loss < threshold
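ConvNetParameters is passed straight through to NGramConvolutionalNetwork, so the snippet does not show how its fields become layers. One plausible reading, and nothing more than that, is one convolution-plus-pooling stage per list entry, with the kernel height sliding over the context dimension:

import torch.nn as nn

def conv_stack_sketch(in_channels, conv_dims, conv_height_kernels,
                      pool_types, pool_kernel_sizes):
    # Hypothetical mapping of the ConvNetParameters fields onto torch layers;
    # not ReAgent's actual implementation.
    layers, channels = [], in_channels
    for dim, kh, pool, pk in zip(conv_dims, conv_height_kernels,
                                 pool_types, pool_kernel_sizes):
        layers.append(nn.Conv2d(channels, dim, kernel_size=(kh, 1)))
        layers.append(nn.ReLU())
        layers.append((nn.MaxPool2d if pool == "max" else nn.AvgPool2d)((pk, 1)))
        channels = dim
    return nn.Sequential(*layers)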
Example #6
    def test_transformer_parametric_reward(self):
        """
        Reward at each step is a linear function of states and actions in a
        context window around the step.

        However, we can only observe aggregated reward at the last step
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 10000
        d_model = 64
        nhead = 8
        num_encoder_layers = 1
        dim_feedforward = 64
        last_layer_activation = "linear"
        max_len = seq_len + 1
        reward_net = SyntheticRewardNet(
            TransformerSyntheticRewardNet(
                state_dim=state_dim,
                action_dim=action_dim,
                d_model=d_model,
                nhead=nhead,
                num_encoder_layers=num_encoder_layers,
                dim_feedforward=dim_feedforward,
                dropout=0.0,
                activation="relu",
                last_layer_activation=last_layer_activation,
                layer_norm_eps=1e-5,
                max_len=max_len,
            ))
        optimizer = Optimizer__Union(Adam=classes["Adam"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            ))
        weight, data = create_sequence_data(state_dim, action_dim, seq_len,
                                            batch_size, num_batches)

        threshold = 0.25
        avg_eval_loss = train_and_eval(trainer, data)
        assert (avg_eval_loss <
                threshold), "loss = {:.4f} larger than threshold {}".format(
                    avg_eval_loss, threshold)
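Every example configures the optimizer through Optimizer__Union(Adam=classes["Adam"]()) or its SGD variant, i.e. a union config in which exactly one branch is populated. A stripped-down illustration of that tagged-union idea, assuming nothing about ReAgent's actual implementation beyond the shape visible in these calls:

from dataclasses import dataclass, fields
from typing import Optional

import torch

@dataclass
class AdamConfig:
    lr: float = 1e-3

@dataclass
class SGDConfig:
    lr: float = 1e-2

@dataclass
class OptimizerUnionSketch:
    # Hypothetical stand-in for Optimizer__Union: exactly one branch is set and
    # decides which torch optimizer gets built.
    Adam: Optional[AdamConfig] = None
    SGD: Optional[SGDConfig] = None

    def make_optimizer(self, params):
        chosen = [(f.name, getattr(self, f.name))
                  for f in fields(self) if getattr(self, f.name) is not None]
        assert len(chosen) == 1, "exactly one optimizer branch must be set"
        name, cfg = chosen[0]
        return getattr(torch.optim, name)(params, lr=cfg.lr)

# e.g. OptimizerUnionSketch(Adam=AdamConfig()).make_optimizer(net.parameters())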
Example #7
    def _test_linear_reward_parametric_reward(
            self, ground_truth_reward_from_multiple_steps=False):
        """
        Reward at each step is a linear function of present state and action.
        However, we can only observe aggregated reward at the last step

        This model will fail to learn when ground-truth reward is a function of
        multiple steps' states and actions.
        """
        state_dim = 10
        action_dim = 2
        seq_len = 5
        batch_size = 512
        num_batches = 5000
        sizes = [256, 128]
        activations = ["relu", "relu"]
        last_layer_activation = "linear"
        reward_net = SyntheticRewardNet(
            SingleStepSyntheticRewardNet(
                state_dim=state_dim,
                action_dim=action_dim,
                sizes=sizes,
                activations=activations,
                last_layer_activation=last_layer_activation,
            ))
        optimizer = Optimizer__Union(Adam=classes["Adam"]())
        trainer = RewardNetTrainer(reward_net, optimizer)
        trainer.set_reporter(
            RewardNetworkReporter(
                trainer.loss_type,
                str(reward_net),
            ))
        if ground_truth_reward_from_multiple_steps:
            weight, data = create_sequence_data(state_dim, action_dim, seq_len,
                                                batch_size, num_batches)
        else:
            weight, data = create_data(state_dim, action_dim, seq_len,
                                       batch_size, num_batches)
        avg_eval_loss = train_and_eval(trainer, data)
        return avg_eval_loss
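Example #7 only defines the shared helper; per its docstring, the single-step model should succeed when the ground truth depends on the current step alone and fail when it mixes several steps. Public tests built on top of it would plausibly look like the sketch below; the method names and thresholds are illustrative, not the actual test code.

    def test_single_step_reward_is_learnable(self):
        # Ground truth uses only the current step, so the model should fit it.
        avg_eval_loss = self._test_linear_reward_parametric_reward(
            ground_truth_reward_from_multiple_steps=False
        )
        assert avg_eval_loss < 0.1

    def test_multi_step_reward_is_not_learnable(self):
        # Ground truth mixes several steps; a single-step model should stay
        # above the same threshold.
        avg_eval_loss = self._test_linear_reward_parametric_reward(
            ground_truth_reward_from_multiple_steps=True
        )
        assert avg_eval_loss > 0.1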
Example #8
    def build_trainer(self, use_gpu: bool) -> RewardNetTrainer:
        net_builder = self.net_builder.value
        synthetic_reward_network = net_builder.build_synthetic_reward_network(
            self.state_normalization_data,
            action_normalization_data=self.action_normalization_data,
            discrete_action_names=self.discrete_action_names,
        )

        # pyre-fixme[16]: `SyntheticReward` has no attribute `_synthetic_reward_network`.
        self._synthetic_reward_network = synthetic_reward_network
        trainer = RewardNetTrainer(
            self._synthetic_reward_network,
            # pyre-fixme[16]: `RewardNetworkTrainerParameters` has no attribute
            #  `asdict`.
            **self.trainer_param.asdict(),
        )
        return trainer
Example #9
    def build_trainer(
        self,
        normalization_data_map: Dict[str, NormalizationData],
        use_gpu: bool,
        reward_options: Optional[RewardOptions] = None,
    ) -> RewardNetTrainer:
        net_builder = self.net_builder.value
        action_normalization_data = None
        if not self.discrete_action_names:
            action_normalization_data = normalization_data_map[NormalizationKey.ACTION]
        synthetic_reward_network = net_builder.build_synthetic_reward_network(
            normalization_data_map[NormalizationKey.STATE],
            action_normalization_data=action_normalization_data,
            discrete_action_names=self.discrete_action_names,
        )

        trainer = RewardNetTrainer(
            synthetic_reward_network,
            # pyre-fixme[16]: `RewardNetworkTrainerParameters` has no attribute
            #  `asdict`.
            **self.trainer_param.asdict(),
        )
        return trainer
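Both build_trainer examples expand the trainer parameters with **self.trainer_param.asdict(), i.e. a parameters dataclass is flattened into keyword arguments of RewardNetTrainer (the pyre-fixme comments merely suppress the type checker's complaint that it cannot see asdict). A generic illustration of the pattern; the field names below are invented and are not those of RewardNetworkTrainerParameters.

from dataclasses import asdict, dataclass

@dataclass
class TrainerParamsSketch:
    # Hypothetical fields, for illustration only.
    learning_rate: float = 1e-3
    loss_type: str = "mse"

    def asdict(self):
        return asdict(self)

class TrainerSketch:
    def __init__(self, net, learning_rate=1e-3, loss_type="mse"):
        self.net, self.learning_rate, self.loss_type = net, learning_rate, loss_type

# Mirrors RewardNetTrainer(synthetic_reward_network, **self.trainer_param.asdict()):
# every dataclass field becomes a constructor keyword argument.
trainer = TrainerSketch(net=None, **TrainerParamsSketch().asdict())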