    def test_single_step_synthetic_reward_net_builder_discrete_actions(self):
        builder = SyntheticRewardNetBuilder__Union(
            SingleStepSyntheticReward=SingleStepSyntheticReward()).value
        state_normalization_data = _create_norm(STATE_DIM)
        discrete_action_names = ["1", "2"]
        reward_net = builder.build_synthetic_reward_network(
            state_normalization_data,
            discrete_action_names=discrete_action_names)
        input = _create_input()
        output = reward_net(input).predicted_reward
        assert output.shape == (BATCH_SIZE, 1)
    def test_single_step_synthetic_reward_net_builder_continuous_actions(
        self, ):
        builder = SyntheticRewardNetBuilder__Union(
            SingleStepSyntheticReward=SingleStepSyntheticReward()).value
        state_normalization_data = _create_norm(STATE_DIM)
        action_normalization_data = _create_norm(ACTION_DIM, offset=STATE_DIM)
        reward_net = builder.build_synthetic_reward_network(
            state_normalization_data,
            action_normalization_data=action_normalization_data,
        )
        input = _create_input()
        output = reward_net(input).predicted_reward
        assert output.shape == (BATCH_SIZE, 1)

        predictor_wrapper = builder.build_serving_module(
            reward_net,
            state_normalization_data,
            action_normalization_data=action_normalization_data,
        )
        self.assertIsInstance(
            predictor_wrapper,
            ParametricSingleStepSyntheticRewardPredictorWrapper)
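
The fixtures `_create_norm` and `_create_input`, and the constants `STATE_DIM`, `ACTION_DIM`, and `BATCH_SIZE`, are defined elsewhere in the test module and are not shown in this snippet. Below is a minimal sketch of what `_create_norm` plausibly looks like; the dimension values, the import path, and the all-CONTINUOUS feature assumption are illustrative guesses, not the library's actual fixture code.

# Hypothetical test fixtures (assumed, not taken from the source):
from reagent.core.parameters import NormalizationData, NormalizationParameters

STATE_DIM = 3    # assumed number of state features
ACTION_DIM = 2   # assumed number of action features
BATCH_SIZE = 2   # assumed batch size

def _create_norm(dim, offset=0):
    # Normalization metadata for `dim` continuous features whose ids run
    # from `offset` to `offset + dim - 1` (all assumed CONTINUOUS).
    return NormalizationData(
        dense_normalization_parameters={
            i: NormalizationParameters(feature_type="CONTINUOUS", mean=0.0, stddev=1.0)
            for i in range(offset, offset + dim)
        }
    )

# _create_input() (not sketched here) is assumed to wrap random state/action
# tensors of shape (seq_len, BATCH_SIZE, feature_dim) in ReAgent's batch type,
# so that reward_net(input).predicted_reward has shape (BATCH_SIZE, 1).
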
Example #3
class SyntheticReward(ModelManager):
    """
    Train models to attribute single step rewards from sparse/delayed/aggregated rewards.
    Ideas from:
    1. Synthetic Returns for Long-Term Credit Assignment: https://arxiv.org/pdf/2102.12425.pdf
    2. RUDDER: Return Decomposition for Delayed Rewards: https://arxiv.org/pdf/1806.07857.pdf
    3. Optimizing Agent Behavior over Long Time Scales by Transporting Value: https://arxiv.org/pdf/1810.06721.pdf
    4. Sequence Modeling of Temporal Credit Assignment for Episodic Reinforcement Learning: https://arxiv.org/pdf/1905.13420.pdf
    """

    __hash__ = param_hash

    trainer_param: RewardNetworkTrainerParameters = field(
        default_factory=RewardNetworkTrainerParameters
    )
    net_builder: SyntheticRewardNetBuilder__Union = field(
        # pyre-fixme[28]: Unexpected keyword argument `SlateRewardTransformer`.
        default_factory=lambda: SyntheticRewardNetBuilder__Union(
            SingleStepSyntheticReward=SingleStepSyntheticReward()
        )
    )
    eval_parameters: EvaluationParameters = field(default_factory=EvaluationParameters)
    state_preprocessing_options: Optional[PreprocessingOptions] = None
    action_preprocessing_options: Optional[PreprocessingOptions] = None
    state_float_features: Optional[List[Tuple[int, str]]] = None
    parametric_action_float_features: Optional[List[Tuple[int, str]]] = None
    discrete_action_names: Optional[List[str]] = None
    # max sequence length to look back to distribute rewards
    max_seq_len: int = 5

    def __post_init_post_parse__(self):
        super().__post_init_post_parse__()
        assert self.max_seq_len is not None and self.max_seq_len > 0
        assert (
            self.state_preprocessing_options is None
            or self.state_preprocessing_options.allowedlist_features is None
        ), (
            "Please set state allowlist features in state_float_features field of "
            "config instead"
        )

        if self.discrete_action_names:
            assert (
                type(self.discrete_action_names) is list
                and len(self.discrete_action_names) > 1
            ), f"Assume this is a discrete action problem, you need to specify at least 2 actions. Got {self.discrete_action_names}."
        else:
            assert (
                self.action_preprocessing_options is None
                or self.action_preprocessing_options.allowedlist_features is None
            ), (
                "Please set action allowlist features in parametric_action_float_features field of "
                "config instead"
            )

    @property
    def state_feature_config(self) -> rlt.ModelFeatureConfig:
        return get_feature_config(self.state_float_features)

    @property
    def action_feature_config(self) -> rlt.ModelFeatureConfig:
        return get_feature_config(self.parametric_action_float_features)

    def get_data_module(
        self,
        *,
        input_table_spec: Optional[TableSpec] = None,
        reward_options: Optional[RewardOptions] = None,
        reader_options: Optional[ReaderOptions] = None,
        setup_data: Optional[Dict[str, bytes]] = None,
        saved_setup_data: Optional[Dict[str, bytes]] = None,
        resource_options: Optional[ResourceOptions] = None,
    ) -> Optional[ReAgentDataModule]:
        return SyntheticRewardDataModule(
            input_table_spec=input_table_spec,
            reward_options=reward_options,
            setup_data=setup_data,
            saved_setup_data=saved_setup_data,
            reader_options=reader_options,
            resource_options=resource_options,
            model_manager=self,
        )

    def build_trainer(
        self,
        normalization_data_map: Dict[str, NormalizationData],
        use_gpu: bool,
        reward_options: Optional[RewardOptions] = None,
    ) -> RewardNetTrainer:
        net_builder = self.net_builder.value
        action_normalization_data = None
        if not self.discrete_action_names:
            action_normalization_data = normalization_data_map[NormalizationKey.ACTION]
        synthetic_reward_network = net_builder.build_synthetic_reward_network(
            normalization_data_map[NormalizationKey.STATE],
            action_normalization_data=action_normalization_data,
            discrete_action_names=self.discrete_action_names,
        )

        trainer = RewardNetTrainer(
            synthetic_reward_network,
            # pyre-fixme[16]: `RewardNetworkTrainerParameters` has no attribute
            #  `asdict`.
            **self.trainer_param.asdict(),
        )
        return trainer

    def get_reporter(self):
        return RewardNetworkReporter(
            self.trainer_param.loss_type,
            str(self.net_builder.value),
        )

    def build_serving_module(
        self,
        trainer_module: ReAgentLightningModule,
        normalization_data_map: Dict[str, NormalizationData],
    ) -> torch.nn.Module:
        """
        Returns a TorchScript predictor module
        """
        assert isinstance(trainer_module, RewardNetTrainer)

        net_builder = self.net_builder.value
        action_normalization_data = None
        if not self.discrete_action_names:
            action_normalization_data = normalization_data_map[NormalizationKey.ACTION]
        return net_builder.build_serving_module(
            self.max_seq_len,
            trainer_module.reward_net,
            normalization_data_map[NormalizationKey.STATE],
            action_normalization_data=action_normalization_data,
            discrete_action_names=self.discrete_action_names,
        )
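
As a rough illustration of how this ModelManager is driven, here is a hypothetical usage sketch for a discrete-action problem. The action names and the `state_norm` object (e.g. something like `_create_norm(STATE_DIM)` from the test snippet above) are assumptions; in a real pipeline the normalization map comes from the data module rather than being built by hand.

# Hypothetical usage sketch (assumed, not from the source):
manager = SyntheticReward(
    discrete_action_names=["up", "down"],  # assumed action names
    max_seq_len=5,
)
normalization_data_map = {NormalizationKey.STATE: state_norm}  # state_norm: NormalizationData
trainer = manager.build_trainer(
    normalization_data_map,
    use_gpu=False,
)
# ... fit `trainer` with a Lightning Trainer and the data module ...
serving_module = manager.build_serving_module(trainer, normalization_data_map)
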
Example #4
class SyntheticReward(ModelManager):
    """
    Train models to attribute single step rewards from sparse/delayed/aggregated rewards.
    Ideas from:
    1. Synthetic Returns for Long-Term Credit Assignment: https://arxiv.org/pdf/2102.12425.pdf
    2. RUDDER: Return Decomposition for Delayed Rewards: https://arxiv.org/pdf/1806.07857.pdf
    3. Optimizing Agent Behavior over Long Time Scales by Transporting Value: https://arxiv.org/pdf/1810.06721.pdf
    4. Sequence Modeling of Temporal Credit Assignment for Episodic Reinforcement Learning: https://arxiv.org/pdf/1905.13420.pdf
    """

    __hash__ = param_hash

    trainer_param: RewardNetworkTrainerParameters = field(
        default_factory=RewardNetworkTrainerParameters)
    net_builder: SyntheticRewardNetBuilder__Union = field(
        # pyre-fixme[28]: Unexpected keyword argument `SlateRewardTransformer`.
        default_factory=lambda: SyntheticRewardNetBuilder__Union(
            SingleStepSyntheticReward=SingleStepSyntheticReward()))
    eval_parameters: EvaluationParameters = field(
        default_factory=EvaluationParameters)
    state_preprocessing_options: Optional[PreprocessingOptions] = None
    action_preprocessing_options: Optional[PreprocessingOptions] = None
    state_float_features: Optional[List[Tuple[int, str]]] = None
    parametric_action_float_features: Optional[List[Tuple[int, str]]] = None
    discrete_action_names: Optional[List[str]] = None
    # max sequence length to look back to distribute rewards
    max_seq_len: int = 5

    def __post_init_post_parse__(self):
        super().__post_init_post_parse__()
        assert self.max_seq_len is not None and self.max_seq_len > 0
        assert (
            self.state_preprocessing_options is None
            or self.state_preprocessing_options.allowedlist_features is None
        ), ("Please set state whitelist features in state_float_features field of "
            "config instead")

        if not self.action_preprocessing_options:
            assert (
                type(self.discrete_action_names) is list
                and len(self.discrete_action_names) > 1
            ), (f"Assume this is a discrete action problem because no action_preprocessing_option "
                f"is specified. Then you need to specify at least 2 actions. Got {self.discrete_action_names}."
                )
        else:
            assert not self.discrete_action_names, (
                "If it is a parametric-action problem, please specify action_preprocessing_options "
                "and parametric_action_float_features, "
                "and do not specify discrete_action_names")
            assert self.action_preprocessing_options.allowedlist_features is None, (
                "Please set action whitelist features in parametric_action_float_features field of "
                "config instead")

    @property
    def should_generate_eval_dataset(self) -> bool:
        raise RuntimeError

    @property
    def state_feature_config(self) -> rlt.ModelFeatureConfig:
        return get_feature_config(self.state_float_features)

    @property
    def action_feature_config(self) -> rlt.ModelFeatureConfig:
        return get_feature_config(self.parametric_action_float_features)

    def run_feature_identification(
            self, input_table_spec: TableSpec) -> Dict[str, NormalizationData]:
        raise RuntimeError

    def get_data_module(
        self,
        *,
        input_table_spec: Optional[TableSpec] = None,
        reward_options: Optional[RewardOptions] = None,
        reader_options: Optional[ReaderOptions] = None,
        setup_data: Optional[Dict[str, bytes]] = None,
        saved_setup_data: Optional[Dict[str, bytes]] = None,
        resource_options: Optional[ResourceOptions] = None,
    ) -> Optional[ReAgentDataModule]:
        return SyntheticRewardDataModule(
            input_table_spec=input_table_spec,
            reward_options=reward_options,
            setup_data=setup_data,
            saved_setup_data=saved_setup_data,
            reader_options=reader_options,
            resource_options=resource_options,
            model_manager=self,
        )

    @property
    def required_normalization_keys(self) -> List[str]:
        raise RuntimeError

    def build_trainer(self, use_gpu: bool) -> RewardNetTrainer:
        net_builder = self.net_builder.value
        synthetic_reward_network = net_builder.build_synthetic_reward_network(
            self.state_normalization_data,
            action_normalization_data=self.action_normalization_data,
            discrete_action_names=self.discrete_action_names,
        )
        if use_gpu:
            synthetic_reward_network = synthetic_reward_network.cuda()

        # pyre-fixme[16]: `SyntheticReward` has no attribute `_synthetic_reward_network`.
        self._synthetic_reward_network = synthetic_reward_network
        trainer = RewardNetTrainer(
            self._synthetic_reward_network,
            # pyre-fixme[16]: `RewardNetworkTrainerParameters` has no attribute
            #  `asdict`.
            **self.trainer_param.asdict(),
        )
        return trainer

    def get_reporter(self):
        return RewardNetworkReporter(
            self.trainer_param.loss_type,
            str(self.net_builder.value),
        )

    def build_serving_module(self) -> torch.nn.Module:
        """
        Returns a TorchScript predictor module
        """
        assert (self._synthetic_reward_network
                is not None), "_synthetic_reward_network was not initialized"

        net_builder = self.net_builder.value
        return net_builder.build_serving_module(
            self._synthetic_reward_network,
            self.state_normalization_data,
            action_names=self.discrete_action_names,
            state_feature_config=self.state_feature_config,
        )
Example #5
    def test_single_step_synthetic_reward_net_builder_continuous_actions(self):
        builder = SyntheticRewardNetBuilder__Union(
            SingleStepSyntheticReward=SingleStepSyntheticReward()).value
        self._test_synthetic_reward_net_builder_continuous_actions(builder)