def test_single_step_synthetic_reward_net_builder_discrete_actions(self):
    builder = SyntheticRewardNetBuilder__Union(
        SingleStepSyntheticReward=SingleStepSyntheticReward()
    ).value
    state_normalization_data = _create_norm(STATE_DIM)
    discrete_action_names = ["1", "2"]
    reward_net = builder.build_synthetic_reward_network(
        state_normalization_data, discrete_action_names=discrete_action_names
    )
    input = _create_input()
    output = reward_net(input).predicted_reward
    assert output.shape == (BATCH_SIZE, 1)
def test_single_step_synthetic_reward_net_builder_continuous_actions(self):
    builder = SyntheticRewardNetBuilder__Union(
        SingleStepSyntheticReward=SingleStepSyntheticReward()
    ).value
    state_normalization_data = _create_norm(STATE_DIM)
    action_normalization_data = _create_norm(ACTION_DIM, offset=STATE_DIM)
    reward_net = builder.build_synthetic_reward_network(
        state_normalization_data,
        action_normalization_data=action_normalization_data,
    )
    input = _create_input()
    output = reward_net(input).predicted_reward
    assert output.shape == (BATCH_SIZE, 1)

    predictor_wrapper = builder.build_serving_module(
        reward_net,
        state_normalization_data,
        action_normalization_data=action_normalization_data,
    )
    self.assertIsInstance(
        predictor_wrapper, ParametricSingleStepSyntheticRewardPredictorWrapper
    )
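
# The two tests above rely on module-level fixtures (_create_norm, _create_input) and
# constants (STATE_DIM, ACTION_DIM, BATCH_SIZE) that are not shown in this snippet.
# A minimal sketch of _create_norm follows, assuming the NormalizationData /
# NormalizationParameters types and import paths of recent ReAgent versions; the
# mean/stddev values are illustrative. _create_input is expected to return a batch
# whose forward pass through the reward net yields `.predicted_reward` of shape
# (BATCH_SIZE, 1); its exact type is not reconstructed here.
from reagent.core.parameters import NormalizationData, NormalizationParameters
from reagent.preprocessing.identify_types import CONTINUOUS


def _create_norm(dim: int, offset: int = 0) -> NormalizationData:
    # One continuous normalization parameter per feature id in [offset, offset + dim).
    return NormalizationData(
        dense_normalization_parameters={
            i: NormalizationParameters(feature_type=CONTINUOUS, mean=0.0, stddev=1.0)
            for i in range(offset, offset + dim)
        }
    )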
class SyntheticReward(ModelManager):
    """
    Train models to attribute single-step rewards from sparse/delayed/aggregated rewards.
    Ideas from:
    1. Synthetic Returns for Long-Term Credit Assignment: https://arxiv.org/pdf/2102.12425.pdf
    2. RUDDER: Return Decomposition for Delayed Rewards: https://arxiv.org/pdf/1806.07857.pdf
    3. Optimizing Agent Behavior over Long Time Scales by Transporting Value: https://arxiv.org/pdf/1810.06721.pdf
    4. Sequence Modeling of Temporal Credit Assignment for Episodic Reinforcement Learning: https://arxiv.org/pdf/1905.13420.pdf
    """

    __hash__ = param_hash

    trainer_param: RewardNetworkTrainerParameters = field(
        default_factory=RewardNetworkTrainerParameters
    )
    net_builder: SyntheticRewardNetBuilder__Union = field(
        # pyre-fixme[28]: Unexpected keyword argument `SingleStepSyntheticReward`.
        default_factory=lambda: SyntheticRewardNetBuilder__Union(
            SingleStepSyntheticReward=SingleStepSyntheticReward()
        )
    )
    eval_parameters: EvaluationParameters = field(default_factory=EvaluationParameters)
    state_preprocessing_options: Optional[PreprocessingOptions] = None
    action_preprocessing_options: Optional[PreprocessingOptions] = None
    state_float_features: Optional[List[Tuple[int, str]]] = None
    parametric_action_float_features: Optional[List[Tuple[int, str]]] = None
    discrete_action_names: Optional[List[str]] = None
    # max sequence length to look back to distribute rewards
    max_seq_len: int = 5

    def __post_init_post_parse__(self):
        super().__post_init_post_parse__()
        assert self.max_seq_len is not None and self.max_seq_len > 0
        assert (
            self.state_preprocessing_options is None
            or self.state_preprocessing_options.allowedlist_features is None
        ), (
            "Please set state allowlist features in the state_float_features field "
            "of the config instead"
        )
        if self.discrete_action_names:
            assert (
                type(self.discrete_action_names) is list
                and len(self.discrete_action_names) > 1
            ), (
                "Assuming this is a discrete-action problem, you need to specify "
                f"at least 2 actions. Got {self.discrete_action_names}."
            )
        else:
            assert (
                self.action_preprocessing_options is None
                or self.action_preprocessing_options.allowedlist_features is None
            ), (
                "Please set action allowlist features in the "
                "parametric_action_float_features field of the config instead"
            )

    @property
    def state_feature_config(self) -> rlt.ModelFeatureConfig:
        return get_feature_config(self.state_float_features)

    @property
    def action_feature_config(self) -> rlt.ModelFeatureConfig:
        return get_feature_config(self.parametric_action_float_features)

    def get_data_module(
        self,
        *,
        input_table_spec: Optional[TableSpec] = None,
        reward_options: Optional[RewardOptions] = None,
        reader_options: Optional[ReaderOptions] = None,
        setup_data: Optional[Dict[str, bytes]] = None,
        saved_setup_data: Optional[Dict[str, bytes]] = None,
        resource_options: Optional[ResourceOptions] = None,
    ) -> Optional[ReAgentDataModule]:
        return SyntheticRewardDataModule(
            input_table_spec=input_table_spec,
            reward_options=reward_options,
            setup_data=setup_data,
            saved_setup_data=saved_setup_data,
            reader_options=reader_options,
            resource_options=resource_options,
            model_manager=self,
        )

    def build_trainer(
        self,
        normalization_data_map: Dict[str, NormalizationData],
        use_gpu: bool,
        reward_options: Optional[RewardOptions] = None,
    ) -> RewardNetTrainer:
        net_builder = self.net_builder.value
        action_normalization_data = None
        if not self.discrete_action_names:
            action_normalization_data = normalization_data_map[NormalizationKey.ACTION]
        synthetic_reward_network = net_builder.build_synthetic_reward_network(
            normalization_data_map[NormalizationKey.STATE],
            action_normalization_data=action_normalization_data,
            discrete_action_names=self.discrete_action_names,
        )
        trainer = RewardNetTrainer(
            synthetic_reward_network,
            # pyre-fixme[16]: `RewardNetworkTrainerParameters` has no attribute
            #  `asdict`.
            **self.trainer_param.asdict(),
        )
        return trainer

    def get_reporter(self):
        return RewardNetworkReporter(
            self.trainer_param.loss_type,
            str(self.net_builder.value),
        )

    def build_serving_module(
        self,
        trainer_module: ReAgentLightningModule,
        normalization_data_map: Dict[str, NormalizationData],
    ) -> torch.nn.Module:
        """
        Returns a TorchScript predictor module.
        """
        assert isinstance(trainer_module, RewardNetTrainer)
        net_builder = self.net_builder.value
        action_normalization_data = None
        if not self.discrete_action_names:
            action_normalization_data = normalization_data_map[NormalizationKey.ACTION]
        return net_builder.build_serving_module(
            self.max_seq_len,
            trainer_module.reward_net,
            normalization_data_map[NormalizationKey.STATE],
            action_normalization_data=action_normalization_data,
            discrete_action_names=self.discrete_action_names,
        )
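
# A minimal usage sketch for the SyntheticReward manager above, for a discrete-action
# problem. The action names and the `state_normalization_data` value are illustrative
# assumptions; in practice the normalization map is produced by the data module rather
# than constructed by hand.
manager = SyntheticReward(
    discrete_action_names=["up", "down"],
    max_seq_len=5,
)
normalization_data_map = {NormalizationKey.STATE: state_normalization_data}
trainer = manager.build_trainer(
    normalization_data_map=normalization_data_map,
    use_gpu=False,
)
# After training, export a TorchScript-servable predictor:
serving_module = manager.build_serving_module(trainer, normalization_data_map)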
class SyntheticReward(ModelManager):
    """
    Train models to attribute single-step rewards from sparse/delayed/aggregated rewards.
    Ideas from:
    1. Synthetic Returns for Long-Term Credit Assignment: https://arxiv.org/pdf/2102.12425.pdf
    2. RUDDER: Return Decomposition for Delayed Rewards: https://arxiv.org/pdf/1806.07857.pdf
    3. Optimizing Agent Behavior over Long Time Scales by Transporting Value: https://arxiv.org/pdf/1810.06721.pdf
    4. Sequence Modeling of Temporal Credit Assignment for Episodic Reinforcement Learning: https://arxiv.org/pdf/1905.13420.pdf
    """

    __hash__ = param_hash

    trainer_param: RewardNetworkTrainerParameters = field(
        default_factory=RewardNetworkTrainerParameters
    )
    net_builder: SyntheticRewardNetBuilder__Union = field(
        # pyre-fixme[28]: Unexpected keyword argument `SingleStepSyntheticReward`.
        default_factory=lambda: SyntheticRewardNetBuilder__Union(
            SingleStepSyntheticReward=SingleStepSyntheticReward()
        )
    )
    eval_parameters: EvaluationParameters = field(default_factory=EvaluationParameters)
    state_preprocessing_options: Optional[PreprocessingOptions] = None
    action_preprocessing_options: Optional[PreprocessingOptions] = None
    state_float_features: Optional[List[Tuple[int, str]]] = None
    parametric_action_float_features: Optional[List[Tuple[int, str]]] = None
    discrete_action_names: Optional[List[str]] = None
    # max sequence length to look back to distribute rewards
    max_seq_len: int = 5

    def __post_init_post_parse__(self):
        super().__post_init_post_parse__()
        assert self.max_seq_len is not None and self.max_seq_len > 0
        assert (
            self.state_preprocessing_options is None
            or self.state_preprocessing_options.allowedlist_features is None
        ), (
            "Please set state allowlist features in the state_float_features field "
            "of the config instead"
        )
        if not self.action_preprocessing_options:
            assert (
                type(self.discrete_action_names) is list
                and len(self.discrete_action_names) > 1
            ), (
                "Assuming this is a discrete-action problem because no "
                "action_preprocessing_options is specified, you need to specify "
                f"at least 2 actions. Got {self.discrete_action_names}."
            )
        else:
            assert not self.discrete_action_names, (
                "If this is a parametric-action problem, please specify "
                "action_preprocessing_options and parametric_action_float_features, "
                "and do not specify discrete_action_names"
            )
            assert self.action_preprocessing_options.allowedlist_features is None, (
                "Please set action allowlist features in the "
                "parametric_action_float_features field of the config instead"
            )

    @property
    def should_generate_eval_dataset(self) -> bool:
        raise RuntimeError

    @property
    def state_feature_config(self) -> rlt.ModelFeatureConfig:
        return get_feature_config(self.state_float_features)

    @property
    def action_feature_config(self) -> rlt.ModelFeatureConfig:
        return get_feature_config(self.parametric_action_float_features)

    def run_feature_identification(
        self, input_table_spec: TableSpec
    ) -> Dict[str, NormalizationData]:
        raise RuntimeError

    def get_data_module(
        self,
        *,
        input_table_spec: Optional[TableSpec] = None,
        reward_options: Optional[RewardOptions] = None,
        reader_options: Optional[ReaderOptions] = None,
        setup_data: Optional[Dict[str, bytes]] = None,
        saved_setup_data: Optional[Dict[str, bytes]] = None,
        resource_options: Optional[ResourceOptions] = None,
    ) -> Optional[ReAgentDataModule]:
        return SyntheticRewardDataModule(
            input_table_spec=input_table_spec,
            reward_options=reward_options,
            setup_data=setup_data,
            saved_setup_data=saved_setup_data,
            reader_options=reader_options,
            resource_options=resource_options,
            model_manager=self,
        )

    @property
    def required_normalization_keys(self) -> List[str]:
        raise RuntimeError

    def build_trainer(self, use_gpu: bool) -> RewardNetTrainer:
        net_builder = self.net_builder.value
        synthetic_reward_network = net_builder.build_synthetic_reward_network(
            self.state_normalization_data,
            action_normalization_data=self.action_normalization_data,
            discrete_action_names=self.discrete_action_names,
        )
        if use_gpu:
            synthetic_reward_network = synthetic_reward_network.cuda()
        # pyre-fixme[16]: `SyntheticReward` has no attribute
        #  `_synthetic_reward_network`.
        self._synthetic_reward_network = synthetic_reward_network
        trainer = RewardNetTrainer(
            self._synthetic_reward_network,
            # pyre-fixme[16]: `RewardNetworkTrainerParameters` has no attribute
            #  `asdict`.
            **self.trainer_param.asdict(),
        )
        return trainer

    def get_reporter(self):
        return RewardNetworkReporter(
            self.trainer_param.loss_type,
            str(self.net_builder.value),
        )

    def build_serving_module(self) -> torch.nn.Module:
        """
        Returns a TorchScript predictor module.
        """
        assert (
            self._synthetic_reward_network is not None
        ), "_synthetic_reward_network was not initialized"
        net_builder = self.net_builder.value
        return net_builder.build_serving_module(
            self._synthetic_reward_network,
            self.state_normalization_data,
            action_names=self.discrete_action_names,
            state_feature_config=self.state_feature_config,
        )
def test_single_step_synthetic_reward_net_builder_continuous_actions(self):
    builder = SyntheticRewardNetBuilder__Union(
        SingleStepSyntheticReward=SingleStepSyntheticReward()
    ).value
    self._test_synthetic_reward_net_builder_continuous_actions(builder)
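
# The shared helper invoked above is not defined in this snippet. Based on the inline
# continuous-actions test earlier in this section, a plausible sketch (not the actual
# helper) is:
def _test_synthetic_reward_net_builder_continuous_actions(self, builder) -> None:
    state_normalization_data = _create_norm(STATE_DIM)
    action_normalization_data = _create_norm(ACTION_DIM, offset=STATE_DIM)
    reward_net = builder.build_synthetic_reward_network(
        state_normalization_data,
        action_normalization_data=action_normalization_data,
    )
    output = reward_net(_create_input()).predicted_reward
    assert output.shape == (BATCH_SIZE, 1)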