def test_linear_reward_parametric_reward(self):
    """Reward at each step is a linear function of state and action.

    However, we can only observe aggregated reward at the last step.
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 10000

    # Two-hidden-layer MLP scoring a single (state, action) step.
    reward_net = SingleStepSyntheticRewardNet(
        state_dim=state_dim,
        action_dim=action_dim,
        sizes=[256, 128],
        activations=["relu", "relu"],
        last_layer_activation="linear",
    )
    optimizer = Optimizer__Union(SGD=classes["SGD"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(trainer.loss_type, str(reward_net))
    )

    weight, data = create_data(
        state_dim, action_dim, seq_len, batch_size, num_batches
    )
    avg_eval_loss = train_and_eval(trainer, data)
    # The net should recover the linear ground-truth reward closely.
    threshold = 0.1
    assert avg_eval_loss < threshold
def test_lstm_parametric_reward(self):
    """Reward at each step is a linear function of states and actions
    in a context window around the step.

    However, we can only observe aggregated reward at the last step.
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 5000

    # Bidirectional LSTM so each step's prediction can use its context window.
    reward_net = synthetic_reward.SequenceSyntheticRewardNet(
        state_dim=state_dim,
        action_dim=action_dim,
        lstm_hidden_size=128,
        lstm_num_layers=2,
        lstm_bidirectional=True,
        last_layer_activation="linear",
    )
    optimizer = Optimizer__Union(Adam=classes["Adam"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(trainer.loss_type, str(reward_net))
    )

    weight, data = create_sequence_data(
        state_dim, action_dim, seq_len, batch_size, num_batches
    )
    avg_eval_loss = train_and_eval(trainer, data)
    threshold = 0.2
    assert avg_eval_loss < threshold
def test_linear_reward_parametric_reward(self):
    """Reward at each step is a linear function of state and action.

    However, we can only observe aggregated reward at the last step.
    Trains batch-by-batch and succeeds as soon as one batch's training
    loss drops below the threshold.
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 10000
    sizes = [256, 128]
    activations = ["relu", "relu"]
    last_layer_activation = "linear"
    reward_net = SingleStepSyntheticRewardNet(
        state_dim=state_dim,
        action_dim=action_dim,
        sizes=sizes,
        activations=activations,
        last_layer_activation=last_layer_activation,
    )
    optimizer = Optimizer__Union(SGD=classes["SGD"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    weight, data_generator = create_data(
        state_dim, action_dim, seq_len, batch_size, num_batches
    )
    threshold = 0.1
    # Initialize `loss` so the assertion message below is well-defined even
    # if the generator yields no batches (previously a NameError risk that
    # would mask the real failure).
    loss = None
    reach_threshold = False
    for batch in data_generator():
        loss = trainer.train(batch)
        if loss < threshold:
            reach_threshold = True
            break
    assert reach_threshold, f"last loss={loss}"
def test_ngram_fc_parametric_reward(self):
    """Reward at each step is a linear function of states and actions
    in a context window around the step.

    However, we can only observe aggregated reward at the last step.
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 5000
    sizes = [256, 128]
    activations = ["relu", "relu"]
    last_layer_activation = "linear"
    # N-gram fully-connected net with a 3-step context window.
    reward_net = synthetic_reward.NGramSyntheticRewardNet(
        state_dim=state_dim,
        action_dim=action_dim,
        sizes=sizes,
        activations=activations,
        last_layer_activation=last_layer_activation,
        context_size=3,
    )
    optimizer = Optimizer__Union(Adam=classes["Adam"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(
            trainer.loss_type,
            str(reward_net),
        )
    )
    # Use context-window (multi-step) ground-truth rewards, matching the
    # docstring and the other context-window tests (LSTM, conv, transformer);
    # create_data generates single-step linear rewards instead.
    weight, data = create_sequence_data(
        state_dim, action_dim, seq_len, batch_size, num_batches
    )
    threshold = 0.2
    avg_eval_loss = train_and_eval(trainer, data)
    assert avg_eval_loss < threshold
def test_ngram_conv_net_parametric_reward(self):
    """Reward at each step is a linear function of states and actions
    in a context window around the step.

    However, we can only observe aggregated reward at the last step.
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 5000

    conv_net_params = rlp.ConvNetParameters(
        conv_dims=[128],
        conv_height_kernels=[1],
        pool_types=["max"],
        pool_kernel_sizes=[1],
    )
    # Convolutional scorer over each 3-step context window.
    conv_net = synthetic_reward.NGramConvolutionalNetwork(
        state_dim=state_dim,
        action_dim=action_dim,
        sizes=[128, 64],
        activations=["relu", "relu"],
        last_layer_activation="linear",
        context_size=3,
        conv_net_params=conv_net_params,
    )
    reward_net = synthetic_reward.NGramSyntheticRewardNet(
        state_dim=state_dim,
        action_dim=action_dim,
        context_size=3,
        net=conv_net,
    )
    optimizer = Optimizer__Union(Adam=classes["Adam"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(trainer.loss_type, str(reward_net))
    )

    weight, data = create_sequence_data(
        state_dim, action_dim, seq_len, batch_size, num_batches
    )
    avg_eval_loss = train_and_eval(trainer, data)
    threshold = 0.2
    assert avg_eval_loss < threshold
def test_transformer_parametric_reward(self):
    """Reward at each step is a linear function of states and actions
    in a context window around the step.

    However, we can only observe aggregated reward at the last step.
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 10000

    # Transformer encoder; max_len covers the sequence plus one extra slot.
    max_len = seq_len + 1
    transformer = TransformerSyntheticRewardNet(
        state_dim=state_dim,
        action_dim=action_dim,
        d_model=64,
        nhead=8,
        num_encoder_layers=1,
        dim_feedforward=64,
        dropout=0.0,
        activation="relu",
        last_layer_activation="linear",
        layer_norm_eps=1e-5,
        max_len=max_len,
    )
    reward_net = SyntheticRewardNet(transformer)
    optimizer = Optimizer__Union(Adam=classes["Adam"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(trainer.loss_type, str(reward_net))
    )

    weight, data = create_sequence_data(
        state_dim, action_dim, seq_len, batch_size, num_batches
    )
    avg_eval_loss = train_and_eval(trainer, data)
    threshold = 0.25
    assert avg_eval_loss < threshold, (
        "loss = {:.4f} larger than threshold {}".format(avg_eval_loss, threshold)
    )
def _test_linear_reward_parametric_reward(
    self, ground_truth_reward_from_multiple_steps=False
):
    """Reward at each step is a linear function of present state and action.

    However, we can only observe aggregated reward at the last step.

    This model will fail to learn when ground-truth reward is a function
    of multiple steps' states and actions.
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 5000

    single_step_net = SingleStepSyntheticRewardNet(
        state_dim=state_dim,
        action_dim=action_dim,
        sizes=[256, 128],
        activations=["relu", "relu"],
        last_layer_activation="linear",
    )
    reward_net = SyntheticRewardNet(single_step_net)
    optimizer = Optimizer__Union(Adam=classes["Adam"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(trainer.loss_type, str(reward_net))
    )

    # Pick the ground-truth generator: multi-step (context-window) rewards
    # vs. single-step linear rewards.
    make_data = (
        create_sequence_data
        if ground_truth_reward_from_multiple_steps
        else create_data
    )
    weight, data = make_data(state_dim, action_dim, seq_len, batch_size, num_batches)

    return train_and_eval(trainer, data)
def build_trainer(self, use_gpu: bool) -> RewardNetTrainer:
    """Construct a RewardNetTrainer around a freshly built synthetic reward net.

    The built network is also stashed on `self` so later stages (e.g. serving
    module construction) can reuse it.
    """
    builder = self.net_builder.value
    # pyre-fixme[16]: `SyntheticReward` has no attribute `_synthetic_reward_network`.
    self._synthetic_reward_network = builder.build_synthetic_reward_network(
        self.state_normalization_data,
        action_normalization_data=self.action_normalization_data,
        discrete_action_names=self.discrete_action_names,
    )
    # pyre-fixme[16]: `RewardNetworkTrainerParameters` has no attribute `asdict`.
    return RewardNetTrainer(
        self._synthetic_reward_network,
        **self.trainer_param.asdict(),
    )
def build_trainer(
    self,
    normalization_data_map: Dict[str, NormalizationData],
    use_gpu: bool,
    reward_options: Optional[RewardOptions] = None,
) -> RewardNetTrainer:
    """Construct a RewardNetTrainer from normalization data.

    Action normalization is only supplied for continuous actions (i.e. when
    no discrete action names are configured).
    """
    builder = self.net_builder.value
    action_normalization_data = (
        None
        if self.discrete_action_names
        else normalization_data_map[NormalizationKey.ACTION]
    )
    synthetic_reward_network = builder.build_synthetic_reward_network(
        normalization_data_map[NormalizationKey.STATE],
        action_normalization_data=action_normalization_data,
        discrete_action_names=self.discrete_action_names,
    )
    # pyre-fixme[16]: `RewardNetworkTrainerParameters` has no attribute `asdict`.
    return RewardNetTrainer(
        synthetic_reward_network,
        **self.trainer_param.asdict(),
    )