def test_linear_reward_parametric_reward(self):
    """
    Reward at each step is a linear function of state and action.
    However, we can only observe aggregated reward at the last step.
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 10000
    sizes = [256, 128]
    activations = ["relu", "relu"]
    last_layer_activation = "linear"
    reward_net = SingleStepSyntheticRewardNet(
        state_dim=state_dim,
        action_dim=action_dim,
        sizes=sizes,
        activations=activations,
        last_layer_activation=last_layer_activation,
    )
    optimizer = Optimizer__Union(SGD=classes["SGD"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    weight, data_generator = create_data(
        state_dim, action_dim, seq_len, batch_size, num_batches
    )
    threshold = 0.1
    reach_threshold = False
    # train until the loss drops below the threshold or we run out of batches
    for batch in data_generator():
        loss = trainer.train(batch)
        if loss < threshold:
            reach_threshold = True
            break
    assert reach_threshold, f"last loss={loss}"
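# `create_data` is not shown in this file. Below is a minimal sketch of what
# it plausibly does, assuming the ground-truth reward is linear in the
# concatenated (state, action) features and only the per-sequence sum is
# observed; the batch layout is an assumption, not ReAgent's actual helper.
import torch

def create_data_sketch(state_dim, action_dim, seq_len, batch_size, num_batches):
    weight = torch.randn(state_dim + action_dim)  # hypothetical ground truth

    def data_generator():
        for _ in range(num_batches):
            state = torch.randn(seq_len, batch_size, state_dim)
            action = torch.randn(seq_len, batch_size, action_dim)
            # per-step linear reward, then aggregate over the sequence
            step_reward = torch.einsum(
                "sbf,f->sb", torch.cat((state, action), dim=2), weight
            )
            reward = step_reward.sum(dim=0).unsqueeze(1)  # (batch_size, 1)
            yield state, action, reward

    return weight, data_generator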
def test_linear_reward_parametric_reward(self):
    """
    Reward at each step is a linear function of state and action.
    However, we can only observe aggregated reward at the last step.
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 10000
    sizes = [256, 128]
    activations = ["relu", "relu"]
    last_layer_activation = "linear"
    reward_net = SingleStepSyntheticRewardNet(
        state_dim=state_dim,
        action_dim=action_dim,
        sizes=sizes,
        activations=activations,
        last_layer_activation=last_layer_activation,
    )
    optimizer = Optimizer__Union(SGD=classes["SGD"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(
            trainer.loss_type,
            str(reward_net),
        )
    )
    weight, data = create_data(
        state_dim, action_dim, seq_len, batch_size, num_batches
    )
    threshold = 0.1
    avg_eval_loss = train_and_eval(trainer, data)
    assert avg_eval_loss < threshold
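# `train_and_eval` is also external to this file. A hedged sketch, assuming
# `data` is an iterable of (state, action, reward) batches (e.g. the output
# of `data_generator()` in the sketch above) and that the trainer exposes the
# reward net it was constructed with; the eval split size is made up.
import torch
import torch.nn.functional as F

def train_and_eval_sketch(trainer, data, num_eval_batches=100):
    batches = list(data)
    for batch in batches[:-num_eval_batches]:
        trainer.train(batch)
    # measure MSE between predicted and observed aggregated rewards
    eval_losses = []
    with torch.no_grad():
        for state, action, reward in batches[-num_eval_batches:]:
            pred = trainer.reward_net(state, action)  # attribute name assumed
            eval_losses.append(F.mse_loss(pred, reward).item())
    return sum(eval_losses) / len(eval_losses)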
def test_lstm_parametric_reward(self):
    """
    Reward at each step is a linear function of states and actions in a
    context window around the step.
    However, we can only observe aggregated reward at the last step.
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 5000
    last_layer_activation = "linear"
    reward_net = synthetic_reward.SequenceSyntheticRewardNet(
        state_dim=state_dim,
        action_dim=action_dim,
        lstm_hidden_size=128,
        lstm_num_layers=2,
        lstm_bidirectional=True,
        last_layer_activation=last_layer_activation,
    )
    optimizer = Optimizer__Union(Adam=classes["Adam"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(
            trainer.loss_type,
            str(reward_net),
        )
    )
    weight, data = create_sequence_data(
        state_dim, action_dim, seq_len, batch_size, num_batches
    )
    threshold = 0.2
    avg_eval_loss = train_and_eval(trainer, data)
    assert avg_eval_loss < threshold
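# `create_sequence_data` is not defined here either. A rough sketch under the
# assumption that the ground-truth reward at step t is linear in the
# (state, action) features of steps t-1, t, and t+1 (zero-padded at the
# sequence ends), with only the per-sequence sum observed:
import torch
import torch.nn.functional as F

def create_sequence_data_sketch(
    state_dim, action_dim, seq_len, batch_size, num_batches
):
    feature_dim = state_dim + action_dim
    # one hypothetical weight vector per position in the 3-step window
    weights = torch.randn(3, feature_dim)

    def data_generator():
        for _ in range(num_batches):
            state = torch.randn(seq_len, batch_size, state_dim)
            action = torch.randn(seq_len, batch_size, action_dim)
            feats = torch.cat((state, action), dim=2)
            padded = F.pad(feats, (0, 0, 0, 0, 1, 1))  # pad the sequence dim
            # step_reward[t] = w0 . f[t-1] + w1 . f[t] + w2 . f[t+1]
            step_reward = sum(
                torch.einsum("sbf,f->sb", padded[k : k + seq_len], weights[k])
                for k in range(3)
            )
            reward = step_reward.sum(dim=0).unsqueeze(1)
            yield state, action, reward

    return weights, data_generator()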
def test_ngram_fc_parametric_reward(self):
    """
    Reward at each step is a linear function of states and actions in a
    context window around the step.
    However, we can only observe aggregated reward at the last step.
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 5000
    sizes = [256, 128]
    activations = ["relu", "relu"]
    last_layer_activation = "linear"
    reward_net = synthetic_reward.NGramSyntheticRewardNet(
        state_dim=state_dim,
        action_dim=action_dim,
        sizes=sizes,
        activations=activations,
        last_layer_activation=last_layer_activation,
        context_size=3,
    )
    optimizer = Optimizer__Union(Adam=classes["Adam"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(
            trainer.loss_type,
            str(reward_net),
        )
    )
    weight, data = create_data(
        state_dim, action_dim, seq_len, batch_size, num_batches
    )
    threshold = 0.2
    avg_eval_loss = train_and_eval(trainer, data)
    assert avg_eval_loss < threshold
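# For intuition, a compact illustration of the n-gram input construction such
# a network plausibly applies before its fully connected layers (the real
# ReAgent implementation may differ): each step is scored on the concatenated
# (state, action) features of a context_size window centered on it.
import torch
import torch.nn.functional as F

def ngram_windows(state, action, context_size=3):
    # state: (seq_len, batch, state_dim); action: (seq_len, batch, action_dim)
    feats = torch.cat((state, action), dim=2)
    pad = context_size // 2
    padded = F.pad(feats, (0, 0, 0, 0, pad, pad))  # zero-pad sequence ends
    seq_len, batch_size = state.shape[0], state.shape[1]
    windows = torch.stack(
        [padded[t : t + context_size] for t in range(seq_len)]
    )  # (seq_len, context_size, batch, feature_dim)
    return windows.permute(0, 2, 1, 3).reshape(seq_len, batch_size, -1)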
def create_trainer(
    seq2slate_net,
    learning_rate,
    seq2slate_params,
    policy_gradient_interval,
):
    return Seq2SlateTrainer(
        seq2slate_net=seq2slate_net,
        params=seq2slate_params,
        policy_optimizer=Optimizer__Union(SGD=classes["SGD"](lr=learning_rate)),
        policy_gradient_interval=policy_gradient_interval,
        print_interval=1,
    )
def test_ngram_conv_net_parametric_reward(self):
    """
    Reward at each step is a linear function of states and actions in a
    context window around the step.
    However, we can only observe aggregated reward at the last step.
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 5000
    sizes = [128, 64]
    activations = ["relu", "relu"]
    last_layer_activation = "linear"
    conv_net_params = rlp.ConvNetParameters(
        conv_dims=[128],
        conv_height_kernels=[1],
        pool_types=["max"],
        pool_kernel_sizes=[1],
    )
    conv_net = synthetic_reward.NGramConvolutionalNetwork(
        state_dim=state_dim,
        action_dim=action_dim,
        sizes=sizes,
        activations=activations,
        last_layer_activation=last_layer_activation,
        context_size=3,
        conv_net_params=conv_net_params,
    )
    reward_net = synthetic_reward.NGramSyntheticRewardNet(
        state_dim=state_dim,
        action_dim=action_dim,
        context_size=3,
        net=conv_net,
    )
    optimizer = Optimizer__Union(Adam=classes["Adam"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(
            trainer.loss_type,
            str(reward_net),
        )
    )
    weight, data = create_sequence_data(
        state_dim, action_dim, seq_len, batch_size, num_batches
    )
    threshold = 0.2
    avg_eval_loss = train_and_eval(trainer, data)
    assert avg_eval_loss < threshold
def test_transformer_parametric_reward(self):
    """
    Reward at each step is a linear function of states and actions in a
    context window around the step.
    However, we can only observe aggregated reward at the last step.
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 10000
    d_model = 64
    nhead = 8
    num_encoder_layers = 1
    dim_feedforward = 64
    last_layer_activation = "linear"
    max_len = seq_len + 1
    reward_net = SyntheticRewardNet(
        TransformerSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=0.0,
            activation="relu",
            last_layer_activation=last_layer_activation,
            layer_norm_eps=1e-5,
            max_len=max_len,
        )
    )
    optimizer = Optimizer__Union(Adam=classes["Adam"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(
            trainer.loss_type,
            str(reward_net),
        )
    )
    weight, data = create_sequence_data(
        state_dim, action_dim, seq_len, batch_size, num_batches
    )
    threshold = 0.25
    avg_eval_loss = train_and_eval(trainer, data)
    assert (
        avg_eval_loss < threshold
    ), "loss = {:.4f} larger than threshold {}".format(avg_eval_loss, threshold)
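# A stripped-down, self-contained analogue of the transformer reward model
# above, to make the shape flow concrete (illustrative only, not ReAgent's
# TransformerSyntheticRewardNet; positional encoding and max_len handling are
# omitted):
import torch
import torch.nn as nn

class TinyTransformerReward(nn.Module):
    def __init__(self, state_dim, action_dim, d_model=64, nhead=8,
                 num_encoder_layers=1, dim_feedforward=64):
        super().__init__()
        self.embed = nn.Linear(state_dim + action_dim, d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=0.0,
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_encoder_layers)
        self.head = nn.Linear(d_model, 1)

    def forward(self, state, action):
        # state: (seq_len, batch, state_dim); action: (seq_len, batch, action_dim)
        x = self.embed(torch.cat((state, action), dim=2))
        x = self.encoder(x)            # (seq_len, batch, d_model)
        step_reward = self.head(x)     # per-step scalar reward
        return step_reward.sum(dim=0)  # aggregated reward, (batch, 1)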
def create_trainer(
    seq2slate_net,
    batch_size,
    learning_rate,
    device,
    seq2slate_params,
    policy_gradient_interval,
):
    use_gpu = device != torch.device("cpu")
    return Seq2SlateTrainer(
        seq2slate_net=seq2slate_net,
        minibatch_size=batch_size,
        parameters=seq2slate_params,
        policy_optimizer=Optimizer__Union(SGD=classes["SGD"](lr=learning_rate)),
        use_gpu=use_gpu,
        policy_gradient_interval=policy_gradient_interval,
        print_interval=1,
    )
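# Hypothetical call site for the helper above; `seq2slate_net` and
# `seq2slate_params` are assumed to be built elsewhere in the test setup, and
# the values are placeholders.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
trainer = create_trainer(
    seq2slate_net=seq2slate_net,
    batch_size=4096,
    learning_rate=0.1,
    device=device,
    seq2slate_params=seq2slate_params,
    policy_gradient_interval=1,
)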
def _test_linear_reward_parametric_reward(
    self, ground_truth_reward_from_multiple_steps=False
):
    """
    Reward at each step is a linear function of the present state and action.
    However, we can only observe aggregated reward at the last step.

    This model will fail to learn when the ground-truth reward is a function
    of multiple steps' states and actions.
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 5000
    sizes = [256, 128]
    activations = ["relu", "relu"]
    last_layer_activation = "linear"
    reward_net = SyntheticRewardNet(
        SingleStepSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=sizes,
            activations=activations,
            last_layer_activation=last_layer_activation,
        )
    )
    optimizer = Optimizer__Union(Adam=classes["Adam"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(
            trainer.loss_type,
            str(reward_net),
        )
    )
    if ground_truth_reward_from_multiple_steps:
        weight, data = create_sequence_data(
            state_dim, action_dim, seq_len, batch_size, num_batches
        )
    else:
        weight, data = create_data(
            state_dim, action_dim, seq_len, batch_size, num_batches
        )
    avg_eval_loss = train_and_eval(trainer, data)
    return avg_eval_loss
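# Plausible call sites for the helper above (illustrative; the threshold
# mirrors the single-step test earlier in this file):
def test_single_step_reward_model(self):
    avg_eval_loss = self._test_linear_reward_parametric_reward(
        ground_truth_reward_from_multiple_steps=False
    )
    assert avg_eval_loss < 0.1

def test_single_step_reward_model_on_multi_step_reward(self):
    # a single-step model cannot represent a multi-step ground truth,
    # so the eval loss should stay well above the threshold
    avg_eval_loss = self._test_linear_reward_parametric_reward(
        ground_truth_reward_from_multiple_steps=True
    )
    assert avg_eval_loss > 0.1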