def test_lstm_synthetic_reward(self):
    state_dim = 10
    action_dim = 2
    last_layer_activation = "leaky_relu"
    net = SequenceSyntheticRewardNet(
        state_dim=state_dim,
        action_dim=action_dim,
        lstm_hidden_size=128,
        lstm_num_layers=2,
        lstm_bidirectional=True,
        last_layer_activation=last_layer_activation,
    )
    reward_net = SyntheticRewardNet(net)

    lstm = reward_net.export_mlp().lstm
    assert lstm.bidirectional
    assert lstm.input_size == 12
    assert lstm.hidden_size == 128
    assert lstm.num_layers == 2

    dnn = reward_net.export_mlp().fc_out
    assert dnn.in_features == 128 * 2
    assert dnn.out_features == 1

    output_activation = reward_net.export_mlp().output_activation
    assert output_activation._get_name() == "LeakyReLU"
def test_ngram_conv_net_synthetic_reward(self):
    state_dim = 10
    action_dim = 2
    sizes = [256, 128]
    activations = ["sigmoid", "relu"]
    last_layer_activation = "leaky_relu"
    context_size = 3
    conv_net_params = rlp.ConvNetParameters(
        conv_dims=[256, 128],
        conv_height_kernels=[1, 1],
        pool_types=["max", "max"],
        pool_kernel_sizes=[1, 1],
    )
    net = NGramConvolutionalNetwork(
        state_dim=state_dim,
        action_dim=action_dim,
        sizes=sizes,
        activations=activations,
        last_layer_activation=last_layer_activation,
        context_size=context_size,
        conv_net_params=conv_net_params,
    )
    reward_net = SyntheticRewardNet(net)

    conv_net = reward_net.export_mlp().conv_net
    assert conv_net.conv_dims == [1, 256, 128]
    assert conv_net.conv_height_kernels == [1, 1]
    assert conv_net.conv_width_kernels == [12, 1]

    assert conv_net.conv_layers[0].in_channels == 1
    assert conv_net.conv_layers[0].out_channels == 256
    assert conv_net.conv_layers[0].kernel_size == (1, 12)
    assert conv_net.conv_layers[0].stride == (1, 1)
    assert conv_net.conv_layers[1].in_channels == 256
    assert conv_net.conv_layers[1].out_channels == 128
    assert conv_net.conv_layers[1].kernel_size == (1, 1)
    assert conv_net.conv_layers[1].stride == (1, 1)

    dnn = reward_net.export_mlp().conv_net.feed_forward.dnn
    assert dnn[0].in_features == 384
    assert dnn[0].out_features == 256
    assert dnn[1]._get_name() == "Sigmoid"
    assert dnn[2].in_features == 256
    assert dnn[2].out_features == 128
    assert dnn[3]._get_name() == "ReLU"
    assert dnn[4].in_features == 128
    assert dnn[4].out_features == 1
    assert dnn[5]._get_name() == "LeakyReLU"
def build_synthetic_reward_network(
    self,
    state_normalization_data: NormalizationData,
    action_normalization_data: Optional[NormalizationData] = None,
    discrete_action_names: Optional[List[str]] = None,
) -> ModelBase:
    state_dim = get_num_output_features(
        state_normalization_data.dense_normalization_parameters
    )
    if not discrete_action_names:
        assert action_normalization_data is not None
        action_dim = get_num_output_features(
            action_normalization_data.dense_normalization_parameters
        )
    else:
        action_dim = len(discrete_action_names)
    net = NGramConvolutionalNetwork(
        state_dim=state_dim,
        action_dim=action_dim,
        sizes=self.sizes,
        activations=self.activations,
        last_layer_activation=self.last_layer_activation,
        context_size=self.context_size,
        conv_net_params=self.conv_net_params,
        use_layer_norm=self.use_layer_norm,
    )
    return SyntheticRewardNet(net)
def build_synthetic_reward_network(
    self,
    state_normalization_data: NormalizationData,
    action_normalization_data: Optional[NormalizationData] = None,
    discrete_action_names: Optional[List[str]] = None,
) -> ModelBase:
    state_dim = get_num_output_features(
        state_normalization_data.dense_normalization_parameters
    )
    if not discrete_action_names:
        assert action_normalization_data is not None
        action_dim = get_num_output_features(
            action_normalization_data.dense_normalization_parameters
        )
    else:
        action_dim = len(discrete_action_names)
    net = TransformerSyntheticRewardNet(
        state_dim=state_dim,
        action_dim=action_dim,
        d_model=self.d_model,
        nhead=self.nhead,
        num_encoder_layers=self.num_encoder_layers,
        dim_feedforward=self.dim_feedforward,
        dropout=self.dropout,
        activation=self.activation,
        last_layer_activation=self.last_layer_activation,
        layer_norm_eps=self.layer_norm_eps,
        max_len=self.max_len,
    )
    return SyntheticRewardNet(net=net)
def test_lstm_parametric_reward(self):
    """
    Reward at each step is a linear function of the states and actions in a
    context window around the step. However, we can only observe the
    aggregated reward at the last step.
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 5000
    last_layer_activation = "linear"
    reward_net = SyntheticRewardNet(
        SequenceSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            lstm_hidden_size=128,
            lstm_num_layers=2,
            lstm_bidirectional=True,
            last_layer_activation=last_layer_activation,
        )
    )
    optimizer = Optimizer__Union(Adam=classes["Adam"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(
            trainer.loss_type,
            str(reward_net),
        )
    )
    weight, data = create_sequence_data(
        state_dim, action_dim, seq_len, batch_size, num_batches
    )

    threshold = 0.2
    avg_eval_loss = train_and_eval(trainer, data)
    assert avg_eval_loss < threshold
def test_transformer_synthetic_reward(self):
    state_dim = 10
    action_dim = 2
    d_model = 64
    nhead = 8
    num_encoder_layers = 2
    dim_feedforward = 64
    dropout = 0.0
    activation = "relu"
    last_layer_activation = "leaky_relu"
    layer_norm_eps = 1e-5
    max_len = 10
    net = TransformerSyntheticRewardNet(
        state_dim=state_dim,
        action_dim=action_dim,
        d_model=d_model,
        nhead=nhead,
        num_encoder_layers=num_encoder_layers,
        dim_feedforward=dim_feedforward,
        dropout=dropout,
        activation=activation,
        last_layer_activation=last_layer_activation,
        layer_norm_eps=layer_norm_eps,
        max_len=max_len,
    )
    reward_net = SyntheticRewardNet(net)

    export_net = reward_net.export_mlp()
    transformer = export_net.transformer
    assert export_net.state_dim == state_dim
    assert export_net.action_dim == action_dim
    assert export_net.d_model == d_model
    assert export_net.nhead == nhead
    assert export_net.dim_feedforward == dim_feedforward
    assert export_net.dropout == dropout
    assert export_net.activation == activation
    assert export_net.layer_norm_eps == layer_norm_eps
    assert transformer.num_layers == num_encoder_layers

    dnn_out = export_net.fc_out
    assert dnn_out.in_features == d_model
    assert dnn_out.out_features == 1

    output_activation = export_net.output_activation
    assert output_activation._get_name() == "LeakyReLU"
def test_ngram_fc_synthetic_reward(self):
    state_dim = 10
    action_dim = 2
    sizes = [256, 128]
    activations = ["sigmoid", "relu"]
    last_layer_activation = "leaky_relu"
    context_size = 3
    net = NGramFullyConnectedNetwork(
        state_dim=state_dim,
        action_dim=action_dim,
        sizes=sizes,
        activations=activations,
        last_layer_activation=last_layer_activation,
        context_size=context_size,
    )
    reward_net = SyntheticRewardNet(net)

    dnn = reward_net.export_mlp().fc.dnn
    assert dnn[0].in_features == (state_dim + action_dim) * context_size
    assert dnn[0].out_features == 256
    assert dnn[1]._get_name() == "Sigmoid"
    assert dnn[2].in_features == 256
    assert dnn[2].out_features == 128
    assert dnn[3]._get_name() == "ReLU"
    assert dnn[4].in_features == 128
    assert dnn[4].out_features == 1
    assert dnn[5]._get_name() == "LeakyReLU"

    valid_step = torch.tensor([[1], [2], [3]])
    batch_size = 3
    seq_len = 4
    mask = _gen_mask(valid_step, batch_size, seq_len)
    assert torch.all(
        mask
        == torch.tensor(
            [[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 1.0], [0.0, 1.0, 1.0, 1.0]]
        )
    )
def test_single_step_synthetic_reward(self):
    state_dim = 10
    action_dim = 2
    sizes = [256, 128]
    activations = ["sigmoid", "relu"]
    last_layer_activation = "leaky_relu"
    reward_net = SyntheticRewardNet(
        SingleStepSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=sizes,
            activations=activations,
            last_layer_activation=last_layer_activation,
        )
    )
    dnn = reward_net.export_mlp().dnn
    # dnn[0] is a concat layer
    assert dnn[1].in_features == state_dim + action_dim
    assert dnn[1].out_features == 256
    assert dnn[2]._get_name() == "Sigmoid"
    assert dnn[3].in_features == 256
    assert dnn[3].out_features == 128
    assert dnn[4]._get_name() == "ReLU"
    assert dnn[5].in_features == 128
    assert dnn[5].out_features == 1
    assert dnn[6]._get_name() == "LeakyReLU"

    valid_step = torch.tensor([[1], [2], [3]])
    batch_size = 3
    seq_len = 4
    mask = _gen_mask(valid_step, batch_size, seq_len)
    assert torch.all(
        mask
        == torch.tensor(
            [[0.0, 0.0, 0.0, 1.0], [0.0, 0.0, 1.0, 1.0], [0.0, 1.0, 1.0, 1.0]]
        )
    )
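# A minimal sketch of how a mask consistent with the values asserted in the two tests
# above could be built. This is a hypothetical re-implementation for illustration, not
# the library's actual _gen_mask: position j of sequence i is marked valid iff it falls
# within the last valid_step[i] steps of the sequence.
def _gen_mask_sketch(valid_step: torch.Tensor, batch_size: int, seq_len: int) -> torch.Tensor:
    positions = torch.arange(seq_len).repeat(batch_size, 1)  # (batch_size, seq_len)
    # valid_step has shape (batch_size, 1) and broadcasts across the seq_len dimension.
    return (positions >= seq_len - valid_step).float()

# _gen_mask_sketch(torch.tensor([[1], [2], [3]]), 3, 4) reproduces the tensor asserted above.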
def test_transformer_parametric_reward(self):
    """
    Reward at each step is a linear function of the states and actions in a
    context window around the step. However, we can only observe the
    aggregated reward at the last step.
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 10000
    d_model = 64
    nhead = 8
    num_encoder_layers = 1
    dim_feedforward = 64
    last_layer_activation = "linear"
    max_len = seq_len + 1
    reward_net = SyntheticRewardNet(
        TransformerSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=0.0,
            activation="relu",
            last_layer_activation=last_layer_activation,
            layer_norm_eps=1e-5,
            max_len=max_len,
        )
    )
    optimizer = Optimizer__Union(Adam=classes["Adam"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(
            trainer.loss_type,
            str(reward_net),
        )
    )
    weight, data = create_sequence_data(
        state_dim, action_dim, seq_len, batch_size, num_batches
    )

    threshold = 0.25
    avg_eval_loss = train_and_eval(trainer, data)
    assert (
        avg_eval_loss < threshold
    ), "loss = {:.4f} larger than threshold {}".format(avg_eval_loss, threshold)
def test_ngram_conv_net_parametric_reward(self):
    """
    Reward at each step is a linear function of the states and actions in a
    context window around the step. However, we can only observe the
    aggregated reward at the last step.
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 5000
    sizes = [128, 64]
    activations = ["relu", "relu"]
    last_layer_activation = "linear"
    conv_net_params = rlp.ConvNetParameters(
        conv_dims=[128],
        conv_height_kernels=[1],
        pool_types=["max"],
        pool_kernel_sizes=[1],
    )
    reward_net = SyntheticRewardNet(
        NGramConvolutionalNetwork(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=sizes,
            activations=activations,
            last_layer_activation=last_layer_activation,
            context_size=3,
            conv_net_params=conv_net_params,
        )
    )
    optimizer = Optimizer__Union(Adam=classes["Adam"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(
            trainer.loss_type,
            str(reward_net),
        )
    )
    weight, data = create_sequence_data(
        state_dim, action_dim, seq_len, batch_size, num_batches
    )

    threshold = 0.2
    avg_eval_loss = train_and_eval(trainer, data)
    assert avg_eval_loss < threshold, "loss = {} larger than threshold {}".format(
        avg_eval_loss, threshold
    )
def _test_linear_reward_parametric_reward(
    self, ground_truth_reward_from_multiple_steps=False
):
    """
    Reward at each step is a linear function of the present state and action.
    However, we can only observe the aggregated reward at the last step.

    This model will fail to learn when the ground-truth reward is a function
    of multiple steps' states and actions.
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 5000
    sizes = [256, 128]
    activations = ["relu", "relu"]
    last_layer_activation = "linear"
    reward_net = SyntheticRewardNet(
        SingleStepSyntheticRewardNet(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=sizes,
            activations=activations,
            last_layer_activation=last_layer_activation,
        )
    )
    optimizer = Optimizer__Union(Adam=classes["Adam"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(
            trainer.loss_type,
            str(reward_net),
        )
    )
    if ground_truth_reward_from_multiple_steps:
        weight, data = create_sequence_data(
            state_dim, action_dim, seq_len, batch_size, num_batches
        )
    else:
        weight, data = create_data(
            state_dim, action_dim, seq_len, batch_size, num_batches
        )
    avg_eval_loss = train_and_eval(trainer, data)
    return avg_eval_loss
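# A minimal sketch of the kind of training data the parametric tests describe (an
# illustrative stand-in under stated assumptions, not the actual create_data /
# create_sequence_data helpers): per-step rewards are linear in that step's
# state-action features under a hidden weight vector, and only their sum over the
# valid steps is observed as the label.
def _make_linear_reward_batch(state_dim, action_dim, seq_len, batch_size):
    weight = torch.randn(state_dim + action_dim)  # hidden ground-truth linear weight
    states = torch.randn(seq_len, batch_size, state_dim)
    actions = torch.randn(seq_len, batch_size, action_dim)
    step_reward = torch.cat((states, actions), dim=2) @ weight  # (seq_len, batch_size)
    valid_step = torch.randint(1, seq_len + 1, (batch_size, 1))
    positions = torch.arange(seq_len).repeat(batch_size, 1)
    mask = (positions >= seq_len - valid_step).float()  # only the last valid_step steps count
    aggregated_reward = (step_reward.t() * mask).sum(dim=1, keepdim=True)  # (batch_size, 1)
    return weight, states, actions, valid_step, aggregated_reward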
def build_synthetic_reward_network(
    self,
    state_normalization_data: NormalizationData,
    action_normalization_data: Optional[NormalizationData] = None,
    discrete_action_names: Optional[List[str]] = None,
) -> ModelBase:
    state_dim = get_num_output_features(
        state_normalization_data.dense_normalization_parameters
    )
    if not discrete_action_names:
        assert action_normalization_data is not None
        action_dim = get_num_output_features(
            action_normalization_data.dense_normalization_parameters
        )
    else:
        action_dim = len(discrete_action_names)
    net = SequenceSyntheticRewardNet(
        state_dim=state_dim,
        action_dim=action_dim,
        lstm_hidden_size=self.lstm_hidden_size,
        lstm_num_layers=self.lstm_num_layers,
        lstm_bidirectional=self.lstm_bidirectional,
        last_layer_activation=self.last_layer_activation,
    )
    return SyntheticRewardNet(net=net)
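# Hypothetical usage sketch for the builder methods above (placeholder names:
# `builder` is a configured builder instance and `state_normalization_data` is a
# NormalizationData describing the dense state features). When discrete_action_names
# is given, no action normalization data is needed and action_dim is simply the number
# of action names.
reward_net = builder.build_synthetic_reward_network(
    state_normalization_data,
    discrete_action_names=["action_A", "action_B"],
)
# reward_net wraps the underlying network; reward_net.export_mlp() exposes it,
# as exercised by the structural tests above.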
def test_ngram_fc_parametric_reward(self):
    """
    Reward at each step is a linear function of the states and actions in a
    context window around the step. However, we can only observe the
    aggregated reward at the last step.
    """
    state_dim = 10
    action_dim = 2
    seq_len = 5
    batch_size = 512
    num_batches = 10000
    sizes = [256, 128]
    activations = ["relu", "relu"]
    last_layer_activation = "linear"
    reward_net = SyntheticRewardNet(
        NGramFullyConnectedNetwork(
            state_dim=state_dim,
            action_dim=action_dim,
            sizes=sizes,
            activations=activations,
            last_layer_activation=last_layer_activation,
            context_size=3,
        )
    )
    optimizer = Optimizer__Union(Adam=classes["Adam"]())
    trainer = RewardNetTrainer(reward_net, optimizer)
    trainer.set_reporter(
        RewardNetworkReporter(
            trainer.loss_type,
            str(reward_net),
        )
    )
    weight, data = create_sequence_data(
        state_dim, action_dim, seq_len, batch_size, num_batches
    )

    threshold = 0.2
    avg_eval_loss = train_and_eval(trainer, data)
    assert avg_eval_loss < threshold