def test_save_load_batch_norm(self):
    state_dim = 8
    action_dim = 4
    model = FullyConnectedParametricDQN(
        state_dim,
        action_dim,
        sizes=[8, 4],
        activations=["relu", "relu"],
        use_batch_norm=True,
    )
    # Freezing batch_norm
    model.eval()
    expected_num_params, expected_num_inputs, expected_num_outputs = 21, 2, 1
    check_save_load(
        self, model, expected_num_params, expected_num_inputs, expected_num_outputs
    )

def test_basic(self):
    state_dim = 8
    action_dim = 4
    model = FullyConnectedParametricDQN(
        state_dim,
        action_dim,
        sizes=[8, 4],
        activations=["relu", "relu"],
        use_batch_norm=True,
    )
    input = model.input_prototype()
    self.assertEqual((1, state_dim), input.state.float_features.shape)
    self.assertEqual((1, action_dim), input.action.float_features.shape)
    # Batch norm requires more than one example in training; switch to eval
    # mode to avoid that
    model.eval()
    single_q_value = model(input)
    self.assertEqual((1, 1), single_q_value.q_value.shape)

def test_slate_q_trainer(self):
    recsim = RecSim(num_users=10)

    # Build memory pool with random policy
    memory_pool = OpenAIGymMemoryPool(10000000)
    random_reward = recsim.rollout_policy(random_policy, memory_pool)

    # Build a Q-network and measure the reward of its untrained top-k policy
    q_network = FullyConnectedParametricDQN(
        state_dim=memory_pool.state_dim,
        action_dim=memory_pool.action_dim,
        sizes=[64, 32],
        activations=["relu", "relu"],
    )
    q_network = q_network.eval()
    recsim.reset()
    untrained_policy_reward = recsim.rollout_policy(partial(top_k_policy, q_network))
    q_network = q_network.train()

    # Train the model with SlateQ
    q_network_target = q_network.get_target_network()
    parameters = SlateQTrainerParameters()
    trainer = SlateQTrainer(q_network, q_network_target, parameters)
    for _i in range(1000):
        tdp = memory_pool.sample_memories(
            128, model_type=ModelType.PYTORCH_PARAMETRIC_DQN.value
        )
        training_batch = tdp.as_slate_q_training_batch()
        trainer.train(training_batch)

    # Evaluate the trained top-k policy
    q_network = q_network.eval()
    recsim.reset()
    trained_policy_reward = recsim.rollout_policy(partial(top_k_policy, q_network))

    print(
        f"Reward; random: {random_reward}; untrained: {untrained_policy_reward}; "
        f"trained: {trained_policy_reward}"
    )

    self.assertGreater(trained_policy_reward, untrained_policy_reward)
    self.assertGreater(trained_policy_reward, random_reward)
    self.assertEqual(random_reward, 1384.0)
    self.assertEqual(untrained_policy_reward, 1200.0)
    self.assertEqual(trained_policy_reward, 1432.0)