def setUp(self):
    """Create the interest-exploration environment shared by the tests.

    Stores the topic count in ``self._num_topics`` and an environment built
    with seed 1 and document resampling disabled in ``self._env``.
    """
    super(InterestExplorationTest, self).setUp()
    self._num_topics = 2
    self._env = interest_exploration.create_environment({
        'num_candidates': 20,
        'slate_size': 2,
        'resample_documents': False,
        'seed': 1,
    })
def test_interest_exploration_can_run_with_resampling(self):
    """Run the shared test simulation on a wrapped env that resamples docs."""
    recsim_env = interest_exploration.create_environment({
        'num_candidates': 5,
        'slate_size': 2,
        'resample_documents': True,
        'seed': 100,
    })
    wrapper_params = recsim_wrapper.Params(recsim_env=recsim_env)
    wrapped_env = recsim_wrapper.RecsimWrapper(wrapper_params)
    test_util.run_test_simulation(env=wrapped_env, stackelberg=True)
def make(self) -> gym.Env:
    """Build the configured RecSim environment wrapped in a ``ValueWrapper``.

    The environment factory and the value function are chosen together:

    * ``is_interest_exploration`` -> interest-exploration env with a value
      function that always returns ``0.0``;
    * otherwise ``single_selection`` -> interest-evolution env with
      ``dot_value_fn``;
    * otherwise -> multiclick env with ``multi_selection_value_fn``.

    Returns:
        The created environment wrapped in ``ValueWrapper``.
    """
    env_config = {
        "slate_size": self.slate_size,
        "seed": self.initial_seed,
        "num_candidates": self.num_candidates,
        "resample_documents": self.resample_documents,
    }

    if self.is_interest_exploration:
        create_fn = interest_exploration.create_environment
        value_fn = lambda user, doc: 0.0  # noqa: E731
    elif self.single_selection:
        create_fn = interest_evolution.create_environment
        value_fn = dot_value_fn
    else:
        create_fn = create_multiclick_environment
        value_fn = multi_selection_value_fn

    return ValueWrapper(create_fn(env_config), value_fn)
def test_recsim_intereset_exploration(self):
    """Check that a sampled replay-buffer batch matches the inserted data.

    Samples the transition at index 0 and compares user state, document
    quality, actions and responses (current and next) against the first two
    inserted entries.
    """
    num_candidate = 10
    env_config = {
        "num_candidates": num_candidate,
        "slate_size": 3,
        "resample_documents": False,
        "seed": 1,
    }
    env = interest_exploration.create_environment(env_config)
    replay_buffer, inserted = _create_replay_buffer_and_insert(env)
    batch = replay_buffer.sample_transition_batch_tensor(indices=np.array([0]))

    current, following = inserted[0], inserted[1]

    # User state for the current and the next step.
    npt.assert_array_almost_equal(
        current["observation"]["user"].astype(np.float32),
        batch.state.squeeze(0),
    )
    npt.assert_array_almost_equal(
        following["observation"]["user"], batch.next_state.squeeze(0)
    )

    # Per-candidate document quality, current and next interleaved.
    docs = list(current["observation"]["doc"].values())
    next_docs = list(following["observation"]["doc"].values())
    for idx in range(num_candidate):
        npt.assert_array_almost_equal(
            docs[idx]["quality"], batch.doc_quality.squeeze(0)[idx]
        )
        npt.assert_array_almost_equal(
            next_docs[idx]["quality"], batch.next_doc_quality.squeeze(0)[idx]
        )

    # Actions.
    npt.assert_array_equal(current["action"], batch.action.squeeze(0))
    npt.assert_array_equal(following["action"], batch.next_action.squeeze(0))

    # The response stored with the first transition is expected to be zeros.
    npt.assert_array_equal([0, 0, 0], batch.response_click.squeeze(0))
    npt.assert_array_equal([0, 0, 0], batch.response_cluster_id.squeeze(0))
    npt.assert_array_equal([0.0, 0.0, 0.0], batch.response_quality.squeeze(0))

    # The next response must match what the environment returned.
    next_response = following["observation"]["response"]
    for slot in range(env_config["slate_size"]):
        slot_response = next_response[slot]
        npt.assert_array_equal(
            slot_response["click"], batch.next_response_click.squeeze(0)[slot]
        )
        npt.assert_array_equal(
            slot_response["cluster_id"],
            batch.next_response_cluster_id.squeeze(0)[slot],
        )
        npt.assert_array_almost_equal(
            slot_response["quality"].astype(np.float32),
            batch.next_response_quality.squeeze(0)[slot],
        )
def test_recsim_interest_exploration(self):
    """Check the default observation preprocessor on interest exploration.

    Verifies shape, dtype and device of the user features and of the
    candidate-document features, and that each document row equals the
    one-hot cluster id concatenated with the quality value.
    """
    num_candidate = 10
    raw_env = interest_exploration.create_environment(
        {
            "num_candidates": num_candidate,
            "slate_size": 3,
            "resample_documents": False,
            "seed": 1,
        }
    )
    env = ValueWrapper(raw_env, ValueMode.CONST)
    obs_preprocessor = make_default_obs_preprocessor(env)
    obs = env.reset()
    state = obs_preprocessor(obs)
    cpu = torch.device("cpu")

    # User features.
    self.assertFalse(state.has_float_features_only)
    user_features = state.float_features
    self.assertEqual(user_features.shape, (1, obs["user"].shape[0]))
    self.assertEqual(user_features.dtype, torch.float32)
    self.assertEqual(user_features.device, cpu)
    npt.assert_array_almost_equal(obs["user"], user_features.squeeze(0))

    # Candidate document features.
    doc_float_features = state.candidate_docs.float_features
    self.assertIsNotNone(doc_float_features)
    quality_len = 1
    num_cluster_ids = env.observation_space["doc"]["0"]["cluster_id"].n
    expected_doc_feature_length = num_cluster_ids + quality_len
    self.assertEqual(
        doc_float_features.shape, (1, num_candidate, expected_doc_feature_length)
    )
    self.assertEqual(doc_float_features.dtype, torch.float32)
    self.assertEqual(doc_float_features.device, cpu)

    for idx, doc in enumerate(obs["doc"].values()):
        cluster_one_hot = F.one_hot(torch.tensor(doc["cluster_id"]), 2).float()
        # This needs unsqueeze because it's a scalar
        quality = torch.tensor(doc["quality"]).unsqueeze(0).float()
        expected_doc_feature = torch.cat([cluster_one_hot, quality], dim=0)
        npt.assert_array_almost_equal(
            expected_doc_feature, doc_float_features[0, idx]
        )
def test_create_from_recsim_interest_exploration(self):
    """Create a replay buffer from the env and add one real transition.

    Passes if ``ReplayBuffer.create_from_env`` and ``add`` accept the
    document and response fields produced by a single environment step.
    """
    env = interest_exploration.create_environment(
        {
            "num_candidates": 20,
            "slate_size": 3,
            "resample_documents": False,
            "seed": 1,
        }
    )
    replay_buffer = ReplayBuffer.create_from_env(
        env, replay_memory_size=100, batch_size=10, store_log_prob=True
    )

    obs = env.reset()
    user_observation = obs["user"]
    action = env.action_space.sample()
    log_prob = -1.0

    # Per-candidate document features taken from the initial observation.
    doc_quality = np.stack([doc["quality"] for doc in obs["doc"].values()], axis=0)
    doc_cluster_id = np.array([doc["cluster_id"] for doc in obs["doc"].values()])

    next_obs, reward, terminal, _unused = env.step(action)

    # Per-slate-slot response features taken from the next observation.
    responses = next_obs["response"]
    response_click = np.array([resp["click"] for resp in responses])
    response_quality = np.stack([resp["quality"] for resp in responses], axis=0)
    response_cluster_id = np.array([resp["cluster_id"] for resp in responses])

    replay_buffer.add(
        user_observation,
        action,
        reward,
        terminal,
        mdp_id=0,
        sequence_number=0,
        doc_quality=doc_quality,
        doc_cluster_id=doc_cluster_id,
        response_click=response_click,
        response_cluster_id=response_cluster_id,
        response_quality=response_quality,
        log_prob=log_prob,
    )
def test_step(self):
    """Step a ClusterBanditAgent three times, end the episode, step again.

    Checks, in order:
      1. the first slate is the best-quality document of one of the two
         clusters;
      2. the second slate is the best-quality document of the cluster that
         was not picked first;
      3. after a synthetic click on the second cluster, the third slate is
         the best-quality document of the cluster with the highest UCB index;
      4. after ``end_episode``, replaying the first observation yields the
         first slate again.
    """
    # Initialize agent.
    env_config = {
        'slate_size': 1,
        'num_candidates': 5,
        'resample_documents': True,
        'seed': 1,
    }
    env = ie.create_environment(env_config)
    kwargs = {
        'observation_space': env.observation_space,
        'action_space': env.action_space,
    }
    agent = cluster_click_statistics.ClusterClickStatsLayer(
        cluster_bandit_agent.ClusterBanditAgent, **kwargs)

    observation1, documents1 = env.environment.reset()
    slate1 = agent.step(
        0, dict(user=observation1, doc=documents1, response=None))
    # The first pick must be the best-quality document of either cluster.
    scores_c0 = [(features['quality'] if features['cluster_id'] == 0 else 0)
                 for features in documents1.values()]
    scores_c1 = [(features['quality'] if features['cluster_id'] == 1 else 0)
                 for features in documents1.values()]
    self.assertIn(slate1[0], [np.argmax(scores_c0), np.argmax(scores_c1)])
    picked_cluster = list(documents1.values())[slate1[0]]['cluster_id']

    observation2, documents, response1, _ = env.environment.step(slate1)
    response1_obs = [response.create_observation() for response in response1]
    response1_obs[0]['cluster_id'] = picked_cluster
    slate2 = agent.step(
        ie.total_clicks_reward(response1),
        dict(user=observation2, doc=documents, response=response1_obs))
    # Pick the other cluster because we have no observation about it.
    # Pick the document with the best quality there.
    doc_qualities = [
        (features['quality'] if features['cluster_id'] != picked_cluster else 0)
        for features in documents.values()
    ]
    self.assertAllEqual(slate2, [
        np.argmax(doc_qualities),
    ])
    self.assertNotEqual(
        list(documents.values())[slate2[0]]['cluster_id'], picked_cluster)

    observation3, documents, response2, _ = env.environment.step(slate2)
    response2_obs = [response.create_observation() for response in response2]
    # Make a clicked response.
    response2_obs[0]['click'] = 1
    response2_obs[0]['cluster_id'] = 1 - picked_cluster
    slate3 = agent.step(
        ie.total_clicks_reward(response2),
        dict(user=observation3, doc=documents, response=response2_obs))
    # Pick the first topic which has the best UCB and then pick the document
    # with the best quality in it.
    # Builtin ``float`` replaces the ``np.float`` alias, which was removed in
    # NumPy 1.24; both denote float64.
    pulls = np.array([1, 1], dtype=float)
    rewards = np.array([0, 0], dtype=float)
    rewards[1 - picked_cluster] = 1
    ct = np.sqrt(2.0 * np.log(2.0))
    topic_index = rewards / pulls + ct * np.sqrt(1.0 / pulls)
    best_topic = np.argmax(topic_index)
    doc_qualities = [
        (features['quality'] if features['cluster_id'] == best_topic else 0)
        for features in documents.values()
    ]
    self.assertAllEqual(slate3, [np.argmax(doc_qualities)])

    agent.end_episode(
        ie.total_clicks_reward(response2),
        dict(user=observation3, doc=documents, response=response2_obs))
    # After the episode ends, the first observation must give the first slate.
    slate4 = agent.step(
        0, dict(user=observation1, doc=documents1, response=None))
    self.assertAllEqual(slate4, slate1)