def policy_generation(bandit, actions):
    """Create the bandit policy selected by name.

    Parameters
    ----------
    bandit : str
        Name of the algorithm: 'Exp4P', 'LinUCB', 'LinThompSamp', 'UCB1',
        'Exp3' or 'random'.
    actions
        Actions handed unchanged to the policy constructor.

    Returns
    -------
    policy
        The constructed policy object, or the integer ``0`` when ``bandit``
        is 'random' (no policy object is built for that case).

    Raises
    ------
    ValueError
        If ``bandit`` is not one of the supported names.
    """
    supported = ('Exp4P', 'LinUCB', 'LinThompSamp', 'UCB1', 'Exp3', 'random')
    # Reject unknown names up front: otherwise no branch binds `policy` and
    # the final return fails with an opaque UnboundLocalError.
    if bandit not in supported:
        raise ValueError(
            "Unsupported bandit algorithm {!r}; expected one of {}".format(
                bandit, ', '.join(supported)))

    # Each call gets its own in-memory storages, so policies built by
    # separate calls do not share history or model state.
    historystorage = history.MemoryHistoryStorage()
    modelstorage = model.MemoryModelStorage()

    if bandit == 'Exp4P':
        policy = exp4p.Exp4P(actions, historystorage, modelstorage,
                             delta=0.5, pmin=None)
    elif bandit == 'LinUCB':
        # Positional arguments 0.3 and 20 — presumably alpha and the context
        # dimension (LinThompSamp below uses d=20); confirm against LinUCB.
        policy = linucb.LinUCB(actions, historystorage, modelstorage, 0.3, 20)
    elif bandit == 'LinThompSamp':
        policy = linthompsamp.LinThompSamp(actions, historystorage,
                                           modelstorage, d=20, delta=0.61,
                                           r=0.01, epsilon=0.71)
    elif bandit == 'UCB1':
        policy = ucb1.UCB1(actions, historystorage, modelstorage)
    elif bandit == 'Exp3':
        policy = exp3.Exp3(actions, historystorage, modelstorage, gamma=0.2)
    else:
        # 'random': a plain 0 is returned instead of a policy object.
        policy = 0
    return policy
def test_update_reward(self):
    # A reward given for a history id must be readable back from the
    # policy's history storage under that same id.
    bandit = linucb.LinUCB(self.actions, self.historystorage,
                           self.modelstorage, 1.00, 2)
    hist_id, _ = bandit.get_action([[1, 1], [2, 2], [3, 3]])
    bandit.reward(hist_id, 1)
    stored = bandit._historystorage.get_history(hist_id)
    self.assertEqual(stored.reward, 1)
def test_get_first_action(self):
    # The first request must receive history id 0, choose one of the known
    # actions, and have its context recorded in the history storage.
    bandit = linucb.LinUCB(self.actions, self.historystorage,
                           self.modelstorage, 1.00, 2)
    first_id, chosen = bandit.get_action([[1, 1], [2, 2], [3, 3]])
    self.assertEqual(first_id, 0)
    self.assertIn(chosen, self.actions)
    recorded = bandit._historystorage.get_history(first_id)
    # Compare against a fresh literal, not the list object that was passed
    # in, so an in-place modification by the policy would still be caught.
    self.assertEqual(recorded.context, [[1, 1], [2, 2], [3, 3]])
def test_model_storage(self):
    # After one rewarded round the stored model must hold three 'b' entries
    # of length 2 and three 'matrix_a' entries of shape (2, 2).
    bandit = linucb.LinUCB(self.actions, self.historystorage,
                           self.modelstorage, 1.00, 2)
    hist_id, _ = bandit.get_action([[1, 1], [2, 2], [3, 3]])
    bandit.reward(hist_id, 1)

    storage = bandit._modelstorage
    # The model is re-fetched for every assertion, one call per check.
    b_all = storage.get_model()['b']
    self.assertEqual(len(b_all), 3)
    b_of_action_1 = storage.get_model()['b'][1]
    self.assertEqual(len(b_of_action_1), 2)
    a_all = storage.get_model()['matrix_a']
    self.assertEqual(len(a_all), 3)
    a_of_action_1 = storage.get_model()['matrix_a'][1]
    self.assertEqual(a_of_action_1.shape, (2, 2))
def policy_generation(bandit, actions):
    """Create the bandit policy selected by name.

    Parameters
    ----------
    bandit : str
        The bandit algorithm: 'Exp4P', 'LinUCB', 'LinThompSamp', 'UCB1',
        'Exp3' or 'random'.
    actions
        The actions, i.e. the movies that can be recommended; handed
        unchanged to the policy constructor.

    Returns
    -------
    policy
        The generated policy, or the integer ``0`` when ``bandit`` is
        'random' (no policy object is built for that case).

    Raises
    ------
    ValueError
        If ``bandit`` is not one of the supported names.
    """
    supported = ('Exp4P', 'LinUCB', 'LinThompSamp', 'UCB1', 'Exp3', 'random')
    # Reject unknown names up front: otherwise no branch binds `policy` and
    # the final return fails with an opaque UnboundLocalError.
    if bandit not in supported:
        raise ValueError(
            "Unsupported bandit algorithm {!r}; expected one of {}".format(
                bandit, ', '.join(supported)))

    historystorage = history.MemoryHistoryStorage()  # in-memory history records
    modelstorage = model.MemoryModelStorage()  # in-memory model storage, kept for a uniform interface

    if bandit == 'Exp4P':
        policy = exp4p.Exp4P(historystorage, modelstorage, actions,
                             delta=0.5, p_min=None)
    elif bandit == 'LinUCB':
        policy = linucb.LinUCB(history_storage=historystorage,
                               model_storage=modelstorage,
                               action_storage=actions,
                               alpha=0.3,
                               context_dimension=18)
    elif bandit == 'LinThompSamp':
        # context_dimension takes the place of the former `d` keyword
        # (which was 20); it matches the dimension used for LinUCB above.
        policy = linthompsamp.LinThompSamp(historystorage, modelstorage,
                                           actions,
                                           context_dimension=18,
                                           delta=0.61,
                                           R=0.01,
                                           epsilon=0.71)
    elif bandit == 'UCB1':
        policy = ucb1.UCB1(historystorage, modelstorage, actions)
    elif bandit == 'Exp3':
        policy = exp3.Exp3(historystorage, modelstorage, actions, gamma=0.2)
    else:
        # 'random': a plain 0 is returned instead of a policy object.
        policy = 0
    return policy
def test_add_action(self):
    # Actions added after a round has started must appear in the action
    # list with an identity 'matrix_a', and a later rewarded round must
    # change the matrix of whichever action was chosen.
    bandit = linucb.LinUCB(self.actions, self.historystorage,
                           self.modelstorage, 1.00, 2)
    first_id, _ = bandit.get_action([[1, 1], [2, 2], [3, 3]])
    bandit.add_action([4, 5])
    bandit.reward(first_id, 1)
    # Presumably self.actions is [1, 2, 3] in the fixture — confirm in setUp.
    self.assertEqual(bandit._actions, [1, 2, 3, 4, 5])

    fresh_matrix = bandit._modelstorage.get_model()['matrix_a'][4]
    self.assertTrue((fresh_matrix == np.identity(2)).all())

    second_id, second_choice = bandit.get_action(
        [[1, 1], [2, 2], [3, 3], [4, 4], [5, 5]])
    bandit.reward(second_id, 1)
    updated_matrix = bandit._modelstorage.get_model()['matrix_a'][second_choice]
    self.assertFalse((updated_matrix == np.identity(2)).all())
def test_reward_order_descending(self):
    # Rewarding the second request before the first must leave the first
    # entry's reward unset while both contexts stay intact.
    bandit = linucb.LinUCB(self.actions, self.historystorage,
                           self.modelstorage, 1.00, 2)
    older_id, _ = bandit.get_action([[1, 1], [2, 2], [3, 3]])
    newer_id, _ = bandit.get_action([[0, 0], [3, 3], [6, 6]])
    bandit.reward(newer_id, 1)

    storage = bandit._historystorage
    # Each check re-reads the history entry, in the same order as before:
    # both contexts first, then both rewards.
    older_context = storage.get_history(older_id).context
    self.assertEqual(older_context, [[1, 1], [2, 2], [3, 3]])
    newer_context = storage.get_history(newer_id).context
    self.assertEqual(newer_context, [[0, 0], [3, 3], [6, 6]])
    older_reward = storage.get_history(older_id).reward
    self.assertEqual(older_reward, None)
    newer_reward = storage.get_history(newer_id).reward
    self.assertEqual(newer_reward, 1)
def policy_generation(bandit, actions):
    """Create the bandit policy selected by name.

    Parameters
    ----------
    bandit : str
        Name of the algorithm: 'Exp4P', 'LinUCB', 'LinThompSamp', 'UCB1',
        'Exp3' or 'random'.
    actions
        Actions handed unchanged to the policy constructor.

    Returns
    -------
    policy
        The constructed policy object, or the integer ``0`` when ``bandit``
        is 'random' (no policy object is built for that case).

    Raises
    ------
    ValueError
        If ``bandit`` is not one of the supported names.
    """
    supported = ('Exp4P', 'LinUCB', 'LinThompSamp', 'UCB1', 'Exp3', 'random')
    # Reject unknown names up front: otherwise no branch binds `policy` and
    # the final return fails with an opaque UnboundLocalError.
    if bandit not in supported:
        raise ValueError(
            "Unsupported bandit algorithm {!r}; expected one of {}".format(
                bandit, ', '.join(supported)))

    # Each call gets its own in-memory storages, so policies built by
    # separate calls do not share history or model state.
    historystorage = history.MemoryHistoryStorage()
    modelstorage = model.MemoryModelStorage()

    if bandit == 'Exp4P':
        policy = exp4p.Exp4P(historystorage, modelstorage, actions,
                             delta=0.5, p_min=None)
    elif bandit == 'LinUCB':
        policy = linucb.LinUCB(history_storage=historystorage,
                               model_storage=modelstorage,
                               action_storage=actions,
                               alpha=0.3,
                               context_dimension=18)
    elif bandit == 'LinThompSamp':
        # context_dimension takes the place of the former `d` keyword
        # (which was 20); it matches the dimension used for LinUCB above.
        policy = linthompsamp.LinThompSamp(historystorage, modelstorage,
                                           actions,
                                           context_dimension=18,
                                           delta=0.61,
                                           R=0.01,
                                           epsilon=0.71)
    elif bandit == 'UCB1':
        policy = ucb1.UCB1(historystorage, modelstorage, actions)
    elif bandit == 'Exp3':
        policy = exp3.Exp3(historystorage, modelstorage, actions, gamma=0.2)
    else:
        # 'random': a plain 0 is returned instead of a policy object.
        policy = 0
    return policy
def policy_evaluation(self, policy, context, desired_action, alpha):
    """Run LinUCB over the stored rounds and return its cumulative error.

    Parameters
    ----------
    policy : str
        Name of the algorithm to evaluate; only 'LinUCB' is handled.
    context
        Indexable by round; ``context[t]`` is passed to ``get_action``.
    desired_action
        Indexable by round; ``desired_action[t][0]`` is compared with the
        action the policy chose in round ``t``.
    alpha
        Forwarded to the ``LinUCB`` constructor.

    Returns
    -------
    numpy.ndarray or None
        Array of shape ``(self.t, 1)`` whose entry ``t`` is the number of
        rounds up to and including ``t`` in which the chosen action differed
        from the desired one. ``None`` (after printing a notice) when
        ``policy`` is not 'LinUCB'.
    """
    if policy != 'LinUCB':
        print("We don't support other bandit algorithms now!")
        return None

    historystorage = history.MemoryHistoryStorage()
    modelstorage = model.MemoryModelStorage()
    learner = linucb.LinUCB(self.actions, historystorage, modelstorage,
                            alpha, self.d)
    seq_error = np.zeros(shape=(self.t, 1))

    mistakes = 0.0  # running count of wrong choices so far
    for step in range(self.t):
        history_id, chosen = learner.get_action(context[step])
        if desired_action[step][0] != chosen:
            # Wrong choice: reward 0 and bump the running error count.
            learner.reward(history_id, 0)
            mistakes += 1.0
        else:
            learner.reward(history_id, 1)
        seq_error[step] = mistakes
    return seq_error
def test_initialization(self):
    # The constructor must expose the actions, alpha and dimension it was
    # given through `_actions`, `alpha` and `d`.
    alpha, dimension = 1.00, 2
    bandit = linucb.LinUCB(self.actions, self.historystorage,
                           self.modelstorage, alpha, dimension)
    self.assertEqual(self.actions, bandit._actions)
    self.assertEqual(alpha, bandit.alpha)
    self.assertEqual(dimension, bandit.d)