import pickle

import numpy as np
from tensorflow import gfile  # TF1-style file API used for checkpoint I/O

# Assumed package path; adjust to wherever ContextualBandit lives in this repo.
from bandits.core import contextual_bandit


def run_contextual_bandit(context_dim, num_actions, dataset, algos,
                          save_once=False, pkl_file=None):
  """Run a contextual bandit problem on a set of algorithms.

  Args:
    context_dim: Dimension of the context.
    num_actions: Number of available actions.
    dataset: Matrix where every row is a context + num_actions rewards.
    algos: List of algorithms to use in the contextual bandit instance.
    save_once: Whether state has been saved to pkl_file before; if True,
      resume from that checkpoint instead of starting from scratch.
    pkl_file: Pickle file for saving state.

  Returns:
    h_actions: Matrix with actions: size (num_contexts, num_algorithms).
    h_rewards: Matrix with rewards: size (num_contexts, num_algorithms).
  """

  num_contexts = dataset.shape[0]

  # Create contextual bandit
  cmab = contextual_bandit.ContextualBandit(context_dim, num_actions)
  cmab.feed_data(dataset)

  if not save_once or pkl_file is None:
    # Fresh run: start at the first context with empty action/reward history.
    h_actions = np.empty((0, len(algos)), float)
    h_rewards = np.empty((0, len(algos)), float)
    start_context = 0
  else:
    # Resume from the checkpoint. Note that only the first algorithm's data
    # buffer is restored.
    with gfile.Open(pkl_file, 'rb') as infile:
      saved_state = pickle.load(infile)
    start_context = saved_state['start_context']
    algos[0].data_h.replace_data(saved_state['contexts'],
                                 saved_state['actions'],
                                 saved_state['rewards'])
    h_actions = saved_state['h_actions']
    h_rewards = saved_state['h_rewards']

  # Run the contextual bandit process
  for i in range(start_context, num_contexts):
    context = cmab.context(i)
    actions = [a.action(context) for a in algos]
    rewards = [cmab.reward(i, action) for action in actions]

    for j, a in enumerate(algos):
      a.update(context, actions[j], rewards[j])

    h_actions = np.vstack((h_actions, np.array(actions)))
    h_rewards = np.vstack((h_rewards, np.array(rewards)))

    # Checkpoint every 500 contexts so the run can be resumed after a crash.
    if (i + 1) % 500 == 0 and pkl_file is not None:
      savedict = {
          'h_rewards': h_rewards,
          'h_actions': h_actions,
          'contexts': algos[0].data_h.contexts,
          'actions': algos[0].data_h.actions,
          'rewards': algos[0].data_h.rewards,
          'start_context': i + 1
      }
      with gfile.Open(pkl_file, 'wb') as outfile:
        pickle.dump(savedict, outfile)

  return h_actions, h_rewards
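# Usage sketch for the checkpointing variant above (illustrative only; the
# file path and the `algos` list are hypothetical). A first run writes its
# state to pkl_file every 500 contexts; after an interruption, passing
# save_once=True reloads that state and continues from the saved context.
#
#   h_actions, h_rewards = run_contextual_bandit(
#       context_dim, num_actions, dataset, algos,
#       save_once=False, pkl_file='/tmp/bandit_state.pkl')
#
#   # After a crash or restart:
#   h_actions, h_rewards = run_contextual_bandit(
#       context_dim, num_actions, dataset, algos,
#       save_once=True, pkl_file='/tmp/bandit_state.pkl')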
def run_contextual_bandit(context_dim, num_actions, dataset, algos,
                          num_contexts=None):
  """Run a contextual bandit problem on a set of algorithms.

  Args:
    context_dim: Dimension of the context.
    num_actions: Number of available actions.
    dataset: Matrix where every row is a context + num_actions rewards.
    algos: List of algorithms to use in the contextual bandit instance.
    num_contexts: Number of contexts to run; defaults to all rows of dataset.

  Returns:
    h_actions: Matrix with actions: size (num_contexts, num_algorithms).
    h_rewards: Matrix with rewards: size (num_contexts, num_algorithms).
  """

  if num_contexts is None:
    num_contexts = dataset.shape[0]

  # Create contextual bandit
  cmab = contextual_bandit.ContextualBandit(context_dim, num_actions)
  cmab.feed_data(dataset)

  h_actions = np.empty((0, len(algos)), float)
  h_rewards = np.empty((0, len(algos)), float)

  # Run the contextual bandit process
  for i in range(num_contexts):
    context = cmab.context(i)
    actions = [a.action(context) for a in algos]
    rewards = [cmab.reward(i, action) for action in actions]

    for j, a in enumerate(algos):
      a.update(context, actions[j], rewards[j])

    h_actions = np.vstack((h_actions, np.array(actions)))
    h_rewards = np.vstack((h_rewards, np.array(rewards)))

  return h_actions, h_rewards
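if __name__ == '__main__':
  # Minimal smoke test on synthetic data (a sketch, not part of the library).
  # UniformPolicy is a hypothetical stand-in that satisfies the interface the
  # runner expects: action(context) -> int and update(context, action, reward).

  class UniformPolicy(object):
    """Picks an action uniformly at random; ignores contexts and rewards."""

    def __init__(self, num_actions):
      self.num_actions = num_actions

    def action(self, context):
      return np.random.randint(self.num_actions)

    def update(self, context, action, reward):
      pass  # a learning algorithm would refit its model here

  context_dim, num_actions, n = 4, 3, 100
  # Each row: a context vector followed by one reward per action.
  sample_dataset = np.random.randn(n, context_dim + num_actions)
  h_a, h_r = run_contextual_bandit(context_dim, num_actions, sample_dataset,
                                   [UniformPolicy(num_actions)])
  print(h_a.shape, h_r.shape)  # -> (100, 1) (100, 1)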