Example #1
0
def run_contextual_bandit(dataset, algos, save_once=False, pkl_file=None,
                          context_dim=None, num_actions=None):
    """Run a contextual bandit problem on a set of algorithms.

    Supports periodic checkpointing to `pkl_file` (every 500 contexts) and
    resuming from a previously saved state.

    Args:
      dataset: Matrix where every row is a context + num_actions rewards.
      algos: List of algorithms to use in the contextual bandit instance.
      save_once: True if state has been saved once before (enables resume).
      pkl_file: Pickle file for saving/restoring state, or None to disable.
      context_dim: Dimension of the context. If None, inferred as
        dataset.shape[1] - num_actions.
      num_actions: Number of available actions. Must be provided.

    Returns:
      h_actions: Matrix with actions: size (num_context, num_algorithms).
      h_rewards: Matrix with rewards: size (num_context, num_algorithms).

    Raises:
      ValueError: If num_actions is not provided.
    """
    if num_actions is None:
        # Bug fix: `num_actions` and `context_dim` were previously free names
        # that raised NameError unless module-level globals happened to exist.
        raise ValueError('num_actions must be provided.')
    if context_dim is None:
        # Every dataset row is [context | one reward per action].
        context_dim = dataset.shape[1] - num_actions

    num_contexts = dataset.shape[0]

    # Create contextual bandit environment and feed it the data.
    cmab = contextual_bandit.ContextualBandit(context_dim, num_actions)
    cmab.feed_data(dataset)

    if not save_once or pkl_file is None:
        # Fresh run: start from the first context with empty histories.
        h_actions = np.empty((0, len(algos)), float)
        h_rewards = np.empty((0, len(algos)), float)
        start_context = 0
    else:
        # Resume: restore histories and the shared data handler's contents.
        with gfile.Open(pkl_file, 'rb') as infile:
            saved_state = pickle.load(infile)
        start_context = saved_state['start_context']
        # NOTE(review): only algos[0].data_h is restored — presumably all
        # algorithms share one data handler; confirm against callers.
        algos[0].data_h.replace_data(saved_state['contexts'],
                                     saved_state['actions'],
                                     saved_state['rewards'])
        h_actions = saved_state['h_actions']
        h_rewards = saved_state['h_rewards']

    # Run the contextual bandit process.
    for i in range(start_context, num_contexts):
        context = cmab.context(i)
        actions = [a.action(context) for a in algos]
        rewards = [cmab.reward(i, action) for action in actions]

        for j, a in enumerate(algos):
            a.update(context, actions[j], rewards[j])

        h_actions = np.vstack((h_actions, np.array(actions)))
        h_rewards = np.vstack((h_rewards, np.array(rewards)))

        # Checkpoint every 500 contexts so a long run can be resumed.
        if (i + 1) % 500 == 0 and pkl_file is not None:
            savedict = {
                'h_rewards': h_rewards,
                'h_actions': h_actions,
                'contexts': algos[0].data_h.contexts,
                'actions': algos[0].data_h.actions,
                'rewards': algos[0].data_h.rewards,
                'start_context': i + 1
            }
            with gfile.Open(pkl_file, 'wb') as outfile:
                pickle.dump(savedict, outfile)

    return h_actions, h_rewards
Example #2
0
def run_contextual_bandit(context_dim,
                          num_actions,
                          dataset,
                          algos,
                          num_contexts=None):
  """Run a contextual bandit problem on a set of algorithms.

  Args:
    context_dim: Dimension of the context.
    num_actions: Number of available actions.
    dataset: Matrix where every row is a context + num_actions rewards.
    algos: List of algorithms to use in the contextual bandit instance.
    num_contexts: Number of contexts to run; defaults to all rows of dataset.

  Returns:
    h_actions: Matrix with actions: size (num_contexts, num_algorithms).
    h_rewards: Matrix with rewards: size (num_contexts, num_algorithms).
  """
  if num_contexts is None:
    num_contexts = dataset.shape[0]

  # Create the contextual bandit environment and load the data into it.
  cmab = contextual_bandit.ContextualBandit(context_dim, num_actions)
  cmab.feed_data(dataset)

  # Collect per-step rows and assemble once at the end; calling np.vstack
  # inside the loop reallocates the whole history each step (O(n^2)).
  action_rows = []
  reward_rows = []

  # Run the contextual bandit process.
  for i in range(num_contexts):
    context = cmab.context(i)
    actions = [a.action(context) for a in algos]
    rewards = [cmab.reward(i, action) for action in actions]

    for algo, action, reward in zip(algos, actions, rewards):
      algo.update(context, action, reward)

    action_rows.append(actions)
    reward_rows.append(rewards)

  # reshape keeps the (0, num_algorithms) shape when num_contexts == 0,
  # matching the original np.empty seed arrays; dtype float matches the
  # original vstack result.
  h_actions = np.asarray(action_rows, dtype=float).reshape(-1, len(algos))
  h_rewards = np.asarray(reward_rows, dtype=float).reshape(-1, len(algos))
  return h_actions, h_rewards