Example No. 1
class PosteriorBNNSampling(BanditAlgorithm):
  """Posterior Sampling algorithm based on a Bayesian neural network."""

  def __init__(self, name, hparams, bnn_model='RMSProp'):
    """Creates a PosteriorBNNSampling object based on a specific optimizer.

    The algorithm has two basic tools: an Approx BNN and a Contextual Dataset.
    The Bayesian Network keeps the posterior based on the optimizer iterations.

    Args:
      name: Name of the algorithm.
      hparams: Hyper-parameters of the algorithm.
      bnn_model: Type of BNN. By default RMSProp (point estimate).
    """

    self.name = name
    self.hparams = hparams
    self.optimizer_n = hparams.optimizer

    self.training_freq = hparams.training_freq
    self.training_epochs = hparams.training_epochs
    self.t = 0
    self.data_h = ContextualDataset(hparams.context_dim, hparams.num_actions,
                                    hparams.buffer_s)

    # to be extended with more BNNs (BB alpha-div, GPs, SGFS, constSGD...)
    bnn_name = '{}-bnn'.format(name)
    if bnn_model == 'Variational':
      self.bnn = VariationalNeuralBanditModel(hparams, bnn_name)
    elif bnn_model == 'AlphaDiv':
      self.bnn = BBAlphaDivergence(hparams, bnn_name)
    elif bnn_model == 'Variational_BF':
      self.bnn = BfVariationalNeuralBanditModel(hparams, bnn_name)
    elif bnn_model == 'GP':
      self.bnn = MultitaskGP(hparams)
    else:
      self.bnn = NeuralBanditModel(self.optimizer_n, hparams, bnn_name)

  def action(self, context):
    """Selects action for context based on Thompson Sampling using the BNN."""

    if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
      # round robin until each action has been taken "initial_pulls" times
      return self.t % self.hparams.num_actions

    with self.bnn.graph.as_default():
      c = context.reshape((1, self.hparams.context_dim))
      output = self.bnn.sess.run(self.bnn.y_pred, feed_dict={self.bnn.x: c})
      return np.argmax(output)

  def update(self, context, action, reward):
    """Updates data buffer, and re-trains the BNN every training_freq steps."""

    self.t += 1
    self.data_h.add(context, action, reward)

    if self.t % self.training_freq == 0:
      if self.hparams.reset_lr:
        self.bnn.assign_lr()
      self.bnn.train(self.data_h, self.training_epochs)
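All of the example classes on this page share the same two-method contract: action(context) returns an arm index, and update(context, action, reward) logs the outcome and periodically re-trains. As a rough, library-free illustration of that contract and of the round-robin warm-up used above, here is a numpy-only sketch (RandomPolicy is a hypothetical stand-in, not part of the library):

import numpy as np

class RandomPolicy:
    """Hypothetical stand-in with the same interface as the classes above."""

    def __init__(self, num_actions, initial_pulls=2):
        self.num_actions = num_actions
        self.initial_pulls = initial_pulls
        self.t = 0

    def action(self, context):
        # Round robin until each action has been taken "initial_pulls" times.
        if self.t < self.num_actions * self.initial_pulls:
            return self.t % self.num_actions
        return np.random.randint(self.num_actions)

    def update(self, context, action, reward):
        # A real policy would also store (context, action, reward) and
        # re-train every training_freq steps.
        self.t += 1

rng = np.random.RandomState(0)
policy = RandomPolicy(num_actions=3)
for _ in range(12):
    context = rng.randn(5)
    a = policy.action(context)
    r = float(a == 1)  # toy reward: arm 1 is always best
    policy.update(context, a, r)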
class VariationalSampling_v4(BanditAlgorithm):
    """Full Bayesian linear regression on the last layer of a deep neural net."""
    def __init__(self, name, hparams, optimizer='RMS', mode='variational'):

        self.name = name
        self.hparams = hparams
        self.optimizer_n = optimizer

        self.training_freq = hparams.training_freq
        self.training_epochs = hparams.training_epochs
        self.num_actions = hparams.num_actions
        self.t = 0
        self.data_h = ContextualDataset(hparams.context_dim,
                                        hparams.num_actions, hparams.buffer_s)

        self.bnn = Variational_v4(optimizer, hparams, '{}-bnn'.format(name))

    def action(self, context):
        """Samples beta's from posterior, and chooses best action accordingly."""

        # Round robin until each action has been selected "initial_pulls" times
        if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
            return self.t % self.hparams.num_actions

        with self.bnn.graph.as_default():
            self.c = context.reshape((1, self.hparams.context_dim))
            y_pred_mu = self.bnn.sess.run(self.bnn.y_pred_mu,
                                          feed_dict={self.bnn.x: self.c})
            r = y_pred_mu.mean(axis=0)
            return np.argmax(r)

    def update(self, context, action, reward):
        """Updates the posterior using linear bayesian regression formula."""

        self.t += 1

        self.data_h.add(context, action, reward)

        if self.t % self.training_freq == 0:
            self.bnn.train(self.data_h, self.hparams.batch_size,
                           self.hparams.training_epochs, self.t,
                           self.hparams.initial_lr)

    def reward(self, context):
        with self.bnn.graph.as_default():
            self.c = context.reshape((1, self.hparams.context_dim))
            y_pred_mu = self.bnn.sess.run(self.bnn.y_pred_mu,
                                          feed_dict={self.bnn.x: self.c})
            r = y_pred_mu[0]
            return r
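For intuition, VariationalSampling_v4.action averages posterior-predictive draws before taking the argmax. A minimal numpy sketch of that step, where y_pred_mu is a synthetic stand-in for the stacked stochastic forward passes returned by the session:

import numpy as np

rng = np.random.RandomState(0)
num_actions, n_samples = 4, 16

# Stand-in for bnn.sess.run(bnn.y_pred_mu, ...): one row per posterior
# sample of the predicted rewards.
y_pred_mu = rng.normal(loc=[0.1, 0.5, 0.2, 0.0], scale=0.3,
                       size=(n_samples, num_actions))

r = y_pred_mu.mean(axis=0)  # posterior-predictive mean per action
print(np.argmax(r))         # chosen arm (almost surely 1, the highest mean)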
Example No. 4
class NeuralLinearPosteriorSampling(BanditAlgorithm):
  """Full Bayesian linear regression on the last layer of a deep neural net."""

  def __init__(self, name, hparams, optimizer='RMS'):

    self.name = name
    self.hparams = hparams
    self.latent_dim = self.hparams.layer_sizes[-1]

    # Gaussian prior for each beta_i
    self._lambda_prior = self.hparams.lambda_prior

    self.mu = [
        np.zeros(self.latent_dim)
        for _ in range(self.hparams.num_actions)
    ]

    self.cov = [(1.0 / self.lambda_prior) * np.eye(self.latent_dim)
                for _ in range(self.hparams.num_actions)]

    self.precision = [
        self.lambda_prior * np.eye(self.latent_dim)
        for _ in range(self.hparams.num_actions)
    ]

    # Inverse Gamma prior for each sigma2_i
    self._a0 = self.hparams.a0
    self._b0 = self.hparams.b0

    self.a = [self._a0 for _ in range(self.hparams.num_actions)]
    self.b = [self._b0 for _ in range(self.hparams.num_actions)]

    # Regression and NN Update Frequency
    self.update_freq_lr = hparams.training_freq
    self.update_freq_nn = hparams.training_freq_network

    self.t = 0
    self.optimizer_n = optimizer

    self.num_epochs = hparams.training_epochs
    self.data_h = ContextualDataset(hparams.context_dim,
                                    hparams.num_actions,
                                    intercept=False)
    self.latent_h = ContextualDataset(self.latent_dim,
                                      hparams.num_actions,
                                      intercept=False)
    self.bnn = NeuralBanditModel(optimizer, hparams, '{}-bnn'.format(name))

  def action(self, context):
    """Samples beta's from posterior, and chooses best action accordingly."""

    # Round robin until each action has been selected "initial_pulls" times
    if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
      return self.t % self.hparams.num_actions

    # Sample sigma2, and beta conditional on sigma2
    sigma2_s = [
        self.b[i] * invgamma.rvs(self.a[i])
        for i in range(self.hparams.num_actions)
    ]

    try:
      beta_s = [
          np.random.multivariate_normal(self.mu[i], sigma2_s[i] * self.cov[i])
          for i in range(self.hparams.num_actions)
      ]
    except np.linalg.LinAlgError as e:
      # Sampling could fail if covariance is not positive definite
      print('Exception when sampling for {}.'.format(self.name))
      print('Details: {} | {}.'.format(e, e.args))
      d = self.latent_dim
      beta_s = [
          np.random.multivariate_normal(np.zeros((d)), np.eye(d))
          for i in range(self.hparams.num_actions)
      ]

    # Compute last-layer representation for the current context
    with self.bnn.graph.as_default():
      c = context.reshape((1, self.hparams.context_dim))
      z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})

    # Apply Thompson Sampling to last-layer representation
    vals = [
        np.dot(beta_s[i], z_context.T) for i in range(self.hparams.num_actions)
    ]
    return np.argmax(vals)

  def update(self, context, action, reward):
    """Updates the posterior using linear bayesian regression formula."""

    self.t += 1
    self.data_h.add(context, action, reward)
    c = context.reshape((1, self.hparams.context_dim))
    z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
    self.latent_h.add(z_context, action, reward)

    # Retrain the network on the original data (data_h)
    if self.t % self.update_freq_nn == 0:

      if self.hparams.reset_lr:
        self.bnn.assign_lr()
      self.bnn.train(self.data_h, self.num_epochs)

      # Update the latent representation of every datapoint collected so far
      new_z = self.bnn.sess.run(self.bnn.nn,
                                feed_dict={self.bnn.x: self.data_h.contexts})
      self.latent_h.replace_data(contexts=new_z)

    # Update the Bayesian Linear Regression
    if self.t % self.update_freq_lr == 0:

      # Find all the actions to update
      actions_to_update = self.latent_h.actions[:-self.update_freq_lr]

      for action_v in np.unique(actions_to_update):

        # Update action posterior with formulas: \beta | z,y ~ N(mu_q, cov_q)
        z, y = self.latent_h.get_data(action_v)

        # The algorithm could be improved with sequential formulas (cheaper)
        s = np.dot(z.T, z)

        # Some terms are removed as we assume prior mu_0 = 0.
        precision_a = s + self.lambda_prior * np.eye(self.latent_dim)
        cov_a = np.linalg.inv(precision_a)
        mu_a = np.dot(cov_a, np.dot(z.T, y))

        # Inverse Gamma posterior update
        a_post = self.a0 + z.shape[0] / 2.0
        b_upd = 0.5 * np.dot(y.T, y)
        b_upd -= 0.5 * np.dot(mu_a.T, np.dot(precision_a, mu_a))
        b_post = self.b0 + b_upd

        # Store new posterior distributions
        self.mu[action_v] = mu_a
        self.cov[action_v] = cov_a
        self.precision[action_v] = precision_a
        self.a[action_v] = a_post
        self.b[action_v] = b_post

  @property
  def a0(self):
    return self._a0

  @property
  def b0(self):
    return self._b0

  @property
  def lambda_prior(self):
    return self._lambda_prior
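The per-action refit in update above is plain conjugate Bayesian linear regression on the latent features. A self-contained numpy sketch of one such refit, with z and y as synthetic stand-ins for latent_h.get_data(action_v):

import numpy as np

rng = np.random.RandomState(0)
latent_dim, n = 8, 50
lambda_prior, a0, b0 = 0.25, 6.0, 6.0

z = rng.randn(n, latent_dim)                         # latent contexts
y = z @ rng.randn(latent_dim) + 0.1 * rng.randn(n)   # rewards

# \beta | z, y ~ N(mu_a, sigma2 * cov_a), assuming prior mu_0 = 0.
precision_a = z.T @ z + lambda_prior * np.eye(latent_dim)
cov_a = np.linalg.inv(precision_a)
mu_a = cov_a @ (z.T @ y)

# Inverse Gamma posterior over the noise level sigma2.
a_post = a0 + n / 2.0
b_post = b0 + 0.5 * (y @ y - mu_a @ (precision_a @ mu_a))
print(mu_a[:3], a_post, b_post)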
class LinearFullPosteriorSampling(BanditAlgorithm):
    """Thompson Sampling with independent linear models and unknown noise var."""
    def __init__(self, name, hparams):
        """Initialize posterior distributions and hyperparameters.

    Assume a linear model for each action i: reward = context^T beta_i + noise
    Each beta_i has a Gaussian prior (lambda parameter), each sigma2_i (noise
    level) has an inverse Gamma prior (a0, b0 parameters). Mean, covariance,
    and precision matrices are initialized, and the ContextualDataset created.

    Args:
      name: Name of the algorithm.
      hparams: Hyper-parameters of the algorithm.
    """

        self.name = name
        self.hparams = hparams

        # Gaussian prior for each beta_i
        self._lambda_prior = self.hparams.lambda_prior

        self.mu = [
            np.zeros(self.hparams.context_dim + 1)
            for _ in range(self.hparams.num_actions)
        ]

        self.cov = [
            (1.0 / self.lambda_prior) * np.eye(self.hparams.context_dim + 1)
            for _ in range(self.hparams.num_actions)
        ]

        self.precision = [
            self.lambda_prior * np.eye(self.hparams.context_dim + 1)
            for _ in range(self.hparams.num_actions)
        ]

        # Inverse Gamma prior for each sigma2_i
        self._a0 = self.hparams.a0
        self._b0 = self.hparams.b0

        self.a = [self._a0 for _ in range(self.hparams.num_actions)]
        self.b = [self._b0 for _ in range(self.hparams.num_actions)]

        self.t = 0
        self.data_h = ContextualDataset(hparams.context_dim,
                                        hparams.num_actions,
                                        intercept=True)

    def action(self, context):
        """Samples beta's from posterior, and chooses best action accordingly.

    Args:
      context: Context for which the action needs to be chosen.

    Returns:
      action: Selected action for the context.
    """

        # Round robin until each action has been selected "initial_pulls" times
        if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
            return self.t % self.hparams.num_actions

        # Sample sigma2, and beta conditional on sigma2
        sigma2_s = [
            self.b[i] * invgamma.rvs(self.a[i])
            for i in range(self.hparams.num_actions)
        ]

        try:
            beta_s = [
                np.random.multivariate_normal(self.mu[i],
                                              sigma2_s[i] * self.cov[i])
                for i in range(self.hparams.num_actions)
            ]
        except np.linalg.LinAlgError as e:
            # Sampling could fail if covariance is not positive definite
            print('Exception when sampling from {}.'.format(self.name))
            print('Details: {} | {}.'.format(e, e.args))
            d = self.hparams.context_dim + 1
            beta_s = [
                np.random.multivariate_normal(np.zeros((d)), np.eye(d))
                for i in range(self.hparams.num_actions)
            ]

        # Compute sampled expected values, intercept is last component of beta
        vals = [
            np.dot(beta_s[i][:-1], context.T) + beta_s[i][-1]
            for i in range(self.hparams.num_actions)
        ]

        return np.argmax(vals)

    def update(self, context, action, reward):
        """Updates action posterior using the linear Bayesian regression formula.

    Args:
      context: Last observed context.
      action: Last observed action.
      reward: Last observed reward.
    """

        self.t += 1
        self.data_h.add(context, action, reward)

        # Update posterior of action with formulas: \beta | x,y ~ N(mu_q, cov_q)
        x, y = self.data_h.get_data(action)

        # The algorithm could be improved with sequential update formulas (cheaper)
        s = np.dot(x.T, x)

        # Some terms are removed as we assume prior mu_0 = 0.
        precision_a = s + self.lambda_prior * np.eye(self.hparams.context_dim +
                                                     1)
        cov_a = np.linalg.inv(precision_a)
        mu_a = np.dot(cov_a, np.dot(x.T, y))

        # Inverse Gamma posterior update
        a_post = self.a0 + x.shape[0] / 2.0
        b_upd = 0.5 * (np.dot(y.T, y) -
                       np.dot(mu_a.T, np.dot(precision_a, mu_a)))
        b_post = self.b0 + b_upd

        # Store new posterior distributions
        self.mu[action] = mu_a
        self.cov[action] = cov_a
        self.precision[action] = precision_a
        self.a[action] = a_post
        self.b[action] = b_post

    @property
    def a0(self):
        return self._a0

    @property
    def b0(self):
        return self._b0

    @property
    def lambda_prior(self):
        return self._lambda_prior
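action above draws from the Normal-Inverse-Gamma posterior: sigma2 is a scaled inverse Gamma draw, and beta is Gaussian conditional on sigma2. A minimal scipy/numpy sketch of that sampling step for a single arm, with made-up posterior parameters:

import numpy as np
from scipy.stats import invgamma

rng = np.random.RandomState(0)
d, a, b = 5, 8.0, 4.0
mu = np.zeros(d + 1)              # posterior mean (intercept last)
cov = np.eye(d + 1) / 0.25        # posterior covariance (lambda = 0.25)

sigma2 = b * invgamma.rvs(a, random_state=rng)    # sigma2_i ~ IG(a_i, b_i)
beta = rng.multivariate_normal(mu, sigma2 * cov)  # beta_i | sigma2_i

context = rng.randn(d)
val = np.dot(beta[:-1], context) + beta[-1]  # sampled expected reward
print(sigma2, val)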
Example No. 6
class NeuralUCBSampling(BanditAlgorithm):
    """UCB Sampling algorithm based on a neural network."""
    def __init__(self, name, hparams, bnn_model='RMSProp', optimizer='RMS'):
        """Creates a PosteriorBNNSampling object based on a specific optimizer.

    The algorithm has two basic tools: an Approx BNN and a Contextual Dataset.
    The Bayesian Network keeps the posterior based on the optimizer iterations.

    Args:
      name: Name of the algorithm.
      hparams: Hyper-parameters of the algorithm.
      bnn_model: Type of BNN. By default RMSProp (point estimate).
    """

        self.name = name
        self.hparams = hparams
        self.optimizer_n = hparams.optimizer

        self.training_freq = hparams.training_freq
        self.training_epochs = hparams.training_epochs
        self.t = 0
        self.gamma = 0

        self.bonus = np.zeros(hparams.num_actions)
        self.C1 = 0.001
        self.C2 = 0.001
        self.C3 = 0.00001
        self.data_h = ContextualDataset(hparams.context_dim,
                                        hparams.num_actions, hparams.buffer_s)

        # to be extended with more BNNs (BB alpha-div, GPs, SGFS, constSGD...)
        bnn_name = '{}-ucb'.format(name)
        self.bnn = NeuralBanditModel(self.optimizer_n, hparams, bnn_name)
        self.p = (hparams.context_dim + 1) * (hparams.layer_sizes[0]) + (
            hparams.layer_sizes[0] + 1) * (hparams.layer_sizes[0]) * (
                len(hparams.layer_sizes) - 1) + (hparams.layer_sizes[0] +
                                                 1) * hparams.num_actions
        self.Zinv = (1 / hparams.lamb) * np.eye(self.p)
        self.detZ = hparams.lamb**self.p

    def action(self, context):
        """Selects action for context based on UCB using the NN."""

        if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
            # round robin until each action has been taken "initial_pulls" times
            return self.t % self.hparams.num_actions

        with self.bnn.graph.as_default():
            c = context.reshape((1, self.hparams.context_dim))
            output = self.bnn.sess.run(self.bnn.y_pred,
                                       feed_dict={self.bnn.x: c})

            # Add confidence bound to the output.
            listTensorGradients = self.bnn.sess.run(self.bnn.gradAction,
                                                    feed_dict={self.bnn.x: c})
            bonus = []
            for act in range(self.hparams.num_actions):
                grads = np.array([])
                for el in listTensorGradients[act]:
                    grads = np.concatenate((grads, el.flatten()))
                bonus.append(self.gamma * np.sqrt(
                    grads.dot(self.Zinv.dot(grads)) /
                    self.hparams.layer_sizes[0]))
            output += np.array(bonus)
            print("Bonus of the actions", bonus)
            print("Gamma", self.gamma)

            return np.argmax(output)

    def update(self, context, action, reward):
        """Updates data buffer, and re-trains the BNN every training_freq steps."""

        self.t += 1
        self.data_h.add(context, action, reward)

        if self.t % self.training_freq == 0:
            if self.hparams.reset_lr:
                self.bnn.assign_lr()
            self.bnn.train(self.data_h, self.training_epochs)

        tensorGradients = self.bnn.sess.run(
            self.bnn.gradAction[action],
            feed_dict={self.bnn.x: context.reshape(1, -1)})
        grads = np.array([])
        for el in tensorGradients:
            grads = np.concatenate((grads, el.flatten()))

        outer = np.outer(grads, grads) / self.hparams.layer_sizes[0]
        self.detZ *= 1 + grads.dot(
            self.Zinv.dot(grads)) / self.hparams.layer_sizes[0]
        self.Zinv -= self.Zinv.dot(outer.dot(self.Zinv)) / (
            1 +
            (grads.T.dot(self.Zinv.dot(grads)) / self.hparams.layer_sizes[0]))

        el1 = np.sqrt(1 + self.C1 * ((self.hparams.layer_sizes[0])**(-1 / 6)) *
                      np.sqrt(np.log(self.hparams.layer_sizes[0])) *
                      (len(self.hparams.layer_sizes)**4) * (self.t**(7 / 6)) *
                      (self.hparams.lamb**(-7 / 6)))
        el2 = self.hparams.mu * np.sqrt(
            -np.log(self.detZ / (self.hparams.lamb**self.p)) + self.C2 *
            ((self.hparams.layer_sizes[0])**
             (-1 / 6)) * np.sqrt(np.log(self.hparams.layer_sizes[0])) *
            (len(self.hparams.layer_sizes)**4) * (self.t**(5 / 3)) *
            (self.hparams.lamb**
             (-1 / 6)) - 2 * np.log(self.hparams.delta)) + np.sqrt(
                 self.hparams.lamb) * self.hparams.S
        el3 = self.C3 * (
            (1 - self.hparams.mu * self.hparams.layer_sizes[0] *
             self.hparams.lamb)**
            (self.training_epochs) * np.sqrt(self.t / self.hparams.lamb) +
            ((self.hparams.layer_sizes[0])**
             (-1 / 6)) * np.sqrt(np.log(self.hparams.layer_sizes[0])) *
            (len(self.hparams.layer_sizes)**(7 / 2)) * (self.t**(5 / 3)) *
            (self.hparams.lamb**(-5 / 3)) *
            (1 + np.sqrt(self.t / self.hparams.lamb)))
        print("Profile Elements", el1, el2, el3)
        self.gamma = el1 * el2 + el3
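Rather than re-inverting the p x p matrix Z after every step, update maintains Zinv via the Sherman-Morrison identity and detZ via the matrix determinant lemma. A short numpy check of both updates (g, p, m, and lamb are arbitrary stand-ins; m plays the role of layer_sizes[0]):

import numpy as np

rng = np.random.RandomState(0)
p, m, lamb = 6, 4.0, 0.1

Z = lamb * np.eye(p)
Zinv = np.eye(p) / lamb
g = rng.randn(p)  # flattened gradient for the pulled action

# Rank-1 update Z_new = Z + g g^T / m, inverted in O(p^2).
denom = 1.0 + g @ Zinv @ g / m
Zinv_new = Zinv - np.outer(Zinv @ g, g @ Zinv) / (m * denom)

Z_new = Z + np.outer(g, g) / m
print(np.allclose(Zinv_new, np.linalg.inv(Z_new)))                  # True
print(np.isclose(np.linalg.det(Z) * denom, np.linalg.det(Z_new)))   # True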
Example No. 7
class LinearFullPosteriorSampling(BanditAlgorithm):
    """Thompson Sampling with independent linear models and unknown noise var."""
    def __init__(self, name, hparams):
        """Initialize posterior distributions and hyperparameters.

    Assume a linear model for each action i: reward = context^T beta_i + noise
    Each beta_i has a Gaussian prior (lambda parameter), each sigma2_i (noise
    level) has an inverse Gamma prior (a0, b0 parameters). Mean, covariance,
    and precision matrices are initialized, and the ContextualDataset created.

    Args:
      name: Name of the algorithm.
      hparams: Hyper-parameters of the algorithm.
    """

        self.name = name
        self.hparams = hparams

        # Gaussian prior for each beta_i
        self._lambda_prior = self.hparams.lambda_prior

        self.mu = [
            np.zeros(self.hparams.context_dim + 1)
            for _ in range(self.hparams.num_actions)
        ]
        self.f = [
            np.zeros(self.hparams.context_dim + 1)
            for _ in range(self.hparams.num_actions)
        ]
        self.yy = [0 for _ in range(self.hparams.num_actions)]
        self.cov = [
            (1.0 / self.lambda_prior) * np.eye(self.hparams.context_dim + 1)
            for _ in range(self.hparams.num_actions)
        ]

        self.precision = [
            self.lambda_prior * np.eye(self.hparams.context_dim + 1)
            for _ in range(self.hparams.num_actions)
        ]

        # Inverse Gamma prior for each sigma2_i
        self._a0 = self.hparams.a0
        self._b0 = self.hparams.b0

        self.a = [self._a0 for _ in range(self.hparams.num_actions)]
        self.b = [self._b0 for _ in range(self.hparams.num_actions)]

        self.t = 0
        self.intercept = True
        self.data_h = ContextualDataset(hparams.context_dim,
                                        hparams.num_actions,
                                        intercept=self.intercept)

    def action(self, context):
        """Samples beta's from posterior, and chooses best action accordingly.

    Args:
      context: Context for which the action needs to be chosen.

    Returns:
      action: Selected action for the context.
    """

        # Round robin until each action has been selected "initial_pulls" times
        if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
            return self.t % self.hparams.num_actions

        # Sample sigma2, and beta conditional on sigma2
        sigma2_s = [
            self.b[i] * invgamma.rvs(self.a[i])
            for i in range(self.hparams.num_actions)
        ]

        try:
            beta_s = [
                np.random.multivariate_normal(self.mu[i],
                                              sigma2_s[i] * self.cov[i])
                for i in range(self.hparams.num_actions)
            ]
        except np.linalg.LinAlgError as e:
            # Sampling could fail if covariance is not positive definite

            d = self.hparams.context_dim + 1
            beta_s = [
                np.random.multivariate_normal(np.zeros((d)), np.eye(d))
                for i in range(self.hparams.num_actions)
            ]

        # Compute sampled expected values, intercept is last component of beta
        vals = [
            np.dot(beta_s[i][:-1], context.T) + beta_s[i][-1]
            for i in range(self.hparams.num_actions)
        ]

        return np.argmax(vals)

    def update(self, context, action, reward):
        """Updates action posterior using the linear Bayesian regression formula.

    Args:
      context: Last observed context.
      action: Last observed action.
      reward: Last observed reward.
    """

        self.t += 1
        self.data_h.add(context, action, reward)
        if self.intercept:
            c = np.array(context[:])
            c = np.append(c, 1.0).reshape((1, self.hparams.context_dim + 1))
        else:
            c = np.array(context[:]).reshape((1, self.hparams.context_dim))
        # Update posterior of action with formulas: \beta | x,y ~ N(mu_q, cov_q)

        # Some terms are removed as we assume prior mu_0 = 0.
        self.precision[action] += np.dot(c.T, c)
        self.f[action] += (c.T * reward)[:, 0]
        self.yy[action] += reward**2
        self.cov[action] = np.linalg.inv(self.precision[action])
        self.mu[action] = np.dot(self.cov[action], self.f[action])

        # Inverse Gamma posterior update
        self.a[action] += 0.5
        b_upd = 0.5 * (self.yy[action] -
                       np.dot(self.mu[action].T,
                              np.dot(self.precision[action], self.mu[action])))
        self.b[action] = self.b0 + b_upd

    @property
    def a0(self):
        return self._a0

    @property
    def b0(self):
        return self._b0

    @property
    def lambda_prior(self):
        return self._lambda_prior

    def calc_model_evidence(self):
        vval = 0
        mp.mp.dps = 50
        for action in range(self.hparams.num_actions):
            val = mp.mpf(
                mp.fmul(mp.fneg(mp.log(mp.fmul(2.0, mp.pi))),
                        mp.fsub(self.a[action], self.a0)))
            val += mp.loggamma(self.a[action])
            val -= mp.loggamma(self.a0)
            val += 0.5 * mp.log(
                np.linalg.det(
                    self.lambda_prior * np.eye(self.hparams.context_dim + 1)))
            val -= 0.5 * mp.log(np.linalg.det(self.precision[action]))
            val += mp.fmul(self.a0, mp.log(self.b0))
            val -= mp.fmul(self.a[action], mp.log(self.b[action]))
            vval += mp.exp(val)

        vval /= float(self.hparams.num_actions)

        return vval
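Unlike the batch refit in the earlier LinearFullPosteriorSampling, this variant accumulates precision, f, and yy one observation at a time. The two are algebraically equivalent, as this synthetic-data sketch verifies:

import numpy as np

rng = np.random.RandomState(0)
d, lam, n = 4, 0.25, 20
precision = lam * np.eye(d + 1)
f = np.zeros(d + 1)

X = np.hstack([rng.randn(n, d), np.ones((n, 1))])  # contexts + intercept
y = rng.randn(n)

# Sequential updates, one (context, reward) pair at a time, as above.
for c, r in zip(X, y):
    precision += np.outer(c, c)
    f += c * r
mu = np.linalg.inv(precision) @ f

# Batch refit over all stored data, as in the earlier example.
precision_batch = X.T @ X + lam * np.eye(d + 1)
mu_batch = np.linalg.inv(precision_batch) @ (X.T @ y)
print(np.allclose(mu, mu_batch))  # True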
Example No. 10
class ParameterNoiseSampling(BanditAlgorithm):
    """Parameter Noise Sampling algorithm based on adding noise to net params.

  Described in https://arxiv.org/abs/1706.01905
  """
    def __init__(self, name, hparams):
        """Creates the algorithm, and sets up the adaptive Gaussian noise."""

        self.name = name
        self.hparams = hparams
        self.verbose = getattr(self.hparams, 'verbose', True)
        self.noise_std = getattr(self.hparams, 'noise_std', 0.005)
        self.eps = getattr(self.hparams, 'eps', 0.05)
        self.d_samples = getattr(self.hparams, 'd_samples', 300)
        self.optimizer = getattr(self.hparams, 'optimizer', 'RMS')

        # keep track of noise heuristic statistics
        self.std_h = [self.noise_std]
        self.eps_h = [self.eps]
        self.kl_h = []
        self.t = 0

        self.freq_update = hparams.training_freq
        self.num_epochs = hparams.training_epochs

        self.data_h = ContextualDataset(hparams.context_dim,
                                        hparams.num_actions, hparams.buffer_s)
        self.bnn = NeuralBanditModel(self.optimizer, hparams,
                                     '{}-bnn'.format(name))

        with self.bnn.graph.as_default():

            # noise-injection std placeholder
            self.bnn.noise_std_ph = tf.placeholder(tf.float32, shape=())

            # create noise corruption op; adds noise to all weights
            tvars = tf.trainable_variables()
            self.bnn.noisy_grads = [
                tf.random_normal(v.get_shape(), 0, self.bnn.noise_std_ph)
                for v in tvars
            ]

            # add noise to all params, then compute prediction, then subtract.
            with tf.control_dependencies(self.bnn.noisy_grads):
                self.bnn.noise_add_ops = [
                    tvars[i].assign_add(n)
                    for i, n in enumerate(self.bnn.noisy_grads)
                ]
                with tf.control_dependencies(self.bnn.noise_add_ops):
                    # we force the prediction for 'y' to be recomputed after adding noise
                    self.bnn.noisy_nn, self.bnn.noisy_pred_val = self.bnn.forward_pass(
                    )

                    self.bnn.noisy_pred = tf.identity(self.bnn.noisy_pred_val)
                    with tf.control_dependencies(
                        [tf.identity(self.bnn.noisy_pred)]):
                        self.bnn.noise_sub_ops = [
                            tvars[i].assign_add(-n)
                            for i, n in enumerate(self.bnn.noisy_grads)
                        ]

    def action(self, context):
        """Selects action based on Thompson Sampling *after* adding noise."""

        if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
            # round robin until each action has been taken "initial_pulls" times
            return self.t % self.hparams.num_actions

        with self.bnn.graph.as_default():
            # run noise prediction op to choose action, and subtract noise op after.
            c = context.reshape((1, self.hparams.context_dim))
            output, _ = self.bnn.sess.run(
                [self.bnn.noisy_pred, self.bnn.noise_sub_ops],
                feed_dict={
                    self.bnn.x: c,
                    self.bnn.noise_std_ph: self.noise_std
                })
            return np.argmax(output)

    def update(self, context, action, reward):
        """Updates the data buffer, and re-trains the BNN and noise level."""

        self.t += 1
        self.data_h.add(context, action, reward)

        if self.t % self.freq_update == 0:
            self.bnn.train(self.data_h, self.num_epochs)
            self.update_noise()

    def update_noise(self):
        """Increase noise if distance btw original and corrupted distrib small."""

        kl = self.compute_distance()
        delta = -np.log1p(-self.eps + self.eps / self.hparams.num_actions)

        if kl < delta:
            self.noise_std *= 1.01
        else:
            self.noise_std /= 1.01

        self.eps *= 0.99

        if self.verbose:
            print('Update eps={} | kl={} | std={} | delta={} | increase={}.'.
                  format(self.eps, kl, self.noise_std, delta, kl < delta))

        # store noise-injection statistics for inspection: std, KL, eps.
        self.std_h.append(self.noise_std)
        self.kl_h.append(kl)
        self.eps_h.append(self.eps)

    def compute_distance(self):
        """Computes empirical KL for original and corrupted output distributions."""

        random_inputs, _ = self.data_h.get_batch(self.d_samples)
        y_model = self.bnn.sess.run(self.bnn.y_pred,
                                    feed_dict={
                                        self.bnn.x: random_inputs,
                                        self.bnn.noise_std_ph: self.noise_std
                                    })
        y_noisy, _ = self.bnn.sess.run(
            [self.bnn.noisy_pred, self.bnn.noise_sub_ops],
            feed_dict={
                self.bnn.x: random_inputs,
                self.bnn.noise_std_ph: self.noise_std
            })

        if self.verbose:
            # display how often original & perturbed models propose different actions
            s = np.sum([
                np.argmax(y_model[i, :]) == np.argmax(y_noisy[i, :])
                for i in range(y_model.shape[0])
            ])
            print('{} | % of agreement btw original / corrupted actions: {}.'.
                  format(self.name, s / self.d_samples))

        kl = self.compute_kl_with_logits(y_model, y_noisy)
        return kl

    def compute_kl_with_logits(self, logits1, logits2):
        """Computes KL from logits samples from two distributions."""
        def exp_times_diff(a, b):
            return np.multiply(np.exp(a), a - b)

        logsumexp1 = logsumexp(logits1, axis=1)
        logsumexp2 = logsumexp(logits2, axis=1)
        logsumexp_diff = logsumexp2 - logsumexp1

        exp_diff = exp_times_diff(logits1, logits2)
        exp_diff = np.sum(exp_diff, axis=1)

        inv_exp_sum = np.sum(np.exp(logits1), axis=1)
        term1 = np.divide(exp_diff, inv_exp_sum)

        kl = term1 + logsumexp_diff
        kl = np.maximum(kl, 0.0)
        kl = np.nan_to_num(kl)
        return np.mean(kl)
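compute_kl_with_logits works entirely in logit space, avoiding an explicit softmax. The sketch below (synthetic logits) checks that it matches the direct KL divergence between the corresponding softmax distributions:

import numpy as np
from scipy.special import logsumexp, softmax

rng = np.random.RandomState(0)
logits1 = rng.randn(5, 3)
logits2 = logits1 + 0.1 * rng.randn(5, 3)

# Direct per-row KL(softmax(logits1) || softmax(logits2)).
p, q = softmax(logits1, axis=1), softmax(logits2, axis=1)
kl_direct = np.mean(np.sum(p * (np.log(p) - np.log(q)), axis=1))

# Logit-space form used by compute_kl_with_logits above.
term1 = (np.sum(np.exp(logits1) * (logits1 - logits2), axis=1)
         / np.sum(np.exp(logits1), axis=1))
kl_logits = np.mean(term1 + logsumexp(logits2, axis=1)
                    - logsumexp(logits1, axis=1))
print(np.isclose(kl_direct, kl_logits))  # True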
Example No. 11
class NeuralLinearEpsilonGreedy(BanditAlgorithm):
    """Full Bayesian linear regression on the last layer of a deep neural net."""
    def __init__(self, name, hparams, textflag='yes', optimizer='RMS'):

        self.name = name
        self.hparams = hparams
        self.epsilon = self.hparams.epsilon
        self.latent_dim = self.hparams.layer_sizes[-1]
        self.intercept = True
        if self.intercept:
            self.param_dim = 1 + self.latent_dim
        else:
            self.param_dim = self.latent_dim
        # Gaussian prior for each beta_i

        # Regression and NN Update Frequency
        self.update_freq_lr = hparams.training_freq
        self.update_freq_nn = hparams.training_freq_network

        self.t = 0
        self.optimizer_n = optimizer

        self.num_epochs = hparams.training_epochs
        self.data_h = ContextualDataset(hparams.context_dim,
                                        hparams.num_actions,
                                        intercept=False)
        self.latent_h = ContextualDataset(self.latent_dim,
                                          hparams.num_actions,
                                          intercept=self.intercept)
        if textflag == 'yes':
            self.bnn = TextCNN('adam', self.hparams.num_actions,
                               self.hparams.batch_size, '{}-bnn'.format(name))
        else:
            self.bnn = NeuralBanditModel(optimizer, hparams,
                                         '{}-bnn'.format(name))

    def action(self, context):
        """Samples beta's from posterior, and chooses best action accordingly."""

        # Round robin until each action has been selected "initial_pulls" times
        if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
            return self.t % self.hparams.num_actions

        with self.bnn.graph.as_default():
            c = context.reshape((1, self.hparams.context_dim))
            y = self.bnn.sess.run(self.bnn.y_pred, feed_dict={self.bnn.x: c})
            if random.random() > self.epsilon:
                return np.argmax(y)
            else:
                return random.randrange(self.hparams.num_actions)

    def update(self, context, action, reward):
        """Updates the posterior using linear bayesian regression formula."""

        self.t += 1
        self.data_h.add(context, action, reward)
        c = context.reshape((1, self.hparams.context_dim))
        z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
        self.latent_h.add(z_context, action, reward)

        # Retrain the network on the original data (data_h)
        if self.t % self.update_freq_nn == 0:

            if self.hparams.reset_lr:
                self.bnn.assign_lr()
            self.bnn.train(self.data_h, self.num_epochs)

    @property
    def a0(self):
        return self._a0

    @property
    def b0(self):
        return self._b0

    @property
    def lambda_prior(self):
        return self._lambda_prior
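The selection rule above is standard epsilon-greedy over the network's predicted rewards. A tiny sketch, with y standing in for the session output:

import random
import numpy as np

rng = np.random.RandomState(0)
epsilon, num_actions = 0.05, 4
y = rng.randn(1, num_actions)  # stand-in for sess.run(y_pred, ...)

if random.random() > epsilon:
    a = int(np.argmax(y))              # exploit the prediction
else:
    a = random.randrange(num_actions)  # explore uniformly
print(a)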
Example No. 12
class NeuralGreedy(BanditAlgorithm):
    """Full Bayesian linear regression on the last layer of a deep neural net."""
    def __init__(self, name, hparams, optimizer='RMS'):

        self.name = name
        self.eps = 0.9
        self.decay = 0.99  # computed for 10,000 steps
        self.hparams = hparams

        # Regression and NN Update Frequency
        self.update_freq_lr = hparams.training_freq
        self.update_freq_nn = hparams.training_freq_network

        self.t = 0
        self.optimizer_n = optimizer

        self.num_epochs = hparams.training_epochs
        self.data_h = ContextualDataset(hparams.context_dim,
                                        hparams.num_actions,
                                        intercept=False)
        self.bnn = NeuralBanditModel(optimizer, hparams,
                                     '{}-greedy'.format(name))

    def action(self, context):
        """Samples beta's from posterior, and chooses best action accordingly."""

        # Round robin until each action has been selected "initial_pulls" times
        #if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
        #return self.t % self.hparams.num_actions ## No need with greedy

        if np.random.random() < self.eps:
            return np.random.choice(range(self.hparams.num_actions))
        else:
            with self.bnn.graph.as_default():
                c = context.reshape((1, self.hparams.context_dim))
                output = self.bnn.sess.run(self.bnn.y_pred,
                                           feed_dict={self.bnn.x: c})
                return np.argmax(output)

    def update(self, context, action, reward):
        """Updates the posterior using linear bayesian regression formula."""

        self.t += 1
        self.eps *= self.decay
        self.data_h.add(context, action, reward)
        c = context.reshape((1, self.hparams.context_dim))

        # Retrain the network on the original data (data_h)
        if self.t % self.update_freq_nn == 0:

            if self.hparams.reset_lr:
                self.bnn.assign_lr()
            self.bnn.train(self.data_h, self.num_epochs)

    @property
    def a0(self):
        return self._a0

    @property
    def b0(self):
        return self._b0

    @property
    def lambda_prior(self):
        return self._lambda_prior
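Here eps starts at 0.9 and is multiplied by 0.99 after every update, so exploration decays geometrically; the "computed for 10,000 steps" comment above refers to this horizon. A one-liner to inspect the schedule:

import numpy as np

eps, decay = 0.9, 0.99
steps = np.array([0, 100, 1000, 10000])
print(eps * decay ** steps)
# roughly [0.9, 0.33, 4e-5, 2e-44]: exploration is negligible long
# before 10,000 steps.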
Example No. 14
class NeuralLinUCB(BanditAlgorithm):
    """LinUCB with an exploration bonus computed on last-layer features."""
    def __init__(self, name, hparams, optimizer='RMS'):

        self.name = name
        self.hparams = hparams
        self.n_a = self.hparams.num_actions
        self.n_d = self.hparams.layer_sizes[-1]
        self.alpha = self.hparams.alpha
        self.lam = self.hparams.lam

        self.a = self.lam * np.stack([np.eye(self.n_d)] * self.n_a)
        self.inv_a = np.stack([np.eye(self.n_d)] * self.n_a) / self.lam

        self.b = np.zeros((self.n_a, self.n_d))

        self.theta = np.zeros((self.n_a, self.n_d))

        # Params for BNN

        self.update_freq_nn = hparams.training_freq_network

        self.t = 0
        self.optimizer_n = optimizer

        self.num_epochs = hparams.training_epochs
        self.data_h = ContextualDataset(hparams.context_dim,
                                        hparams.num_actions,
                                        bootstrap=getattr(hparams, 'bootstrap', None),
                                        intercept=False)

        self.bnn = NeuralBanditModel(optimizer, hparams, '{}-bnn'.format(name))

    def action(self, context):
        """

        Args:
          context: Context for which the action need to be chosen.

        Returns:
          action: Selected action for the context.
        """

        with self.bnn.graph.as_default():
            c = context.reshape((1, self.hparams.context_dim))
            z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c}).flatten()

        # UCB score per arm: theta_a^T z + alpha * sqrt(z^T A_a^{-1} z).
        vals = np.array([
            np.dot(self.theta[i], z_context)
            + self.alpha * np.sqrt(
                np.dot(z_context, np.dot(self.inv_a[i], z_context)))
            for i in range(self.n_a)])

        return np.argmax(vals)

    def update(self, context, action, reward):
        """Updates action posterior using the linear Bayesian regression formula.

        Args:
          context: Last observed context.
          action: Last observed action.
          reward: Last observed reward.
        """
        self.t += 1
        self.data_h.add(context, action, reward)

        if self.t % self.update_freq_nn == 0:
            if self.hparams.reset_lr:
                self.bnn.assign_lr()
            self.bnn.train(self.data_h, self.num_epochs)

            # Recompute the latent representation of every stored context.
            contexts = self.bnn.sess.run(
                self.bnn.nn, feed_dict={self.bnn.x: self.data_h.contexts})
            actions = np.array(self.data_h.actions)
            # data_h.rewards has shape (n_samples, n_actions); select the
            # reward of the action actually pulled at each step.
            rewards = self.data_h.rewards[np.arange(actions.shape[0]), actions]

            # Rebuild per-arm statistics from scratch, using only the contexts
            # on which each arm was pulled (matching the rank-one updates in
            # the else-branch below).
            self.a = np.stack([
                self.lam * np.eye(self.n_d)
                + np.dot(contexts[actions == act].T, contexts[actions == act])
                for act in range(self.n_a)])
            self.b = np.stack([
                np.dot(rewards[actions == act], contexts[actions == act])
                for act in range(self.n_a)])
            self.inv_a = np.stack(
                [np.linalg.inv(self.a[act]) for act in range(self.n_a)])
            self.theta = np.stack([
                np.dot(self.inv_a[act], self.b[act])
                for act in range(self.n_a)])

        else:
            c = context.reshape((1, self.hparams.context_dim))
            z_context = self.bnn.sess.run(
                self.bnn.nn, feed_dict={self.bnn.x: c}).flatten()

            # Rank-one update of the pulled arm's statistics.
            self.a[action] += np.outer(z_context, z_context)
            self.inv_a[action] = np.linalg.inv(self.a[action])
            self.b[action] += reward * z_context
            self.theta[action] = np.dot(self.inv_a[action], self.b[action])
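
A note on the rank-one branch above: np.linalg.inv costs O(d^3) per step, while a Sherman-Morrison update maintains the same inverse in O(d^2). A minimal sketch (the helper name is illustrative, not part of the original code):

import numpy as np

def sherman_morrison_update(inv_a, z):
    """Returns (A + z z^T)^{-1} given inv_a = A^{-1} and a vector z."""
    az = inv_a.dot(z)
    return inv_a - np.outer(az, az) / (1.0 + z.dot(az))

# Usage in the else-branch would then be:
#   self.inv_a[action] = sherman_morrison_update(self.inv_a[action], z_context)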
class NeuralLinearPosteriorSampling(BanditAlgorithm):
  """Full Bayesian linear regression on the last layer of a deep neural net."""

  def __init__(self, name, hparams, textflag='no', optimizer='RMS'):

    self.name = name
    self.hparams = hparams
    self.latent_dim = self.hparams.layer_sizes[-1]
    self.intercept = False
    if self.intercept:
      self.param_dim = 1 + self.latent_dim
    else:
      self.param_dim = self.latent_dim
    # Gaussian prior for each beta_i
    self._lambda_prior = self.hparams.lambda_prior

    self.mu = [
        np.zeros(self.param_dim)
        for _ in range(self.hparams.num_actions)
    ]

    self.f = [
        np.zeros(self.param_dim)
        for _ in range(self.hparams.num_actions)
    ]
    # Running sum of squared rewards per action, used in the Inverse Gamma update.
    self.yy = [0 for _ in range(self.hparams.num_actions)]

    self.cov = [(1.0 / self.lambda_prior) * np.eye(self.param_dim)
                for _ in range(self.hparams.num_actions)]

    self.precision = [
        self.lambda_prior * np.eye(self.param_dim)
        for _ in range(self.hparams.num_actions)
    ]

    # Inverse Gamma prior for each sigma2_i
    self._a0 = self.hparams.a0
    self._b0 = self.hparams.b0

    self.a = [self._a0 for _ in range(self.hparams.num_actions)]
    self.b = [self._b0 for _ in range(self.hparams.num_actions)]

    # Regression and NN Update Frequency
    self.update_freq_lr = hparams.training_freq
    self.update_freq_nn = hparams.training_freq_network

    self.t = 0
    self.optimizer_n = optimizer

    self.num_epochs = hparams.training_epochs
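    # data_h keeps raw contexts for network training; latent_h keeps their
    # last-layer representations for the Bayesian linear regression head.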
    self.data_h = ContextualDataset(hparams.context_dim,
                                    hparams.num_actions,
                                    intercept=False)
    self.latent_h = ContextualDataset(self.latent_dim,
                                      hparams.num_actions,
                                      intercept=self.intercept)
    if textflag == 'yes':
      self.bnn = TextCNN('adam', self.hparams.num_actions,
                         self.hparams.batch_size, '{}-bnn'.format(name))
    else:
      self.bnn = NeuralBanditModel(optimizer, hparams, '{}-bnn'.format(name))

  def action(self, context):
    """Samples beta's from posterior, and chooses best action accordingly."""

    # Round robin until each action has been selected "initial_pulls" times
    if self.t < self.hparams.num_actions * self.hparams.initial_pulls:
      return self.t % self.hparams.num_actions

    # Sample sigma2, and beta conditional on sigma2
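    # scipy's invgamma.rvs(a) samples with unit scale; multiplying by b[i]
    # yields a draw from InvGamma(a[i], scale=b[i]).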
    sigma2_s = [
        self.b[i] * invgamma.rvs(self.a[i])
        for i in range(self.hparams.num_actions)
    ]

    try:
      beta_s = [
          np.random.multivariate_normal(self.mu[i], sigma2_s[i] * self.cov[i])
          for i in range(self.hparams.num_actions)
      ]
    except np.linalg.LinAlgError:
      # Sampling could fail if the covariance is not positive definite;
      # fall back to draws from a standard normal prior.
      d = self.param_dim
      beta_s = [
          np.random.multivariate_normal(np.zeros(d), np.eye(d))
          for i in range(self.hparams.num_actions)
      ]

    # Compute last-layer representation for the current context
    with self.bnn.graph.as_default():
      c = context.reshape((1, self.hparams.context_dim))
      z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
      if self.intercept:
        z_context = np.append(z_context, 1.0).reshape((1, self.latent_dim + 1))
    # Apply Thompson Sampling to last-layer representation
    vals = [
        np.dot(beta_s[i], z_context.T) for i in range(self.hparams.num_actions)
    ]
    return np.argmax(vals)

  def update(self, context, action, reward):
    """Updates the posterior using linear bayesian regression formula."""

    self.t += 1
    self.data_h.add(context, action, reward)
    c = context.reshape((1, self.hparams.context_dim))
    z_context = self.bnn.sess.run(self.bnn.nn, feed_dict={self.bnn.x: c})
    self.latent_h.add(z_context, action, reward)

    # Retrain the network on the original data (data_h)
    if self.t % self.update_freq_nn == 0:

      if self.hparams.reset_lr:
        self.bnn.assign_lr()
      self.bnn.train(self.data_h, self.num_epochs)

      # Update the latent representation of every datapoint collected so far
      new_z = self.bnn.sess.run(self.bnn.nn,
                                feed_dict={self.bnn.x: self.data_h.contexts})
      self.latent_h.replace_data(contexts=new_z)
      for action_v in range(self.hparams.num_actions):
        # Update action posterior with formulas: \beta | z, y ~ N(mu_q, cov_q)
        z, y = self.latent_h.get_data(action_v)

        # The algorithm could be improved with sequential formulas (cheaper).
        # New precision Lambda_n = Z^T Z + lambda_prior * I.
        self.precision[action_v] = (
            np.dot(z.T, z) + self.lambda_prior * np.eye(self.param_dim))
        self.f[action_v] = np.dot(z.T, y)
    else:
      if self.intercept:
        z_context = np.append(z_context, 1.0).reshape((1, self.latent_dim + 1))
      self.precision[action] += np.dot(z_context.T, z_context)
      self.f[action] += (z_context.T * reward)[:, 0]
    self.yy[action] += reward ** 2
    # Posterior mean and covariance: cov_n = Lambda_n^{-1}, mu_n = cov_n f_n.
    self.cov[action] = np.linalg.inv(self.precision[action])
    self.mu[action] = np.dot(self.cov[action], self.f[action])

    # Inverse Gamma posterior update:
    #   a_n = a_0 + n/2,  b_n = b_0 + (y^T y - mu_n^T Lambda_n mu_n) / 2.
    self.a[action] += 0.5
    b_upd = 0.5 * (self.yy[action] - np.dot(
        self.mu[action].T, np.dot(self.precision[action], self.mu[action])))
    self.b[action] = self.b0 + b_upd

  @property
  def a0(self):
    return self._a0

  @property
  def b0(self):
    return self._b0

  @property
  def lambda_prior(self):
    return self._lambda_prior

  def calc_model_evidence(self):
    """Computes the model evidence, averaged over actions."""
    vval = 0
    mp.mp.dps = 50  # high decimal precision; the evidence under/overflows in float64
    for action in range(self.hparams.num_actions):
      # Accumulate the log evidence of the Normal-Inverse-Gamma model in
      # log-space, then exponentiate.
      val = -mp.log(2.0 * mp.pi) * mp.mpf(self.a[action] - self.a0)
      val += mp.loggamma(self.a[action]) - mp.loggamma(self.a0)
      # Prior and posterior precision determinants; both matrices must have
      # the same dimension (param_dim).
      val += 0.5 * mp.log(np.linalg.det(self.lambda_prior * np.eye(self.param_dim)))
      val -= 0.5 * mp.log(np.linalg.det(self.precision[action]))
      val += self.a0 * mp.log(self.b0)
      val -= self.a[action] * mp.log(self.b[action])
      vval += mp.exp(val)

    vval /= float(self.hparams.num_actions)

    return vval
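
As a cross-check on calc_model_evidence, the per-arm log evidence can also be written without mpmath, using scipy.special.gammaln and slogdet; a sketch under the same Normal-Inverse-Gamma prior assumptions (the function name is illustrative, not part of the original code):

import numpy as np
from scipy.special import gammaln

def log_evidence(a0, b0, a_n, b_n, lam, precision_n):
  # log p(y) for Bayesian linear regression with a NIG(a0, b0, lam * I) prior:
  #   -(a_n - a0) log(2 pi) + log Gamma(a_n) - log Gamma(a0)
  #   + 0.5 * [log det(lam * I) - log det(Lambda_n)]
  #   + a0 log b0 - a_n log b_n
  d = precision_n.shape[0]
  val = -(a_n - a0) * np.log(2.0 * np.pi)
  val += gammaln(a_n) - gammaln(a0)
  val += 0.5 * (d * np.log(lam) - np.linalg.slogdet(precision_n)[1])
  val += a0 * np.log(b0) - a_n * np.log(b_n)
  return val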