Example #1
  def __init__(self, Y_given_X, meta_feature, meta_weight, epsilon=None):
    """Construct a new probabilistic model.

    Args:
      Y_given_X: a function from ``X`` to an iterable object giving the
        subset of ``Y`` which has non-zero probability given that
        ``x``. When in doubt about whether some ``y`` has zero probability
        or not, it is always safe/correct to return a larger subset of
        ``Y`` (it will just take more computation time). This is
        needed for computing the partition function and expectation. N.B.,
        we do not actually need to know/enumerate *all* of ``Y``, only
        the subsets for each ``x``.
      meta_feature (MetaFeature): A function ``X -> Y -> MetaFeatureValue``
        giving all the features. N.B., for all ``x`` and ``y`` the length
        of ``meta_feature(x)(y)`` must be the same as the length of
        ``meta_weight``.
      meta_weight (MetaWeight): The weights for the features. The keys of the
        dictionary are the names of the features the weights are for. We take
        this argument as a dict rather than as a list so that callers needn't
        worry about what order to provide the weights in.
      epsilon (float): The absolute-error threshold for considering a
        weight to be "equal to zero". N.B., this should be a positive
        number, as we will compare it against the absolute value of
        each weight.
    """
    super(LogLinearModel, self).__init__(meta_feature, meta_weight, epsilon)

    self._Y = Y_given_X

    def _LogZ(x):
      score_given_x = self._scores(x)
      return logsumexp([score_given_x(y) for y in self._Y(x)])
    self._zetas = MemoizedFunction(_LogZ)
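
The ``_LogZ`` helper above computes the log of the partition function with
``logsumexp`` rather than ``math.log(sum(math.exp(s) for s in scores))``,
because the scores are log-domain values whose exponentials can overflow.
A minimal sketch of the max-shift trick behind this (here ``logsumexp``
itself is assumed to come from a numerics library such as scipy):

import math

def _LogSumExp(xs):
  """Numerically stable log(sum(exp(x) for x in xs)) via the max-shift trick."""
  m = max(xs)
  return m + math.log(sum(math.exp(x - m) for x in xs))

# math.exp(1000.0) overflows to inf, but the shifted form never overflows:
assert _LogSumExp([1000.0, 1000.0]) == 1000.0 + math.log(2.0)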
Example #2
  def testClearMemos(self):
    """``MemoizedFunction._ClearMemos`` does actually clear the memos.

    That is, we call the underlying function once (to set the memo),
    then swap out the underlying function for a different one (to be
    sure we know whether the next ``__call__`` goes to the memos or to
    the function), and finally clear the memos and check.
    """
    f = MemoizedFunction(_F)
    f(5)
    f._f = _G
    f.ClearMemos()
    self.assertEqual(_G(5), f(5))
Example #3
  def __init__(self, meta_feature, meta_weight, epsilon=None):
    """Construct a new model with the meta_feature and meta_weight.

    Args:
      meta_feature (MetaFeature): A function ``X -> Y -> MetaFeatureValue``
        giving all the features. N.B., for all ``x`` and ``y`` the length
        of ``meta_feature(x)(y)`` must be the same as the length of
        ``meta_weight``.
      meta_weight (MetaWeight): The weights for the features. The keys of the
        dictionary are the names of the features the weights are for. We take
        this argument as a dict rather than as a list so that callers needn't
        worry about what order to provide the weights in.
      epsilon (float): The absolute-error threshold for considering a
        weight to be "equal to zero". N.B., this should be a positive
        number, as we will compare it against the absolute value of
        each weight.
    """
    self._epsilon = EPSILON if epsilon is None else epsilon

    self._meta_weight = meta_weight
    self._meta_weight.DropZeroWeights(self._epsilon)

    self._quadrance = None

    # TODO(crbug.com/674752): we need better names for ``self._features``.
    def _Features(x):
      """Wrap ``meta_feature`` to memoize things.

      This outer wrapping takes each ``x`` to a memoized instance of
      ``_FeaturesGivenX``. That is, for each ``x`` we return a
      ``MemoizedFunction`` from ``Y`` to ``dict(str to FeatureValue)``.
      """
      # Memoize on ``Y``, to ensure we don't need to recompute
      # ``FeatureValue``s nor recheck the lengths.
      return MemoizedFunction(meta_feature(x))

    # Memoize on ``X``, to ensure we share the memo tables on ``Y``.
    self._features = MemoizedFunction(_Features)

    # TODO(crbug.com/674752): we need better names for ``self._scores``.
    # N.B., this is just the inner product of ``self._meta_weight``
    # against ``self._features(x)``. If we can compute this in some
    # more efficient way, we should. In particular, we will want to
    # make the weights sparse, in which case we need to use a sparse
    # variant of the dot product.
    self._scores = MemoizedFunction(lambda x: self._features(x).map(
        lambda fxy: self._meta_weight * fxy))
Example #4
    def _Features(x):
      """Wrap ``meta_feature`` to memoize things.

      This outer wrapping takes each ``x`` to a memoized instance of
      ``_FeaturesGivenX``. That is, for each ``x`` we return a
      ``MemoizedFunction`` from ``Y`` to ``dict(str to FeatureValue)``.
      """
      # Memoize on ``Y``, to ensure we don't need to recompute
      # ``FeatureValue``s nor recheck the lengths.
      return MemoizedFunction(meta_feature(x))
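
This is the same inner helper that appears in Examples 3 and 9. The point of
the two-level wrapping is sharing: memoizing on ``x`` a function that itself
returns a ``MemoizedFunction`` on ``y`` means every lookup for the same ``x``
reuses one per-``x`` memo table. A minimal sketch of the pattern, using
Python 3's ``functools.lru_cache`` as a stand-in for ``MemoizedFunction``
(the feature computation is hypothetical):

import functools

@functools.lru_cache(maxsize=None)      # memoize on ``X`` ...
def features(x):
  @functools.lru_cache(maxsize=None)    # ... sharing one memo table per ``x``
  def features_given_x(y):
    print('computing features(%r)(%r)' % (x, y))
    return {'some_feature': x * y}      # hypothetical feature values
  return features_given_x

features(1)(2)   # prints: computing features(1)(2)
features(1)(2)   # both caches hit; nothing is printed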
Example #5
  def testNotToMemorizeIfXIsNotHashable(self):
    """``MemoizedFunction`` won't memorize when x is not hashable.

    We cannot always requir that the input of MemoizedFunction is hashable,
    that requirement would be too strict. So when the x is not hashable,
    we just bail out and run the internal function instead of getting result
    from memo."""
    f = MemoizedFunction(_E)
    y = f({1: 3})
    self.assertEqual(y, 4)
    self.assertEqual(f._memos, {})
Example #6
  def testMemoization(self):
    """``MemoizedFunction.__call__`` actually does memoize.

    That is, we call the underlying function once (to set the memo), then
    we discard the underlying function (to be sure the next ``__call__``
    is handled from the memos), and finally call the function to check.
    """
    f = MemoizedFunction(_F)
    f(5)
    del f._f
    self.assertEqual(_F(5), f(5))
Example #7
  def testMemoizedFunctionMap(self):
    """``MemoizedFunction.map`` composes functions as described."""
    self.assertEqual(_G(_F(5)), MemoizedFunction(_F).map(_G)(5))
    self.assertEqual(_F(_G(5)), MemoizedFunction(_G).map(_F)(5))
Example #8
  def testMemoizedFunctionCall(self):
    """``MemoizedFunction.__call__`` returns same value as its callable."""
    self.assertEqual(_F(5), MemoizedFunction(_F)(5))
    self.assertEqual(_G(5), MemoizedFunction(_G)(5))
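
Examples 2 and 5 through 8 pin down the behavior of ``MemoizedFunction``: it
memoizes on hashable inputs, bails out to the underlying callable on
unhashable ones, supports ``ClearMemos``, and its ``map`` post-composes a
function. The class itself does not appear on this page; a minimal sketch
consistent with those tests (the real implementation may differ in details):

class MemoizedFunction(object):
  """Sketch of a memoizing wrapper, reconstructed from the tests above."""

  def __init__(self, f):
    self._f = f        # the underlying callable
    self._memos = {}   # memo table, keyed by hashable inputs

  def __call__(self, x):
    try:
      hash(x)
    except TypeError:
      # Unhashable input: call the function directly and leave
      # ``self._memos`` untouched (cf. Example 5).
      return self._f(x)
    if x not in self._memos:
      self._memos[x] = self._f(x)
    return self._memos[x]

  def ClearMemos(self):
    """Discard all memos (cf. Example 2)."""
    self._memos = {}

  def map(self, g):
    """Return a new ``MemoizedFunction`` computing ``g(self(x))``."""
    return MemoizedFunction(lambda x: g(self(x)))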
Example #9
class UnnormalizedLogLinearModel(object):
  """An unnormalized loglinear model.

  We use this model for combining a bunch of different features in order
  to decide who to blame. In particular, the nice thing about loglinear
  models is that (a) they allow separating the amount of weight we
  give to each feature from the feature itself, and (b) they allow us
  to keep adding features at will without needing to worry about the
  fiddly details needed for it to be a probability model.

  Throughout the documentation we use the following conventional
  terminology. The independent variable (aka: the given data which
  sets up the classification problem) is called ``X``, where particular
  values of that variable are called ``x``. The dependent variable (aka:
  the answers/labels returned by classification) is called ``Y``, where
  particular values of that random variable are called ``y``. The
  partition function is called ``Z``. And (somewhat non-conventionally)
  we will call the log of the partition function ``zeta``.

  This class is distinct from ``LogLinearModel`` in that we do not require
  a specification of ``Y``. This class is sufficient for determining
  which ``y`` is the most likely, though it can only return a "score"
  rather than returning a probability per se.
  """

  def __init__(self, meta_feature, meta_weight, epsilon=None):
    """Construct a new model with the meta_feature and meta_weight.

    Args:
      meta_feature (MetaFeature): A function ``X -> Y -> MetaFeatureValue``
        giving all the features. N.B., for all ``x`` and ``y`` the length
        of ``meta_feature(x)(y)`` must be the same as the length of
        ``meta_weight``.
      meta_weight (MetaWeight): The weights for the features. The keys of the
        dictionary are the names of the features the weights are for. We take
        this argument as a dict rather than as a list so that callers needn't
        worry about what order to provide the weights in.
      epsilon (float): The absolute-error threshold for considering a
        weight to be "equal to zero". N.B., this should be a positive
        number, as we will compare it against the absolute value of
        each weight.
    """
    self._epsilon = EPSILON if epsilon is None else epsilon

    self._meta_weight = meta_weight
    self._meta_weight.DropZeroWeights(self._epsilon)

    self._quadrance = None

    # TODO(crbug.com/674752): we need better names for ``self._features``.
    def _Features(x):
      """Wrap ``meta_feature`` to memoize things.

      This outer wrapping takes each ``x`` to a memoized instance of
      ``_FeaturesGivenX``. That is, for each ``x`` we return a
      ``MemoizedFunction`` from ``Y`` to ``dict(str to FeatureValue)``.
      """
      # Memoize on ``Y``, to ensure we don't need to recompute
      # ``FeatureValue``s nor recheck the lengths.
      return MemoizedFunction(meta_feature(x))

    # Memoize on ``X``, to ensure we share the memo tables on ``Y``.
    self._features = MemoizedFunction(_Features)

    # TODO(crbug.com/674752): we need better names for ``self._scores``.
    # N.B., this is just the inner product of ``self._meta_weight``
    # against ``self._features(x)``. If we can compute this in some
    # more efficient way, we should. In particular, we will want to
    # make the weights sparse, in which case we need to use a sparse
    # variant of the dot product.
    self._scores = MemoizedFunction(lambda x: self._features(x).map(
        lambda fxy: self._meta_weight * fxy))

  def ClearWeightBasedMemos(self):
    """Clear all the memos that depend on the weight covector."""
    self._quadrance = None
    self._scores.ClearMemos()

  def ClearAllMemos(self):
    """Clear all memos, even those independent of the weight covector."""
    self.ClearWeightBasedMemos()
    self._features.ClearMemos()

  @property
  def meta_weight(self):
    """The weight covector.

    At present we return the weights as a dict mapping each feature name to its
    weight, but in the future that may be replaced by a more general type which
    specifies the semantics rather than the implementation details.
    """
    return self._meta_weight

  @meta_weight.setter
  def meta_weight(self, new_meta_weight):
    self._meta_weight = new_meta_weight
    self._meta_weight.DropZeroWeights(self._epsilon)

  @property
  def l0(self): # pragma: no cover
    """The l0-norm of the weight covector.

    N.B., despite being popularly called the "l0-norm", this isn't
    actually a norm in the mathematical sense."""
    return self._meta_weight.l0

  @property
  def l1(self): # pragma: no cover
    """The l1 (aka: Manhattan) norm of the weight covector."""
    return self._meta_weight.l1

  @property
  def quadrance(self):
    """The square of the l2 norm of the weight covector.

    This value is often more helpful to have direct access to, as it
    avoids the need for non-rational functions (e.g., sqrt) and shows up
    as its own quantity in many places. Also, computing it directly avoids
    the error introduced by squaring the square-root of an IEEE-754 float.
    """
    return self._meta_weight.quadrance

  @property
  def l2(self):
    """The l2 (aka Euclidean) norm of the weight covector.

    If you need the square of the l2-norm, do not use this property.
    Instead, use the ``quadrance`` property which is more accurate than
    squaring this one.
    """
    return math.sqrt(self.quadrance)

  # TODO(crbug.com/673964): something better for detecting "close to log(0)".
  def LogZeroish(self, x):
    """Determine whether a float is close enough to log(0).

    If a ``FeatureValue`` has a (log-domain) score of -inf for a given
    ``Suspect``, then that suspect has zero probability of being the
    culprit. We want to filter these suspects out, to clean up the
    output of classification; so this method encapsulates the logic of
    that check.

    Args:
      x (float): the float to check

    Returns:
      ``True`` if ``x`` is close enough to log(0); else ``False``.
    """
    return x < 0 and math.isinf(x)

  def Features(self, x):
    """Returns a function mapping ``y`` to its feature vector given ``x``.

    Args:
      x (X): the value of the independent variable.

    Returns:
      A ``MemoizedFunction`` of type ``Y -> dict(str to float)``.
    """
    return self._features(x)

  def Score(self, x):
    """Returns a function mapping ``y`` to its "score" given ``x``.

    Semantically, the "score" of ``y`` given ``x`` is the
    unnormalized log-domain conditional probability of ``y`` given
    ``x``. Operationally, we compute this by taking the inner product
    of the weight covector with the ``Features(x)(y)`` vector. The
    conditional probability itself can be computed by exponentiating
    the "score" and normalizing with the partition function. However,
    for determining the "best" ``y`` we don't need to bother computing
    the probability, because ``Score(x)`` is monotonic with respect to
    ``Probability(x)``.

    Args:
      x (X): the value of the independent variable.

    Returns:
      A ``MemoizedFunction`` of type ``Y -> float``.
    """
    return self._scores(x)

  def FilterReasonWithWeight(self, reason):
    """Filters reasons with zero weights.

    Args:
      reason (dict): Dict mapping feature name to reason list.

    Returns:
      A list of reason strings.
    """
    flat_weight = self._meta_weight.leaves

    filtered_reasons = {}
    for non_zero_feature in flat_weight:
      if non_zero_feature in reason:
        filtered_reasons[non_zero_feature] = reason[non_zero_feature]

    sorted_reasons = sorted(
        filtered_reasons.items(),
        key=lambda item: -_FEATURE_TO_REASON_PRIORITY.get(
            item[0], -float('inf')))

    reasons = []
    for _, reason in sorted_reasons:
      reasons.extend(reason)

    return reasons
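
To make the ``Score`` semantics concrete: the score of ``y`` given ``x`` is
the inner product of the weight covector with the feature vector, which is
what ``self._meta_weight * fxy`` computes in the constructor above. A small
numeric sketch with plain dicts standing in for the ``MetaWeight`` and
``MetaFeatureValue`` types (the feature names are hypothetical):

def _Dot(weights, feature_values):
  """Inner product over the weighted features; a stand-in for the
  ``MetaWeight * MetaFeatureValue`` product."""
  return sum(w * feature_values[name] for name, w in weights.items())

weights = {'touched_file': 2.0, 'min_distance': -0.5}   # hypothetical weights
fxy = {'touched_file': 1.0, 'min_distance': 3.0}        # features of one ``y``

assert _Dot(weights, fxy) == 2.0 * 1.0 + (-0.5) * 3.0   # score == 0.5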
Example #10
class LogLinearModel(UnnormalizedLogLinearModel):
  """A loglinear probability model.

  This class is distinct from ``UnnormalizedLogLinearModel`` in that
  we can provide probabilities (not just scores). However, to do so we
  require a specification of the subsets of ``Y`` for each ``x``.
  """
  def __init__(self, Y_given_X, meta_feature, meta_weight, epsilon=None):
    """Construct a new probabilistic model.

    Args:
      Y_given_X: a function from ``X`` to an iterable object giving the
        subset of ``Y`` which has non-zero probability given that
        ``x``. When in doubt about whether some ``y`` has zero probability
        or not, it is always safe/correct to return a larger subset of
        ``Y`` (it will just take more computation time). This is
        needed for computing the partition function and expectation. N.B.,
        we do not actually need to know/enumerate *all* of ``Y``, only
        the subsets for each ``x``.
      meta_feature (MetaFeature): A function ``X -> Y -> MetaFeatureValue``
        giving all the features. N.B., for all ``x`` and ``y`` the length
        of ``meta_feature(x)(y)`` must be the same as the length of
        ``meta_weight``.
      meta_weight (MetaWeight): The weights for the features. The keys of the
        dictionary are the names of the features the weights are for. We take
        this argument as a dict rather than as a list so that callers needn't
        worry about what order to provide the weights in.
      epsilon (float): The absolute-error threshold for considering a
        weight to be "equal to zero". N.B., this should be a positive
        number, as we will compare it against the absolute value of
        each weight.
    """
    super(LogLinearModel, self).__init__(meta_feature, meta_weight, epsilon)

    self._Y = Y_given_X

    def _LogZ(x):
      score_given_x = self._scores(x)
      return logsumexp([score_given_x(y) for y in self._Y(x)])
    self._zetas = MemoizedFunction(_LogZ)

  def ClearWeightBasedMemos(self):
    """Clear all the memos that depend on the weight covector."""
    super(LogLinearModel, self).ClearWeightBasedMemos()
    self._zetas.ClearMemos()

  def Z(self, x):
    """The normal-domain conditional partition-function given ``x``.

    If you need the log of the partition function, do not use this
    method. Instead, use the ``LogZ`` method which computes it directly
    and thus is more accurate than taking the log of this one.

    Args:
      x (X): the value of the independent variable.

    Returns:
      The normalizing constant of ``Probability(x)``.
    """
    return math.exp(self.LogZ(x))

  def LogZ(self, x):
    """The log-domain conditional partition-function given ``x``.

    Args:
      x (X): the value of the independent variable.

    Returns:
      The normalizing constant of ``LogProbability(x)``.
    """
    return self._zetas(x)

  def Probability(self, x):
    """The normal-domain distribution over ``y`` given ``x``.

    That is, ``self.Probability(x)(y)`` returns ``p(y | x; self._meta_weight)``
    which is the model's estimation of ``Pr(y|x)``.

    If you need the log-probability, don't use this method. Instead,
    use the ``LogProbability`` method which computes it directly and
    thus is more accurate than taking the log of this one.
    """
    logprob_given_x = self.LogProbability(x)
    return lambda y: math.exp(logprob_given_x(y))

  def LogProbability(self, x):
    """The log-domain distribution over ``y`` given ``x``.

    That is, ``self.LogProbability(x)(y)`` returns the log of
    ``self.Probability(x)(y)``. However, this method computes the
    log-probabilities directly and so is more accurate and efficient
    than taking the log of ``Probability``.
    """
    zeta_x = self.LogZ(x)
    score_given_x = self.Score(x)
    return lambda y: score_given_x(y) - zeta_x

  def Expectation(self, x, f):
    """Compute the expectation of a function with respect to ``Probability(x)``.

    Args:
      x (X): the value of the independent variable.
      f: a function of type ``Y -> np.ndarray(float)``.

    Returns:
      An ``np.ndarray`` of ``float`` called the "expected value". N.B.,
      the particular array returned may not actually be one that the
      function returns; rather, it's a sort of average of all the results
      returned. For more information you can take a look at Wikipedia
      <https://en.wikipedia.org/wiki/Expected_value>.
    """
    prob_given_x = self.Probability(x)
    # N.B., the ``*`` below is vector scaling! If we want to make this
    # method polymorphic in the return type of ``f`` then we'll need an
    # API that provides both scaling and ``vsum``.
    return vsum([prob_given_x(y) * f(y) for y in self._Y(x)])
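
Putting the pieces together: given the scores of every ``y`` in
``Y_given_X(x)``, ``LogZ`` is their ``logsumexp``, ``LogProbability``
subtracts it from each score, and ``Probability`` exponentiates. A
self-contained numeric sketch (the scores are made up) showing that the
result is a proper distribution, that normalizing never changes the argmax,
and how ``Expectation`` averages a function under that distribution:

import math

def _LogSumExp(xs):
  m = max(xs)
  return m + math.log(sum(math.exp(x - m) for x in xs))

scores = {'y1': 1.2, 'y2': -0.3, 'y3': 0.0}             # hypothetical Score(x)(y)

log_z = _LogSumExp(list(scores.values()))               # zeta = LogZ(x)
log_prob = {y: s - log_z for y, s in scores.items()}    # LogProbability(x)(y)
prob = {y: math.exp(lp) for y, lp in log_prob.items()}  # Probability(x)(y)

assert abs(sum(prob.values()) - 1.0) < 1e-12            # a proper distribution
assert max(prob, key=prob.get) == max(scores, key=scores.get)  # same argmax

f = {'y1': 1.0, 'y2': 2.0, 'y3': 3.0}                   # hypothetical scalar f(y)
expectation = sum(prob[y] * f[y] for y in scores)       # Expectation(x, f)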