Exemplo n.º 1
0
class TDigest(object):

    def __init__(self, delta=0.01, K=25):
        self.C = RBTree()
        self.n = 0
        self.delta = delta
        self.K = K

    def __add__(self, other_digest):
        C1 = list(self.C.values())
        C2 = list(other_digest.C.values())
        shuffle(C1)
        shuffle(C2)
        data = C1 + C2
        new_digest = TDigest(self.delta, self.K)
        for c in data:
            new_digest.update(c.mean, c.count)

        return new_digest

    def __len__(self):
        return len(self.C)

    def __repr__(self):
        return """<T-Digest: n=%d, centroids=%d>""" % (self.n, len(self))

    def _add_centroid(self, centroid):
        if centroid.mean not in self.C:
            self.C.insert(centroid.mean, centroid)
        else:
            self.C[centroid.mean].update(centroid.mean, centroid.count)

    def _compute_centroid_quantile(self, centroid):
        denom = self.n
        cumulative_sum = sum(
            c_i.count for c_i in self.C.value_slice(-float('Inf'), centroid.mean))
        return (centroid.count / 2. + cumulative_sum) / denom

    def _update_centroid(self, centroid, x, w):
        self.C.pop(centroid.mean)
        centroid.update(x, w)
        self._add_centroid(centroid)

    def _find_closest_centroids(self, x):
        try:
            ceil_key = self.C.ceiling_key(x)
        except KeyError:
            floor_key = self.C.floor_key(x)
            return [self.C[floor_key]]

        try:
            floor_key = self.C.floor_key(x)
        except KeyError:
            ceil_key = self.C.ceiling_key(x)
            return [self.C[ceil_key]]

        if abs(floor_key - x) < abs(ceil_key - x):
            return [self.C[floor_key]]
        elif abs(floor_key - x) == abs(ceil_key - x) and (ceil_key != floor_key):
            return [self.C[ceil_key], self.C[floor_key]]
        else:
            return [self.C[ceil_key]]

    def _theshold(self, q):
        return 4 * self.n * self.delta * q * (1 - q)

    def update(self, x, w=1):
        """
        Update the t-digest with value x and weight w.

        """
        self.n += w

        if len(self) == 0:
            self._add_centroid(Centroid(x, w))
            return

        S = self._find_closest_centroids(x)

        while len(S) != 0 and w > 0:
            j = choice(list(range(len(S))))
            c_j = S[j]

            q = self._compute_centroid_quantile(c_j)

            # This filters the out centroids that do not satisfy the second part
            # of the definition of S. See original paper by Dunning. 
            if c_j.count + w > self._theshold(q):
                S.pop(j)
                continue

            delta_w = min(self._theshold(q) - c_j.count, w)
            self._update_centroid(c_j, x, delta_w)
            w -= delta_w
            S.pop(j)

        if w > 0:
            self._add_centroid(Centroid(x, w))

        if len(self) > self.K / self.delta:
            self.compress()

        return

    def batch_update(self, values, w=1):
        """
        Update the t-digest with an iterable of values. This assumes all points have the 
        same weight.
        """
        for x in values:
            self.update(x, w)
        self.compress()
        return

    def compress(self):
        T = TDigest(self.delta, self.K)
        C = list(self.C.values())
        shuffle(C)
        for c_i in C:
            T.update(c_i.mean, c_i.count)
        self.C = T.C

    def percentile(self, q):
        """ 
        Computes the percentile of a specific value in [0,1], ie. computes F^{-1}(q) where F^{-1} denotes
        the inverse CDF of the distribution. 

        """
        if not (0 <= q <= 1):
            raise ValueError("q must be between 0 and 1, inclusive.")

        t = 0
        q *= self.n

        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            k = c_i.count
            if q < t + k:
                if i == 0:
                    return c_i.mean
                elif i == len(self) - 1:
                    return c_i.mean
                else:
                    delta = (self.C.succ_item(key)[1].mean - self.C.prev_item(key)[1].mean) / 2.
                return c_i.mean + ((q - t) / k - 0.5) * delta

            t += k
        return self.C.max_item()[1].mean

    def quantile(self, q):
        """ 
        Computes the quantile of a specific value, ie. computes F(q) where F denotes
        the CDF of the distribution. 

        """
        t = 0
        N = float(self.n)

        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            if i == len(self) - 1:
                delta = (c_i.mean - self.C.prev_item(key)[1].mean) / 2.
            else:
                delta = (self.C.succ_item(key)[1].mean - c_i.mean) / 2.
            z = max(-1, (q - c_i.mean) / delta)

            if z < 1:
                return t / N + c_i.count / N * (z + 1) / 2

            t += c_i.count
        return 1

    def trimmed_mean(self, q1, q2):
        """
        Computes the mean of the distribution between the two percentiles q1 and q2.
        This is a modified algorithm than the one presented in the original t-Digest paper. 

        """
        if not (q1 < q2):
            raise ValueError("q must be between 0 and 1, inclusive.")

        s = k = t = 0
        q1 *= self.n
        q2 *= self.n
        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            k_i = c_i.count
            if q1 < t + k_i:
                if i == 0:
                    delta = self.C.succ_item(key)[1].mean - c_i.mean
                elif i == len(self) - 1:
                    delta = c_i.mean - self.C.prev_item(key)[1].mean
                else:
                    delta = (self.C.succ_item(key)[1].mean - self.C.prev_item(key)[1].mean) / 2.
                nu = ((q1 - t) / k_i - 0.5) * delta
                s += nu * k_i * c_i.mean
                k += nu * k_i

            if q2 < t + k_i:
                return s/k
            t += k_i

        return s/k
Exemplo n.º 2
0
class SparseArray(object):
    def __init__(self):
        self.tree = FastRBTree()

    def __len__(self):
        try:
            k, v = self.tree.max_item()
        except KeyError:
            return 0
        return k + len(v)

    def __getitem__(self, ndx):
        try:
            base, chunk = self.tree.floor_item(ndx)
        except KeyError:
            return None
        offset = ndx - base
        if offset < len(chunk):
            return chunk[offset]
        else:
            return None

    def __setitem__(self, ndx, item):
        try:
            base, chunk = self.tree.floor_item(ndx)
        except KeyError:
            try:
                base, chunk = self.tree.ceiling_item(ndx)
            except KeyError:
                self.tree[ndx] = [item]
                return
            if ndx + 1 == base:
                chunk.insert(0, item)
                del self.tree[base]
                self.tree[ndx] = chunk
                return

        if base > ndx:
            self.tree[ndx] = [item]
            return

        offset = ndx - base
        if offset < len(chunk):
            chunk[offset] = item
        else:
            nextbase, nextchunk = (None, None)
            try:
                nextbase, nextchunk = self.tree.succ_item(base)
            except KeyError:
                pass

            if offset == len(chunk):
                chunk.append(item)
                if offset + 1 == nextbase:
                    chunk += nextchunk
                    del self.tree[nextbase]
            elif offset + 1 == nextbase:
                nextchunk.insert(0, item)
                del self.tree[nextbase]
                self.tree[ndx] = nextchunk
            else:
                self.tree[ndx] = [item]

    def __delitem__(self, ndx):
        base, chunk = self.tree.floor_item(ndx)
        offset = ndx - base
        if offset < len(chunk):
            before = chunk[:offset]
            after = chunk[offset + 1:]
            if len(before):
                self.tree[base] = before
            else:
                del self.tree[base]
            if len(after):
                self.tree[ndx + 1] = after

    def items(self):
        for k, vs in self.tree.items():
            for n, v in enumerate(vs):
                yield (k + n, v)

    def runs(self):
        return self.tree.items()

    def run_count(self):
        return len(self.tree)

    def __repr__(self):
        arep = []
        for k, v in self.tree.items():
            arep.append('[%r]=%s' % (k, ', '.join([repr(item) for item in v])))
        return 'SparseArray(%s)' % ', '.join(arep)
Exemplo n.º 3
0
class ExclusiveRangeDict(object):
    """A class like dict whose key is a range [begin, end) of integers.

  It has an attribute for each range of integers, for example:
  [10, 20) => Attribute(0),
  [20, 40) => Attribute(1),
  [40, 50) => Attribute(2),
  ...

  An instance of this class is accessed only via iter_range(begin, end).
  The instance is accessed as follows:

  1) If the given range [begin, end) is not covered by the instance,
  the range is newly created and iterated.

  2) If the given range [begin, end) exactly covers ranges in the instance,
  the ranges are iterated.
  (See test_set() in tests/range_dict_tests.py.)

  3) If the given range [begin, end) starts at and/or ends at a mid-point of
  an existing range, the existing range is split by the given range, and
  ranges in the given range are iterated.  For example, consider a case that
  [25, 45) is given to an instance of [20, 30), [30, 40), [40, 50).  In this
  case, [20, 30) is split into [20, 25) and [25, 30), and [40, 50) into
  [40, 45) and [45, 50).  Then, [25, 30), [30, 40), [40, 45) are iterated.
  (See test_split() in tests/range_dict_tests.py.)

  4) If the given range [begin, end) includes non-existing ranges in an
  instance, the gaps are filled with new ranges, and all ranges are iterated.
  For example, consider a case that [25, 50) is given to an instance of
  [30, 35) and [40, 45).  In this case, [25, 30), [35, 40) and [45, 50) are
  created in the instance, and then [25, 30), [30, 35), [35, 40), [40, 45)
  and [45, 50) are iterated.
  (See test_fill() in tests/range_dict_tests.py.)
  """
    class RangeAttribute(object):
        def __init__(self):
            pass

        def __str__(self):
            return '<RangeAttribute>'

        def __repr__(self):
            return '<RangeAttribute>'

        def copy(self):  # pylint: disable=R0201
            return ExclusiveRangeDict.RangeAttribute()

    def __init__(self, attr=RangeAttribute):
        self._tree = FastRBTree()
        self._attr = attr

    def iter_range(self, begin=None, end=None):
        if not begin:
            begin = self._tree.min_key()
        if not end:
            end = self._tree.max_item()[1][0]

        # Assume that self._tree has at least one element.
        if self._tree.is_empty():
            self._tree[begin] = (end, self._attr())

        # Create a beginning range (border)
        try:
            bound_begin, bound_value = self._tree.floor_item(begin)
            bound_end = bound_value[0]
            if begin >= bound_end:
                # Create a blank range.
                try:
                    new_end, _ = self._tree.succ_item(bound_begin)
                except KeyError:
                    new_end = end
                self._tree[begin] = (min(end, new_end), self._attr())
            elif bound_begin < begin and begin < bound_end:
                # Split the existing range.
                new_end = bound_value[0]
                new_value = bound_value[1]
                self._tree[bound_begin] = (begin, new_value.copy())
                self._tree[begin] = (new_end, new_value.copy())
            else:  # bound_begin == begin
                # Do nothing (just saying it clearly since this part is confusing)
                pass
        except KeyError:  # begin is less than the smallest element.
            # Create a blank range.
            # Note that we can assume self._tree has at least one element.
            self._tree[begin] = (min(end, self._tree.min_key()), self._attr())

        # Create an ending range (border)
        try:
            bound_begin, bound_value = self._tree.floor_item(end)
            bound_end = bound_value[0]
            if end > bound_end:
                # Create a blank range.
                new_begin = bound_end
                self._tree[new_begin] = (end, self._attr())
            elif bound_begin < end and end < bound_end:
                # Split the existing range.
                new_end = bound_value[0]
                new_value = bound_value[1]
                self._tree[bound_begin] = (end, new_value.copy())
                self._tree[end] = (new_end, new_value.copy())
            else:  # bound_begin == begin
                # Do nothing (just saying it clearly since this part is confusing)
                pass
        except KeyError:  # end is less than the smallest element.
            # It must not happen.  A blank range [begin,end) has already been created
            # even if [begin,end) is less than the smallest range.
            # Do nothing (just saying it clearly since this part is confusing)
            raise

        missing_ranges = []

        prev_end = None
        for range_begin, range_value in self._tree.itemslice(begin, end):
            range_end = range_value[0]
            # Note that we can assume that we have a range beginning with |begin|
            # and a range ending with |end| (they may be the same range).
            if prev_end and prev_end != range_begin:
                missing_ranges.append((prev_end, range_begin))
            prev_end = range_end

        for missing_begin, missing_end in missing_ranges:
            self._tree[missing_begin] = (missing_end, self._attr())

        for range_begin, range_value in self._tree.itemslice(begin, end):
            yield range_begin, range_value[0], range_value[1]

    def __str__(self):
        return str(self._tree)
Exemplo n.º 4
0
class ExclusiveRangeDict(object):
  """A class like dict whose key is a range [begin, end) of integers.

  It has an attribute for each range of integers, for example:
  [10, 20) => Attribute(0),
  [20, 40) => Attribute(1),
  [40, 50) => Attribute(2),
  ...

  An instance of this class is accessed only via iter_range(begin, end).
  The instance is accessed as follows:

  1) If the given range [begin, end) is not covered by the instance,
  the range is newly created and iterated.

  2) If the given range [begin, end) exactly covers ranges in the instance,
  the ranges are iterated.
  (See test_set() in tests/range_dict_tests.py.)

  3) If the given range [begin, end) starts at and/or ends at a mid-point of
  an existing range, the existing range is split by the given range, and
  ranges in the given range are iterated.  For example, consider a case that
  [25, 45) is given to an instance of [20, 30), [30, 40), [40, 50).  In this
  case, [20, 30) is split into [20, 25) and [25, 30), and [40, 50) into
  [40, 45) and [45, 50).  Then, [25, 30), [30, 40), [40, 45) are iterated.
  (See test_split() in tests/range_dict_tests.py.)

  4) If the given range [begin, end) includes non-existing ranges in an
  instance, the gaps are filled with new ranges, and all ranges are iterated.
  For example, consider a case that [25, 50) is given to an instance of
  [30, 35) and [40, 45).  In this case, [25, 30), [35, 40) and [45, 50) are
  created in the instance, and then [25, 30), [30, 35), [35, 40), [40, 45)
  and [45, 50) are iterated.
  (See test_fill() in tests/range_dict_tests.py.)
  """
  class RangeAttribute(object):
    def __init__(self):
      pass

    def __str__(self):
      return '<RangeAttribute>'

    def __repr__(self):
      return '<RangeAttribute>'

    def copy(self):  # pylint: disable=R0201
      return ExclusiveRangeDict.RangeAttribute()

  def __init__(self, attr=RangeAttribute):
    self._tree = FastRBTree()
    self._attr = attr

  def iter_range(self, begin=None, end=None):
    if not begin:
      begin = self._tree.min_key()
    if not end:
      end = self._tree.max_item()[1][0]

    # Assume that self._tree has at least one element.
    if self._tree.is_empty():
      self._tree[begin] = (end, self._attr())

    # Create a beginning range (border)
    try:
      bound_begin, bound_value = self._tree.floor_item(begin)
      bound_end = bound_value[0]
      if begin >= bound_end:
        # Create a blank range.
        try:
          new_end, _ = self._tree.succ_item(bound_begin)
        except KeyError:
          new_end = end
        self._tree[begin] = (min(end, new_end), self._attr())
      elif bound_begin < begin and begin < bound_end:
        # Split the existing range.
        new_end = bound_value[0]
        new_value = bound_value[1]
        self._tree[bound_begin] = (begin, new_value.copy())
        self._tree[begin] = (new_end, new_value.copy())
      else:  # bound_begin == begin
        # Do nothing (just saying it clearly since this part is confusing)
        pass
    except KeyError:  # begin is less than the smallest element.
      # Create a blank range.
      # Note that we can assume self._tree has at least one element.
      self._tree[begin] = (min(end, self._tree.min_key()), self._attr())

    # Create an ending range (border)
    try:
      bound_begin, bound_value = self._tree.floor_item(end)
      bound_end = bound_value[0]
      if end > bound_end:
        # Create a blank range.
        new_begin = bound_end
        self._tree[new_begin] = (end, self._attr())
      elif bound_begin < end and end < bound_end:
        # Split the existing range.
        new_end = bound_value[0]
        new_value = bound_value[1]
        self._tree[bound_begin] = (end, new_value.copy())
        self._tree[end] = (new_end, new_value.copy())
      else:  # bound_begin == begin
        # Do nothing (just saying it clearly since this part is confusing)
        pass
    except KeyError:  # end is less than the smallest element.
      # It must not happen.  A blank range [begin,end) has already been created
      # even if [begin,end) is less than the smallest range.
      # Do nothing (just saying it clearly since this part is confusing)
      raise

    missing_ranges = []

    prev_end = None
    for range_begin, range_value in self._tree.itemslice(begin, end):
      range_end = range_value[0]
      # Note that we can assume that we have a range beginning with |begin|
      # and a range ending with |end| (they may be the same range).
      if prev_end and prev_end != range_begin:
        missing_ranges.append((prev_end, range_begin))
      prev_end = range_end

    for missing_begin, missing_end in missing_ranges:
      self._tree[missing_begin] = (missing_end, self._attr())

    for range_begin, range_value in self._tree.itemslice(begin, end):
      yield range_begin, range_value[0], range_value[1]

  def __str__(self):
    return str(self._tree)
Exemplo n.º 5
0
class TDigest(object):
    def __init__(self, delta=0.01, K=25):
        self.C = RBTree()
        self.n = 0
        self.delta = delta
        self.K = K

    def __add__(self, other_digest):
        data = list(chain(self.C.values(), other_digest.C.values()))
        new_digest = TDigest(self.delta, self.K)

        if len(data) > 0:
            for c in pyudorandom.items(data):
                new_digest.update(c.mean, c.count)

        return new_digest

    def __len__(self):
        return len(self.C)

    def __repr__(self):
        return """<T-Digest: n=%d, centroids=%d>""" % (self.n, len(self))

    def _add_centroid(self, centroid):
        if centroid.mean not in self.C:
            self.C.insert(centroid.mean, centroid)
        else:
            self.C[centroid.mean].update(centroid.mean, centroid.count)

    def _compute_centroid_quantile(self, centroid):
        denom = self.n
        cumulative_sum = sum(
            c_i.count
            for c_i in self.C.value_slice(-float('Inf'), centroid.mean))
        return (centroid.count / 2. + cumulative_sum) / denom

    def _update_centroid(self, centroid, x, w):
        self.C.pop(centroid.mean)
        centroid.update(x, w)
        self._add_centroid(centroid)

    def _find_closest_centroids(self, x):
        try:
            ceil_key = self.C.ceiling_key(x)
        except KeyError:
            floor_key = self.C.floor_key(x)
            return [self.C[floor_key]]

        try:
            floor_key = self.C.floor_key(x)
        except KeyError:
            ceil_key = self.C.ceiling_key(x)
            return [self.C[ceil_key]]

        if abs(floor_key - x) < abs(ceil_key - x):
            return [self.C[floor_key]]
        elif abs(floor_key - x) == abs(ceil_key -
                                       x) and (ceil_key != floor_key):
            return [self.C[ceil_key], self.C[floor_key]]
        else:
            return [self.C[ceil_key]]

    def _theshold(self, q):
        return 4 * self.n * self.delta * q * (1 - q)

    def update(self, x, w=1):
        """
        Update the t-digest with value x and weight w.

        """
        self.n += w

        if len(self) == 0:
            self._add_centroid(Centroid(x, w))
            return

        S = self._find_closest_centroids(x)

        while len(S) != 0 and w > 0:
            j = choice(list(range(len(S))))
            c_j = S[j]

            q = self._compute_centroid_quantile(c_j)

            # This filters the out centroids that do not satisfy the second part
            # of the definition of S. See original paper by Dunning.
            if c_j.count + w > self._theshold(q):
                S.pop(j)
                continue

            delta_w = min(self._theshold(q) - c_j.count, w)
            self._update_centroid(c_j, x, delta_w)
            w -= delta_w
            S.pop(j)

        if w > 0:
            self._add_centroid(Centroid(x, w))

        if len(self) > self.K / self.delta:
            self.compress()

        return

    def batch_update(self, values, w=1):
        """
        Update the t-digest with an iterable of values. This assumes all points have the 
        same weight.
        """
        for x in values:
            self.update(x, w)
        self.compress()
        return

    def compress(self):
        T = TDigest(self.delta, self.K)
        C = list(self.C.values())
        for c_i in pyudorandom.items(C):
            T.update(c_i.mean, c_i.count)
        self.C = T.C

    def percentile(self, p):
        """ 
        Computes the percentile of a specific value in [0,100].

        """
        if not (0 <= p <= 100):
            raise ValueError("p must be between 0 and 100, inclusive.")

        t = 0
        p = float(p) / 100.
        p *= self.n

        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            k = c_i.count
            if p < t + k:
                if i == 0:
                    return c_i.mean
                elif i == len(self) - 1:
                    return c_i.mean
                else:
                    delta = (self.C.succ_item(key)[1].mean -
                             self.C.prev_item(key)[1].mean) / 2.
                return c_i.mean + ((p - t) / k - 0.5) * delta

            t += k
        return self.C.max_item()[1].mean

    def quantile(self, q):
        """ 
        Computes the quantile of a specific value, ie. computes F(q) where F denotes
        the CDF of the distribution. 

        """
        t = 0
        N = float(self.n)

        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            if i == len(self) - 1:
                delta = (c_i.mean - self.C.prev_item(key)[1].mean) / 2.
            else:
                delta = (self.C.succ_item(key)[1].mean - c_i.mean) / 2.
            z = max(-1, (q - c_i.mean) / delta)

            if z < 1:
                return t / N + c_i.count / N * (z + 1) / 2

            t += c_i.count
        return 1

    def trimmed_mean(self, p1, p2):
        """
        Computes the mean of the distribution between the two percentiles p1 and p2.
        This is a modified algorithm than the one presented in the original t-Digest paper. 

        """
        if not (p1 < p2):
            raise ValueError("p1 must be between 0 and 100 and less than p2.")

        s = k = t = 0
        p1 /= 100.
        p2 /= 100.
        p1 *= self.n
        p2 *= self.n
        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            k_i = c_i.count
            if p1 < t + k_i:
                if i == 0:
                    delta = self.C.succ_item(key)[1].mean - c_i.mean
                elif i == len(self) - 1:
                    delta = c_i.mean - self.C.prev_item(key)[1].mean
                else:
                    delta = (self.C.succ_item(key)[1].mean -
                             self.C.prev_item(key)[1].mean) / 2.
                nu = ((p1 - t) / k_i - 0.5) * delta
                s += nu * k_i * c_i.mean
                k += nu * k_i

            if p2 < t + k_i:
                return s / k
            t += k_i

        return s / k