Пример #1
0
class Tdigest(object):
    """Weighted-centroid summary of a stream of numbers for quantile queries.

    Centroids live in ``self.centroids`` (an ``RBTree``) keyed by the value
    they were created with.  Each centroid exposes ``mean``, ``n`` (its
    weight) and ``cumn`` (a cached cumulative weight).

    Args:
        delta: Scales the maximum weight an interior centroid may absorb
            (see ``_digest``) and, with ``K``, the size limit that triggers
            ``compress``.
        K: ``compress`` runs when the centroid count exceeds ``K / delta``;
            a falsy value disables automatic compression.
        CX: A non-exact ``_cumulate`` is skipped while the total weight has
            grown by less than this ratio since the last refresh; a falsy
            value disables the shortcut.
    """

    def __init__(self, delta=0.01, K=25, CX=1.1):
        self.delta = delta
        self.K = K
        self.CX = CX
        self.centroids = RBTree()
        self.nreset = 0
        self.reset()

    def reset(self):
        """Drop all centroids and counters; bumps ``nreset``."""
        self.centroids.clear()
        self.n = 0
        self.nreset += 1
        self.last_cumulate = 0
        self.compressing = False

    def push(self, x, n=1):
        """Add ``x`` (a single value or a list of values), each with weight ``n``."""
        if not isinstance(x, list):
            x = [x]
        for item in x:
            self._digest(item, n)

    def percentile(self, p):
        """Estimate the value at fraction ``p`` of the total weight.

        Args:
            p: Fraction of the total weight (the target is ``self.n * p``);
                presumably expected to lie in [0, 1].

        Returns:
            ``None`` when the digest is empty.  Otherwise the mean of the
            bracketing centroid, or a linear interpolation between the last
            centroid whose ``cumn`` is at most the target and the first
            centroid whose ``cumn`` exceeds it.
        """
        if self.size() == 0:
            return None
        self._cumulate(True)
        cumn = self.n * p
        lower = self.centroids.min_item()[1]
        upper = self.centroids.max_item()[1]
        for c in self.centroids.values():
            if c.cumn <= cumn:
                lower = c
            else:
                upper = c
                break
        if lower == upper:
            return lower.mean
        return lower.mean + (cumn - lower.cumn) * (upper.mean - lower.mean) / \
            (upper.cumn - lower.cumn)

    def serialize(self):
        """Return the digest as a ``~``-separated string.

        Layout: ``delta~K~size~`` followed by every centroid mean and then
        every centroid weight.  ``CX`` is not part of the output.
        """
        result = '%s~%s~%s~' % (self.delta, self.K, self.size())
        if self.size() == 0:
            return result
        self._cumulate(True)
        means = []
        counts = []
        for c in self.centroids.values():
            means.append(str(c.mean))
            counts.append(str(c.n))
        return '%s%s~%s' % (result, '~'.join(means), '~'.join(counts))

    @classmethod
    def deserialize(cls, serialized_str):
        """Rebuild a digest from the output of ``serialize``.

        Centroids are re-inserted through ``push``.  ``CX`` is not stored in
        the string, so the new instance uses the constructor default.

        Raises:
            TypeError: if ``serialized_str`` is not a string.
        """
        try:
            string_types = basestring  # Python 2: covers str and unicode
        except NameError:
            string_types = str  # Python 3
        if not isinstance(serialized_str, string_types):
            raise TypeError(u'serialized_str must be str')
        data = serialized_str.split('~')
        # Build via ``cls`` so subclasses deserialize to their own type.
        t = cls(delta=float(data[0]), K=int(data[1]))
        size = int(data[2])
        for i in range(size):
            t.push(float(data[i + 3]), int(data[size + i + 3]))
        t._cumulate(True)
        return t

    def _digest(self, x, n):
        """Fold one value of weight ``n`` into the centroid tree."""
        if self.size() == 0:
            self._new_centroid(x, n, 0)
        else:
            _min = self.centroids.min_item()[1]
            _max = self.centroids.max_item()[1]
            nearest = self.find_nearest(x)
            if nearest and nearest.mean == x:
                # Exact hit: only the weight grows.
                self._addweight(nearest, x, n)
            elif nearest == _min:
                # The two extreme centroids never absorb a different value.
                self._new_centroid(x, n, 0)
            elif nearest == _max:
                self._new_centroid(x, n, self.n)
            else:
                # Approximate quantile of the nearest centroid; the weight
                # cap is proportional to p * (1 - p), so it shrinks to zero
                # towards either end.
                p = (nearest.cumn + nearest.n / 2.0) / self.n
                max_n = int(4 * self.n * self.delta * p * (1 - p))
                if max_n >= nearest.n + n:
                    self._addweight(nearest, x, n)
                else:
                    self._new_centroid(x, n, nearest.cumn)
        self._cumulate(False)
        if self.K and self.size() > self.K / self.delta:
            self.compress()

    def find_nearest(self, x):
        """Return the centroid closest to ``x``, or ``None`` when empty.

        Candidates are the tree entries with the smallest key >= ``x`` and
        the largest key <= ``x``; on a distance tie the former wins.
        """
        if self.size() == 0:
            return None
        try:
            above = self.centroids.ceiling_item(x)[1]
        except KeyError:
            above = None

        if above and above.mean == x:
            return above

        try:
            below = self.centroids.floor_item(x)[1]
        except KeyError:
            below = None

        if not above:
            return below
        if not below:
            return above
        if abs(below.mean - x) < abs(above.mean - x):
            return below
        return above

    def size(self):
        """Return the number of centroids."""
        return len(self.centroids)

    def compress(self):
        """Rebuild the digest by re-inserting its centroids in random order.

        A no-op while a compression is already running (re-entered through
        ``push`` -> ``_digest``).
        """
        if self.compressing:
            return
        points = self.toList()
        self.reset()
        self.compressing = True
        try:
            for point in sorted(points, key=lambda x: random()):
                self.push(point['mean'], point['n'])
            self._cumulate(True)
        finally:
            # Never leave the guard set, or every later compress() would be
            # silently skipped after a failed rebuild.
            self.compressing = False

    def _cumulate(self, exact):
        """Refresh every centroid's ``cumn`` and the total ``self.n``.

        Args:
            exact: When false, the refresh is skipped while the total weight
                has grown by a factor smaller than ``CX`` since the last
                refresh.
        """
        if self.n == self.last_cumulate:
            return
        # float(): with two ints, Python 2 ``/`` floors the ratio, so the
        # shortcut would stay active until the weight had doubled.
        if not exact and self.CX and self.last_cumulate and \
                self.CX > (float(self.n) / self.last_cumulate):
            return
        cumn = 0
        for c in self.centroids.values():
            cumn = c.cumn = cumn + c.n
        self.n = self.last_cumulate = cumn

    def toList(self):
        """Return the centroids as a list of ``mean``/``n``/``cumn`` dicts."""
        return [dict(mean=c.mean, n=c.n, cumn=c.cumn) for
                c in self.centroids.values()]

    def _addweight(self, nearest, x, n):
        """Merge weight ``n`` at value ``x`` into the centroid ``nearest``."""
        if x != nearest.mean:
            # float(): avoid floor division when all operands are ints
            # under Python 2, which would truncate the updated mean.
            nearest.mean += n * (x - nearest.mean) / float(nearest.n + n)
        nearest.cumn += n
        nearest.n += n
        self.n += n

    def _new_centroid(self, x, n, cumn):
        """Insert and return a new centroid keyed by ``x``."""
        c = Centroid(x, n, cumn)
        self.centroids.insert(x, c)
        self.n += n
        return c
Пример #2
0
class SparseArray(object):
    """Sparse, integer-indexed array stored as runs of consecutive items.

    ``self.tree`` maps the first index of every run to a list holding the
    items at consecutive indices starting there.  Unset indices read as
    ``None``.
    """

    def __init__(self):
        self.tree = FastRBTree()

    def __len__(self):
        """Return one past the highest stored index, or 0 when empty."""
        try:
            k, v = self.tree.max_item()
        except KeyError:
            return 0
        return k + len(v)

    def __getitem__(self, ndx):
        """Return the item stored at ``ndx``, or ``None`` if it is unset."""
        try:
            base, chunk = self.tree.floor_item(ndx)
        except KeyError:
            return None
        offset = ndx - base
        if offset < len(chunk):
            return chunk[offset]
        return None

    def __setitem__(self, ndx, item):
        """Store ``item`` at ``ndx``, extending or joining neighbouring runs."""
        try:
            base, chunk = self.tree.floor_item(ndx)
        except KeyError:
            # No run starts at or before ndx; look at the first run after it.
            try:
                base, chunk = self.tree.ceiling_item(ndx)
            except KeyError:
                self.tree[ndx] = [item]
                return
            if ndx + 1 == base:
                # Directly in front of that run: prepend and re-key it.
                chunk.insert(0, item)
                del self.tree[base]
                self.tree[ndx] = chunk
            else:
                self.tree[ndx] = [item]
            return

        offset = ndx - base
        if offset < len(chunk):
            chunk[offset] = item
            return

        try:
            nextbase, nextchunk = self.tree.succ_item(base)
        except KeyError:
            nextbase, nextchunk = (None, None)
        # Adjacency must be tested with the absolute index.  Comparing the
        # run-relative offset only works for a run that starts at index 0.
        touches_next = nextbase is not None and ndx + 1 == nextbase

        if offset == len(chunk):
            # Directly behind the floor run: append, then absorb the next
            # run if the new item closed the gap.
            chunk.append(item)
            if touches_next:
                chunk.extend(nextchunk)
                del self.tree[nextbase]
        elif touches_next:
            # Directly in front of the next run: prepend and re-key it.
            nextchunk.insert(0, item)
            del self.tree[nextbase]
            self.tree[ndx] = nextchunk
        else:
            self.tree[ndx] = [item]

    def __delitem__(self, ndx):
        """Unset ``ndx``, splitting its run if needed.

        Deleting an index that is not set is a no-op.
        """
        try:
            base, chunk = self.tree.floor_item(ndx)
        except KeyError:
            # Nothing stored at or before ndx: same silent outcome as for
            # any other unset index.
            return
        offset = ndx - base
        if offset < len(chunk):
            before = chunk[:offset]
            after = chunk[offset + 1:]
            if before:
                self.tree[base] = before
            else:
                del self.tree[base]
            if after:
                self.tree[ndx + 1] = after

    def items(self):
        """Yield ``(index, item)`` for every stored item, run by run."""
        for k, vs in self.tree.items():
            for n, v in enumerate(vs):
                yield (k + n, v)

    def runs(self):
        """Return the underlying ``(first_index, items)`` pairs of the tree."""
        return self.tree.items()

    def run_count(self):
        """Return the number of runs."""
        return len(self.tree)

    def __repr__(self):
        arep = []
        for k, v in self.tree.items():
            arep.append('[%r]=%s' % (k, ', '.join([repr(item) for item in v])))
        return 'SparseArray(%s)' % ', '.join(arep)
Пример #3
0
class ExclusiveRangeDict(object):
  """A class like dict whose key is a range [begin, end) of integers.

  It has an attribute for each range of integers, for example:
  [10, 20) => Attribute(0),
  [20, 40) => Attribute(1),
  [40, 50) => Attribute(2),
  ...

  An instance of this class is accessed only via iter_range(begin, end).
  The instance is accessed as follows:

  1) If the given range [begin, end) is not covered by the instance,
  the range is newly created and iterated.

  2) If the given range [begin, end) exactly covers ranges in the instance,
  the ranges are iterated.
  (See test_set() in tests/range_dict_tests.py.)

  3) If the given range [begin, end) starts at and/or ends at a mid-point of
  an existing range, the existing range is split by the given range, and
  ranges in the given range are iterated.  For example, consider a case that
  [25, 45) is given to an instance of [20, 30), [30, 40), [40, 50).  In this
  case, [20, 30) is split into [20, 25) and [25, 30), and [40, 50) into
  [40, 45) and [45, 50).  Then, [25, 30), [30, 40), [40, 45) are iterated.
  (See test_split() in tests/range_dict_tests.py.)

  4) If the given range [begin, end) includes non-existing ranges in an
  instance, the gaps are filled with new ranges, and all ranges are iterated.
  For example, consider a case that [25, 50) is given to an instance of
  [30, 35) and [40, 45).  In this case, [25, 30), [35, 40) and [45, 50) are
  created in the instance, and then [25, 30), [30, 35), [35, 40), [40, 45)
  and [45, 50) are iterated.
  (See test_fill() in tests/range_dict_tests.py.)
  """
  class RangeAttribute(object):
    """Default per-range attribute; carries no data."""

    def __init__(self):
      pass

    def __str__(self):
      return '<RangeAttribute>'

    def __repr__(self):
      return '<RangeAttribute>'

    def copy(self):  # pylint: disable=R0201
      return ExclusiveRangeDict.RangeAttribute()

  def __init__(self, attr=RangeAttribute):
    # The tree maps range begin -> (range end, attribute).  |attr| is called
    # without arguments to make the attribute of each newly created range.
    self._tree = FastRBTree()
    self._attr = attr

  def iter_range(self, begin=None, end=None):
    """Yields (begin, end, attribute) for every range inside [begin, end).

    Existing ranges crossing |begin| or |end| are split and gaps are filled
    with new ranges, as described in the class docstring.  This is a
    generator, so the tree is modified only once iteration starts.

    Args:
      begin: Inclusive lower bound.  None means the smallest key in the tree.
      end: Exclusive upper bound.  None means the end of the range that has
          the largest key.
    """
    # Compare against None explicitly: 0 is a valid bound and must not be
    # replaced by the tree's own limits.
    if begin is None:
      begin = self._tree.min_key()
    if end is None:
      end = self._tree.max_item()[1][0]

    # Assume that self._tree has at least one element.
    if self._tree.is_empty():
      self._tree[begin] = (end, self._attr())

    # Create a beginning range (border)
    try:
      bound_begin, bound_value = self._tree.floor_item(begin)
      bound_end = bound_value[0]
      if begin >= bound_end:
        # Create a blank range.
        try:
          new_end, _ = self._tree.succ_item(bound_begin)
        except KeyError:
          new_end = end
        self._tree[begin] = (min(end, new_end), self._attr())
      elif bound_begin < begin and begin < bound_end:
        # Split the existing range.
        new_end = bound_value[0]
        new_value = bound_value[1]
        self._tree[bound_begin] = (begin, new_value.copy())
        self._tree[begin] = (new_end, new_value.copy())
      else:  # bound_begin == begin
        # Do nothing (just saying it clearly since this part is confusing)
        pass
    except KeyError:  # begin is less than the smallest element.
      # Create a blank range.
      # Note that we can assume self._tree has at least one element.
      self._tree[begin] = (min(end, self._tree.min_key()), self._attr())

    # Create an ending range (border)
    try:
      bound_begin, bound_value = self._tree.floor_item(end)
      bound_end = bound_value[0]
      if end > bound_end:
        # Create a blank range.
        new_begin = bound_end
        self._tree[new_begin] = (end, self._attr())
      elif bound_begin < end and end < bound_end:
        # Split the existing range.
        new_end = bound_value[0]
        new_value = bound_value[1]
        self._tree[bound_begin] = (end, new_value.copy())
        self._tree[end] = (new_end, new_value.copy())
      else:  # bound_begin == end or bound_end == end
        # Do nothing (just saying it clearly since this part is confusing)
        pass
    except KeyError:  # end is less than the smallest element.
      # It must not happen.  A blank range [begin,end) has already been created
      # even if [begin,end) is less than the smallest range.
      raise

    missing_ranges = []

    prev_end = None
    for range_begin, range_value in self._tree.itemslice(begin, end):
      range_end = range_value[0]
      # Note that we can assume that we have a range beginning with |begin|
      # and a range ending with |end| (they may be the same range).
      # "is not None" so that a previous range ending at 0 is not mistaken
      # for "no previous range".
      if prev_end is not None and prev_end != range_begin:
        missing_ranges.append((prev_end, range_begin))
      prev_end = range_end

    # Fill the gaps only after the scan so the tree is not modified while it
    # is being iterated.
    for missing_begin, missing_end in missing_ranges:
      self._tree[missing_begin] = (missing_end, self._attr())

    for range_begin, range_value in self._tree.itemslice(begin, end):
      yield range_begin, range_value[0], range_value[1]

  def __str__(self):
    return str(self._tree)
Пример #4
0
# Two tables keyed by weight, each seeded with the base state
# "weight 0 -> value 0".  `prev` holds the results for the items processed so
# far and `cur` is rebuilt for every item.
# NOTE(review): `prev` is created outside this excerpt (presumably another
# FastRBTree), and the loop looks like a sparse 0/1-knapsack DP -- confirm.
prev[ 0 ] = 0
cur = FastRBTree()
cur[ 0 ] = 0
i = 0
for elem in newInput:
   # Item value and item weight (see the comments further down).
   v = elem[ 2 ]
   w = elem[ 1 ]
   # Leftover alternative input path and debug print, kept as found:
   #for line in fin:
   #info = line.split()
   #v = int( info[ 0 ] )
   #w = int( info[ 1 ] )
   #print newItems - i, (v,w), len( prev )#, len( testSet )
   i += 1
   # Visit every weight recorded in prev.
   for stepWeight in prev:
      # step = [weight, value recorded for that weight in prev]
      step = [ stepWeight, prev[ stepWeight ] ]
      # Value stored in cur under the largest key <= this weight.
      curv = cur.floor_item( step[ 0 ] )[ 1 ]
      maxv = max( step[ 1 ], curv )
      # compare prev and cur on same weight; carry the prev entry over when
      # it is at least as good
      if maxv == step[ 1 ]:
         cur[ step[ 0 ] ] = maxv

      nextw = step[ 0 ] + w
      # using step weight as base, compare value of
      # prev val( step weight ) + item val --> with current item
      # and prev val( step weight + item weight ) --> without current item
      # Only weights strictly below `size` (defined outside this excerpt) are
      # recorded.
      if nextw < size and prev.floor_item( nextw )[ 1 ] < step[ 1 ] + v:
         cur[ nextw ] = step[ 1 ] + v
   # The finished table becomes the baseline for the next item.
   prev = cur
   cur = FastRBTree()
   cur[ 0 ] = 0
Пример #5
0
class ExclusiveRangeDict(object):
    """A class like dict whose key is a range [begin, end) of integers.

    It has an attribute for each range of integers, for example:
    [10, 20) => Attribute(0),
    [20, 40) => Attribute(1),
    [40, 50) => Attribute(2),
    ...

    An instance of this class is accessed only via iter_range(begin, end).
    The instance is accessed as follows:

    1) If the given range [begin, end) is not covered by the instance,
    the range is newly created and iterated.

    2) If the given range [begin, end) exactly covers ranges in the instance,
    the ranges are iterated.
    (See test_set() in tests/range_dict_tests.py.)

    3) If the given range [begin, end) starts at and/or ends at a mid-point of
    an existing range, the existing range is split by the given range, and
    ranges in the given range are iterated.  For example, consider a case that
    [25, 45) is given to an instance of [20, 30), [30, 40), [40, 50).  In this
    case, [20, 30) is split into [20, 25) and [25, 30), and [40, 50) into
    [40, 45) and [45, 50).  Then, [25, 30), [30, 40), [40, 45) are iterated.
    (See test_split() in tests/range_dict_tests.py.)

    4) If the given range [begin, end) includes non-existing ranges in an
    instance, the gaps are filled with new ranges, and all ranges are
    iterated.  For example, consider a case that [25, 50) is given to an
    instance of [30, 35) and [40, 45).  In this case, [25, 30), [35, 40) and
    [45, 50) are created in the instance, and then [25, 30), [30, 35),
    [35, 40), [40, 45) and [45, 50) are iterated.
    (See test_fill() in tests/range_dict_tests.py.)
    """
    class RangeAttribute(object):
        """Default per-range attribute; carries no data."""

        def __init__(self):
            pass

        def __str__(self):
            return '<RangeAttribute>'

        def __repr__(self):
            return '<RangeAttribute>'

        def copy(self):  # pylint: disable=R0201
            return ExclusiveRangeDict.RangeAttribute()

    def __init__(self, attr=RangeAttribute):
        # The tree maps range begin -> (range end, attribute).  |attr| is
        # called without arguments to make the attribute of each new range.
        self._tree = FastRBTree()
        self._attr = attr

    def iter_range(self, begin=None, end=None):
        """Yield (begin, end, attribute) for every range inside [begin, end).

        Existing ranges crossing |begin| or |end| are split and gaps are
        filled with new ranges, as described in the class docstring.  This
        is a generator, so the tree is modified only once iteration starts.

        Args:
            begin: Inclusive lower bound.  None means the smallest key in
                the tree.
            end: Exclusive upper bound.  None means the end of the range
                that has the largest key.
        """
        # Compare against None explicitly: 0 is a valid bound and must not
        # be replaced by the tree's own limits.
        if begin is None:
            begin = self._tree.min_key()
        if end is None:
            end = self._tree.max_item()[1][0]

        # Assume that self._tree has at least one element.
        if self._tree.is_empty():
            self._tree[begin] = (end, self._attr())

        # Create a beginning range (border)
        try:
            bound_begin, bound_value = self._tree.floor_item(begin)
            bound_end = bound_value[0]
            if begin >= bound_end:
                # Create a blank range.
                try:
                    new_end, _ = self._tree.succ_item(bound_begin)
                except KeyError:
                    new_end = end
                self._tree[begin] = (min(end, new_end), self._attr())
            elif bound_begin < begin and begin < bound_end:
                # Split the existing range.
                new_end = bound_value[0]
                new_value = bound_value[1]
                self._tree[bound_begin] = (begin, new_value.copy())
                self._tree[begin] = (new_end, new_value.copy())
            else:  # bound_begin == begin
                # Do nothing (just saying it clearly since this part is
                # confusing)
                pass
        except KeyError:  # begin is less than the smallest element.
            # Create a blank range.
            # Note that we can assume self._tree has at least one element.
            self._tree[begin] = (min(end, self._tree.min_key()), self._attr())

        # Create an ending range (border)
        try:
            bound_begin, bound_value = self._tree.floor_item(end)
            bound_end = bound_value[0]
            if end > bound_end:
                # Create a blank range.
                new_begin = bound_end
                self._tree[new_begin] = (end, self._attr())
            elif bound_begin < end and end < bound_end:
                # Split the existing range.
                new_end = bound_value[0]
                new_value = bound_value[1]
                self._tree[bound_begin] = (end, new_value.copy())
                self._tree[end] = (new_end, new_value.copy())
            else:  # bound_begin == end or bound_end == end
                # Do nothing (just saying it clearly since this part is
                # confusing)
                pass
        except KeyError:  # end is less than the smallest element.
            # It must not happen.  A blank range [begin,end) has already been
            # created even if [begin,end) is less than the smallest range.
            raise

        missing_ranges = []

        prev_end = None
        for range_begin, range_value in self._tree.itemslice(begin, end):
            range_end = range_value[0]
            # Note that we can assume that we have a range beginning with
            # |begin| and a range ending with |end| (they may be the same
            # range).  "is not None" so that a previous range ending at 0 is
            # not mistaken for "no previous range".
            if prev_end is not None and prev_end != range_begin:
                missing_ranges.append((prev_end, range_begin))
            prev_end = range_end

        # Fill the gaps only after the scan so the tree is not modified
        # while it is being iterated.
        for missing_begin, missing_end in missing_ranges:
            self._tree[missing_begin] = (missing_end, self._attr())

        for range_begin, range_value in self._tree.itemslice(begin, end):
            yield range_begin, range_value[0], range_value[1]

    def __str__(self):
        return str(self._tree)
Пример #6
0
class RangeSet(object):
    """Set of integers kept as inclusive ``(start, end)`` ranges.

    ``self.tree`` maps each range start to its inclusive end.  ``add``
    coalesces ranges that overlap or touch, and ``remove`` trims or splits
    the ranges it cuts through.
    """

    def __init__(self, ranges):
        """Build the set from an iterable of ``(start, end)`` pairs."""
        self.tree = FastRBTree()

        for r in ranges:
            self.add(r)

    def add(self, rng):
        """Insert the inclusive range ``rng``, merging with its neighbours."""
        start, end = rng
        # First and last tree keys swallowed by the merged range.
        first_key, last_key = (None, None)
        try:
            ls, le = self.tree.floor_item(start)
        except KeyError:
            pass
        else:
            # ls <= start here.  Merge with that range when it overlaps the
            # new one or ends directly in front of it.
            if le >= start - 1:
                last_key = first_key = ls
                start = ls

        # end + 2: also pick up a range that starts directly after `end`.
        for s, e in self.tree[start:end + 2].items():
            if first_key is None:
                first_key = s
            last_key = s
            if e > end:
                end = e

        if first_key is not None:
            del self.tree[first_key:last_key + 1]
        self.tree[start] = end

    def remove(self, rng):
        """Remove every integer of the inclusive range ``rng`` from the set."""
        start, end = rng
        last_key = end
        try:
            ls, le = self.tree.floor_item(start)
        except KeyError:
            pass
        else:
            # Truncate an initial range, if any, and keep its tail when it
            # reaches past the removed span.
            if ls < start and le >= start:
                self.tree[ls] = start - 1
                if le > end:
                    self.tree[end + 1] = le

        # Snapshot the slice first: the loop inserts into the tree, which
        # must not happen while a live tree iterator is running.
        for s, e in list(self.tree[start:end + 1].items()):
            last_key = s
            if e > end:
                # Keep the part that sticks out behind the removed span.
                self.tree[end + 1] = e

        del self.tree[start:last_key + 1]

    def range_containing(self, pos):
        """Return the ``(start, end)`` range holding ``pos``, or ``None``."""
        try:
            ls, le = self.tree.floor_item(pos)
        except KeyError:
            # No range starts at or before pos.
            return None
        if ls <= pos and le >= pos:
            return (ls, le)
        return None

    def __repr__(self):
        return 'RangeSet([%s])' % ', '.join(
            ['(%s, %s)' % (s, e) for s, e in self.tree.items()])

    def __iter__(self):
        """Iterate over the tree itself (its keys, i.e. the range starts)."""
        return iter(self.tree)

    def items(self):
        """Return the ``(start, end)`` pairs stored in the tree."""
        return self.tree.items()