Exemplo n.º 1
0
class ExclusiveRangeDict(object):
  """A class like dict whose key is a range [begin, end) of integers.

  It has an attribute for each range of integers, for example:
  [10, 20) => Attribute(0),
  [20, 40) => Attribute(1),
  [40, 50) => Attribute(2),
  ...

  An instance of this class is accessed only via iter_range(begin, end).
  The instance is accessed as follows:

  1) If the given range [begin, end) is not covered by the instance,
  the range is newly created and iterated.

  2) If the given range [begin, end) exactly covers ranges in the instance,
  the ranges are iterated.
  (See test_set() in tests/range_dict_tests.py.)

  3) If the given range [begin, end) starts at and/or ends at a mid-point of
  an existing range, the existing range is split by the given range, and
  ranges in the given range are iterated.  For example, consider a case that
  [25, 45) is given to an instance of [20, 30), [30, 40), [40, 50).  In this
  case, [20, 30) is split into [20, 25) and [25, 30), and [40, 50) into
  [40, 45) and [45, 50).  Then, [25, 30), [30, 40), [40, 45) are iterated.
  (See test_split() in tests/range_dict_tests.py.)

  4) If the given range [begin, end) includes non-existing ranges in an
  instance, the gaps are filled with new ranges, and all ranges are iterated.
  For example, consider a case that [25, 50) is given to an instance of
  [30, 35) and [40, 45).  In this case, [25, 30), [35, 40) and [45, 50) are
  created in the instance, and then [25, 30), [30, 35), [35, 40), [40, 45)
  and [45, 50) are iterated.
  (See test_fill() in tests/range_dict_tests.py.)

  Internally the tree maps each range's begin to an (end, attribute) tuple.
  """
  class RangeAttribute(object):
    """Default attribute attached to a range; it carries no state."""

    def __init__(self):
      pass

    def __str__(self):
      return '<RangeAttribute>'

    def __repr__(self):
      return '<RangeAttribute>'

    def copy(self):  # pylint: disable=R0201
      """Returns a new attribute; called for each half of a split range."""
      return ExclusiveRangeDict.RangeAttribute()

  def __init__(self, attr=RangeAttribute):
    """Creates an empty instance.

    Args:
      attr: A zero-argument callable that builds the attribute of a newly
          created range.  The objects it returns must provide copy(), which
          is used when a range is split.
    """
    self._tree = FastRBTree()
    self._attr = attr

  def iter_range(self, begin=None, end=None):
    """Iterates over the ranges that cover [begin, end).

    Ranges are split at |begin| and |end| and gaps inside [begin, end) are
    filled with new ranges before anything is yielded.  This is a generator,
    so none of that happens until the first item is requested.

    Args:
      begin: Inclusive lower bound.  None means the smallest key in the tree.
      end: Exclusive upper bound.  None means the end of the last range.

    Yields:
      (range_begin, range_end, attribute) for each range in [begin, end),
      in the order returned by the tree's itemslice().
    """
    # Compare with None explicitly: 0 is a valid boundary and must not be
    # mistaken for "not given".
    if begin is None:
      begin = self._tree.min_key()
    if end is None:
      end = self._tree.max_item()[1][0]

    # The code below assumes that self._tree has at least one element.
    if self._tree.is_empty():
      self._tree[begin] = (end, self._attr())

    # Create a beginning range (border)
    try:
      bound_begin, bound_value = self._tree.floor_item(begin)
      bound_end = bound_value[0]
      if begin >= bound_end:
        # |begin| is past the floor range: create a blank range that stops at
        # the next existing range or at |end|, whichever comes first.
        try:
          new_end, _ = self._tree.succ_item(bound_begin)
        except KeyError:
          new_end = end
        self._tree[begin] = (min(end, new_end), self._attr())
      elif bound_begin < begin < bound_end:
        # Split the existing range.
        new_end = bound_value[0]
        new_value = bound_value[1]
        self._tree[bound_begin] = (begin, new_value.copy())
        self._tree[begin] = (new_end, new_value.copy())
      else:  # bound_begin == begin
        # Do nothing (just saying it clearly since this part is confusing)
        pass
    except KeyError:  # begin is less than the smallest element.
      # Create a blank range.
      # Note that we can assume self._tree has at least one element.
      self._tree[begin] = (min(end, self._tree.min_key()), self._attr())

    # Create an ending range (border)
    try:
      bound_begin, bound_value = self._tree.floor_item(end)
      bound_end = bound_value[0]
      if end > bound_end:
        # Create a blank range.
        new_begin = bound_end
        self._tree[new_begin] = (end, self._attr())
      elif bound_begin < end < bound_end:
        # Split the existing range.
        new_end = bound_value[0]
        new_value = bound_value[1]
        self._tree[bound_begin] = (end, new_value.copy())
        self._tree[end] = (new_end, new_value.copy())
      else:  # end == bound_begin or end == bound_end
        # Do nothing (just saying it clearly since this part is confusing)
        pass
    except KeyError:  # end is less than the smallest element.
      # It must not happen.  A blank range [begin,end) has already been created
      # even if [begin,end) is less than the smallest range.
      raise

    missing_ranges = []

    prev_end = None
    for range_begin, range_value in self._tree.itemslice(begin, end):
      range_end = range_value[0]
      # Note that we can assume that we have a range beginning with |begin|
      # and a range ending with |end| (they may be the same range).
      # "is not None" so that a previous range ending at 0 still counts.
      if prev_end is not None and prev_end != range_begin:
        missing_ranges.append((prev_end, range_begin))
      prev_end = range_end

    # Fill the gaps only after the scan so the tree is not modified while it
    # is being iterated.
    for missing_begin, missing_end in missing_ranges:
      self._tree[missing_begin] = (missing_end, self._attr())

    for range_begin, range_value in self._tree.itemslice(begin, end):
      yield range_begin, range_value[0], range_value[1]

  def __str__(self):
    return str(self._tree)
Exemplo n.º 2
0
class OrderBook(object):
    """
    Level-2 order book for a single product, kept in two RBTrees.

    ``_asks`` and ``_bids`` each map a price (Decimal) to the size (Decimal)
    stored for that price.
    """
    def __init__(self, product_id: str):
        """
        :param product_id: only messages carrying this product id are applied
        """
        self._asks = RBTree()
        self._bids = RBTree()
        self._product_id = product_id

    @property
    def product_id(self):
        """The product id this book accepts messages for."""
        return self._product_id

    def process_snapshot(self, message: Dict):
        """
        Process a snapshot message: drop the current book and rebuild it.

        :param message: dict with 'asks' and 'bids', each an iterable of
            (price, size) pairs that ``Decimal`` can parse
        """

        # If a snapshot is sent reset trees
        self._asks = RBTree()
        self._bids = RBTree()

        # Decimal keeps the precision of the incoming values
        for price, size in message['asks']:
            self._asks.insert(Decimal(price), Decimal(size))

        for price, size in message['bids']:
            self._bids.insert(Decimal(price), Decimal(size))

    def process_update(self, message: Dict):
        """
        Process an update message.

        :param message: dict whose 'changes' entry is an iterable of
            (side, price, size) triples; side is 'buy' or 'sell', any other
            side is ignored
        """
        for side, price, size in message['changes']:
            # parse numbers and keep precision
            price = Decimal(price)
            size = Decimal(size)

            if side == 'buy':
                book = self._bids
            elif side == 'sell':
                book = self._asks
            else:
                continue

            if size <= 0:
                # A size of 0 (or less) means the level no longer exists.
                # discard() instead of remove(): an update for a level that
                # is not in the book must not raise KeyError.
                book.discard(price)
            else:
                book.insert(price, size)

    def process_message(self, message: Dict):
        """
        Dispatch a message to the matching handler.

        Messages for another product, and message types other than
        'snapshot' and 'l2update', are ignored.

        :param message: dict with a 'type' key and, normally, 'product_id'
        """
        # Read type
        msg_type = message['type']

        # dropped - not same product id
        if message.get('product_id', None) != self._product_id:
            return

        if msg_type == 'snapshot':
            self.process_snapshot(message)
        elif msg_type == 'l2update':
            self.process_update(message)

    def get_asks(self) -> List[List[float]]:
        """
        Provides a list of asks and sizes in order of best price for the buyer

        :return: a list of [price, size] pairs as floats, in the tree's
            iteration order
        """
        asks = []
        for ask in self._asks:
            try:
                size = self._asks[ask]
            except KeyError:
                # the key disappeared between iteration and lookup; skip it
                continue
            asks.append([float(ask), float(size)])
        return asks

    def get_bids(self) -> List[List[float]]:
        """
        Provides a list of bids and sizes in order of best price for the seller

        :return: a list of [price, size] pairs as floats, in the reverse of
            the tree's iteration order
        """
        bids = []
        for bid in self._bids:
            try:
                size = self._bids[bid]
            except KeyError:
                # the key disappeared between iteration and lookup; skip it
                continue
            bids.append([float(bid), float(size)])
        # For bids the best value (for selling) is the highest price, so flip
        # the order once at the end instead of inserting at index 0 each time.
        bids.reverse()
        return bids

    def get_orders(self) -> Dict[str, List[List[float]]]:
        """
        Uses get_bids and get_asks to compile all orders

        :return: both bids and asks
        """
        return {'asks': self.get_asks(), 'bids': self.get_bids()}

    def get_ask(self) -> Tuple[Decimal, Decimal]:
        """
        Get the best (lowest) asking price.

        The price comes from the tree's ``min_key()``; an empty book is not
        handled here, so whatever ``min_key()`` raises propagates.

        :return: the rate and the size; the size is 0 if the follow-up lookup
            of that price raises KeyError
        """
        price = self._asks.min_key()

        try:
            size = self._asks[price]
        except KeyError:
            return price, Decimal(0)

        return price, size

    def get_bid(self) -> Tuple[Decimal, Decimal]:
        """
        Get the best (highest) bid price.

        The price comes from the tree's ``max_key()``; an empty book is not
        handled here, so whatever ``max_key()`` raises propagates.

        :return: the rate and the size; the size is 0 if the follow-up lookup
            of that price raises KeyError
        """
        price = self._bids.max_key()

        try:
            size = self._bids[price]
        except KeyError:
            return price, Decimal(0)

        return price, size
Exemplo n.º 3
0
class TradeTree(object):
    '''A red-black tree used to store trade lists keyed by price.

    The exchange will be using the TradeTree to hold bid and ask data (one TradeTree for each side).
    Keeping the information in a red black tree makes it easier/faster to detect a match.

    price_tree maps a price to the LinkedList of trades at that price;
    trade_map maps a trade id to whatever LinkedList.append_item() returned
    for that trade.
    '''
    def __init__(self):
        self.price_tree = FastRBTree()
        self.trade_map = {}
        self.num_trades = 0  # Contains count of Orders in tree
        self.depth = 0  # Number of different prices in tree (http://en.wikipedia.org/wiki/trade_book_(trading)#Book_depth)

    def __len__(self):
        return len(self.trade_map)

    def get_price_list(self, price):
        '''Return the trade list stored at price, or an empty list if none.'''
        return self.price_tree.get(price, [])

    def get_trade(self, trade_id):
        '''Return the entry stored for trade_id, or None if it is unknown.'''
        return self.trade_map.get(trade_id)

    def create_price(self, price):
        '''Add an empty trade list for price and count the new depth level.'''
        self.depth += 1  # Add a price depth level to the tree
        self.price_tree.insert(price, LinkedList())

    def remove_price(self, price):
        '''Drop the price level from the tree and from the depth count.'''
        self.depth -= 1  # Remove a price depth level
        self.price_tree.remove(price)

    def price_exists(self, price):
        return price in self.price_tree

    def trade_exists(self, trade_id):
        return trade_id in self.trade_map

    def insert_trade(self, xtrade):
        '''Append xtrade to the list at its limit price.

        A trade whose id is already stored is ignored.  The price level is
        created on demand.
        '''
        if self.trade_exists(xtrade.id):
            return
        if not self.price_exists(xtrade.limit_price):
            # If price not in Price Map, create a node in RBtree
            self.create_price(xtrade.limit_price)
        # Add the trade to the list at its price and remember the reference
        # that append_item() returns.
        self.trade_map[xtrade.id] = (
            self.price_tree[xtrade.limit_price].append_item(xtrade))
        # Count only after the trade has actually been stored.
        self.num_trades += 1

    def remove_trade(self, xtrade):
        '''Remove xtrade and drop its price level once that level is empty.

        Raises KeyError if the trade id is not stored.
        '''
        # Look the trade up first so that an unknown id leaves the counters
        # untouched.
        trade_node = self.trade_map[xtrade.id]
        price_list = self.price_tree[xtrade.limit_price]
        price_list.remove_item(trade_node)
        if len(price_list) == 0:
            self.remove_price(xtrade.limit_price)
        del self.trade_map[xtrade.id]
        self.num_trades -= 1

    def max_price(self):
        '''Return the highest stored price, or None when there are none.'''
        if self.depth > 0:
            return self.price_tree.max_key()
        return None

    def min_price(self):
        '''Return the lowest stored price, or None when there are none.'''
        if self.depth > 0:
            return self.price_tree.min_key()
        return None

    def max_price_list(self):
        '''Return the trade list at the highest price, or None.'''
        if self.depth > 0:
            return self.get_price_list(self.max_price())
        return None

    def min_price_list(self):
        '''Return the trade list at the lowest price, or None.'''
        if self.depth > 0:
            return self.get_price_list(self.min_price())
        return None
Exemplo n.º 4
0
class LOBTree:
    def __init__(self):
        '''
        Limit order book tree implementation using Red-Black tree for self-balancing
        Each limit price level is a OrderLinkedlist, and each order contains information
        including id, price, timestamp, volume
        self.limit_levels: dict
            key: price level; value: OrderLinkedlist object
        self.order_ids: dict
            key: order id; value: Order object
            helps to locate order by id
        self.max_price / self.min_price:
            cached highest / lowest price level, None while the book is empty
        '''
        # tree that store price as keys and number of orders on that level as values
        self.price_tree = FastRBTree()
        self.max_price = None
        self.min_price = None
        self.limit_levels = {}
        self.order_ids = {}

    @property
    def max(self):
        return self.max_price

    @property
    def min(self):
        return self.min_price

    def _get_price(self, price):
        '''
        price: int
        :return: OrderLinkedlist instance
        Raises KeyError if there is no such price level.
        '''
        return self.limit_levels[price]

    def insert_order(self, order: Order):
        '''
        order: Order Instance
            If order price doesn't exist in the self.limit_levels, insert it into the price level,
            else update accordingly;
            Will be used as limit order submission
        :return: None
        Raises ValueError if an order with the same id is already stored.
        '''
        if order.id in self.order_ids:
            raise ValueError('order already exists in the book')

        if order.price not in self.limit_levels:
            # first order at this price: open the level and refresh the
            # cached extremes
            self.price_tree[order.price] = 1
            self.limit_levels[order.price] = OrderLinkedlist()
            if self.max_price is None or order.price > self.max_price:
                self.max_price = order.price
            if self.min_price is None or order.price < self.min_price:
                self.min_price = order.price
        else:
            self.price_tree[order.price] += 1

        price_level = self.limit_levels[order.price]
        price_level.set_head(order)
        price_level.size += order.size
        self.order_ids[order.id] = order

    def update_existing_order_size(self, order_id: int, updated_size: int):
        '''
        order_id: int
        size: int
            Update an existing order's size in a price level and its price level's overall size
        :return: None
        Raises KeyError if order_id is unknown (that lookup happens before
        the try block).  Any later failure is only logged.
        '''
        delta = self.order_ids[order_id].size - updated_size
        try:
            self.order_ids[order_id].size = updated_size
            order_price = self.order_ids[order_id].price
            # updated order will be put at the front of the list
            self.limit_levels[order_price].set_head(self.order_ids[order_id])
            self.limit_levels[order_price].size -= delta
        except Exception:
            LOG.info('Order is not in the book')

    def remove_order(self, order_id: int):
        '''
        order: Order Instance
            Remove the order from the self.order_ids first,
            then remove it from the self.limit_levels;
            if the limit_levels is empty after removal,
            adjust the max_price and min_price accordingly from the self.price_tree
        :return: Order Instance | order removed from the book
        Raises KeyError if order_id is unknown.
        '''
        popped = self.order_ids.pop(order_id)
        self.limit_levels[popped.price].remove(popped, decrement=True)
        self.price_tree[popped.price] -= 1
        if self.limit_levels[popped.price].size == 0:
            self._remove_price_level(popped.price)
        return popped

    def _remove_price_level(self, price: int):
        '''
        order: Order Instance
            Given a price level, remove the price level in the price_tree and limit_levels
            reset the max and min prices
        '''
        del self.limit_levels[price]
        self.price_tree.remove(price)
        # "except KeyError or ValueError" evaluates to "except KeyError", so
        # the error raised for an empty tree was never caught.  A tuple
        # catches both.
        if self.max_price == price:
            try:
                self.max_price = self.price_tree.max_key()
            except (KeyError, ValueError):
                self.max_price = None
        if self.min_price == price:
            try:
                self.min_price = self.price_tree.min_key()
            except (KeyError, ValueError):
                self.min_price = None

    def market_order(self, order: Order):
        '''
        order: Order Instance
            Consume resting orders, best price first, until order.size
            reaches 0 or no price level is left.  A bid starts from
            min_price, anything else from max_price.
        Raises ValueError if the book holds no price level at all.
        '''
        if len(self.limit_levels) == 0:
            raise ValueError('No orders in the book')

        is_bid = order.is_bid

        def best():
            # re-read on every round: removing a level moves the cached best
            return self.min_price if is_bid else self.max_price

        best_price = best()
        while order.size > 0 and best_price is not None:
            price_level = self._get_price(best_price)
            order.size, number_of_orders_deleted = price_level._consume_orders(
                order, self.order_ids)
            self.price_tree[best_price] -= number_of_orders_deleted
            if price_level._head is None:
                self._remove_price_level(best_price)
            best_price = best()

        if order.size != 0:
            if is_bid:
                LOG.warning('no more limit orders in the bid book')
            else:
                LOG.warning('no more orders in the ask book')

    def level_with_most_orders(self, range: int):
        '''
        range: int
            Gives the price level with the most orders on the top levels
        Not implemented yet.
        '''
        pass

    def iceberg(self):
        '''
        Iceberg order type
        Not implemented yet.
        '''
        pass
Exemplo n.º 5
0
class BidBook(object):
    """
    A BidBook is used to store the order book's rates and amounts on the bid side with a defined depth.
    To maintain a sorted order of rates, the BidBook uses a red-black tree to store rates and corresponding amounts.
    For O(1) query of volume at a predetermined rate, the BidBook also uses a dictionary to store rate and amount.
    """
    def __init__(self, max_depth, data):
        """
        :param max_depth: maximum number of rate levels kept by modify()
        :param data: iterable of levels; level[0] is the rate and level[1]
            the amount, both converted with float()
        """
        # RBTree: maintains sorted order of rates
        # every value inserted to RBTree must be a tuple, so we hard code the second value to be 0
        self.rate_tree = FastRBTree()

        # dict: Uses rate and amount for key value pairs
        self.rate_dict = {}

        # float: amounts summed across all rate levels in tree
        self.volume = 0

        # int: maximum number of rate levels in tree
        self.max_depth = max_depth

        # populate rate_tree and rate_dict from public API call data
        # set volume
        for level in data:
            rate = float(level[0])
            amount = float(level[1])
            self.rate_tree.insert(rate, 0)
            # a repeated rate replaces the earlier level instead of being
            # counted twice
            self.volume += amount - self.rate_dict.get(rate, 0)
            self.rate_dict[rate] = amount

        # int: total number of rate levels in tree (distinct rates only)
        self.depth = len(self.rate_dict)

    def __len__(self):
        return len(self.rate_dict)

    def rate_exists(self, rate):
        return rate in self.rate_dict

    def get_amount_at_rate(self, rate):
        """Return the amount stored at rate, or None if there is none."""
        return self.rate_dict.get(rate)

    def max_rate_level(self):
        """Return (rate, amount) of the highest rate, or None when empty."""
        if self.depth > 0:
            rate = self.rate_tree.max_key()
            amount = self.get_amount_at_rate(rate)
            return rate, amount
        else:
            return None

    def min_rate_level(self):
        """Return (rate, amount) of the lowest rate, or None when empty."""
        if self.depth > 0:
            rate = self.rate_tree.min_key()
            amount = self.get_amount_at_rate(rate)
            return rate, amount
        else:
            return None

    def modify(self, event):
        """
        Apply a modify event and keep ``volume`` in step with the book.

        :param event: mapping with event['data']['rate'] and
            event['data']['amount'], both converted with float()
        """
        rate = float(event[u'data'][u'rate'])
        amount = float(event[u'data'][u'amount'])

        # if the event's rate is already in the book, just modify the amount at the event's rate
        if self.rate_exists(rate):
            self.volume += amount - self.rate_dict[rate]
            self.rate_dict[rate] = amount

        # only rates not already in the book reach this logic
        # if the max depth hasn't been reached, just insert the event's rate and amount
        elif self.depth < self.max_depth:
            self.rate_tree.insert(rate, 0)
            self.rate_dict[rate] = amount
            self.depth += 1
            self.volume += amount

        # only events being handled by a full order tree reach this logic
        # if the event is a bid and the rate is greater than min rate, effectively replace min rate level with event
        else:
            min_level = self.min_rate_level()
            # min_level is None when the book is empty (e.g. max_depth of 0);
            # there is nothing to replace in that case
            if min_level is not None and rate > min_level[0]:
                min_rate = min_level[0]
                self.rate_tree.remove(min_rate)
                removed_amount = self.rate_dict.pop(min_rate)
                self.rate_tree.insert(rate, 0)
                self.rate_dict[rate] = amount
                self.volume += amount - removed_amount

    def remove(self, event):
        """
        Delete the level at the event's rate, if the book has it.

        :param event: mapping with event['data']['rate']
        """
        rate = float(event[u'data'][u'rate'])
        if self.rate_exists(rate):
            self.rate_tree.remove(rate)
            self.volume -= self.rate_dict.pop(rate)
            self.depth -= 1

    def __str__(self):
        # Iterating the tree yields the rate keys themselves; they must be
        # converted to str before joining.
        rate_tree_str = '[' + ','.join(str(rate)
                                       for rate in self.rate_tree) + ']'
        return 'BIDS: ' + rate_tree_str
Exemplo n.º 6
0
class PriceTree(object):
    """One side of a book: price levels in a tree plus lookup dicts.

    ``price_map`` maps price -> OrderList and ``order_map`` maps order id ->
    Order.  ``min_price`` / ``max_price`` cache the extreme prices and are
    None while no price is stored.
    """

    def __init__(self, name):
        self.name = name
        self.tree = FastRBTree()
        self.price_map = {}  # Map price -> OrderList
        self.order_map = {}  # Map order_id -> Order
        self.min_price = None
        self.max_price = None

    def insert_price(self, price):
        """
        Create an empty OrderList for a price and register it in the tree
        and in the price map; widen the cached min/max if needed
        :param price:
        :return:
        """
        level = OrderList()
        self.tree.insert(price, level)
        self.price_map[price] = level
        if self.max_price is None or price > self.max_price:
            self.max_price = price
        if self.min_price is None or price < self.min_price:
            self.min_price = price

    def remove_price(self, price):
        """
        Drop a price level: tree node, the orders listed at that price, and
        the price-map entry; then refresh the cached min/max if they pointed
        at the removed price
        :param price:
        :return:
        """
        self.tree.remove(price)
        # Every order still listed at this price is dropped from the order
        # mapping before the level itself goes away
        for stale in self.price_map[price]:
            del self.order_map[stale.id]
        del self.price_map[price]

        # max first, then min; an empty tree (ValueError) resets to None
        for bound, lookup in (('max_price', 'max_key'),
                              ('min_price', 'min_key')):
            if getattr(self, bound) == price:
                try:
                    refreshed = getattr(self.tree, lookup)()
                except ValueError:
                    refreshed = None
                setattr(self, bound, refreshed)

    def insert_price_order(self, order):
        """Store an order, creating its price level on first use."""
        if not self.price_exists(order.price):
            self.insert_price(order.price)
        # Add order to its price list and keep it in the order mapping
        self.price_map[order.price].add(order)
        self.order_map[order.id] = order

    def match_price_order(self, curr_order):
        """Match curr_order against this side, best price first.

        A bid is matched from the lowest stored price upwards, anything else
        from the highest downwards, for as long as the order's price still
        reaches the level and its peak_size is positive.

        :return: list of whatever the OrderList.match_order calls returned
        """
        if not self.price_map:
            return []

        def best_level():
            # if bid -> this tree's min, otherwise -> this tree's max
            return self.min if curr_order.is_bid else self.max

        def reaches(level):
            if curr_order.is_bid:
                return curr_order.price >= level
            return curr_order.price <= level

        trades = []
        level = best_level()
        while reaches(level) and curr_order.peak_size > 0:
            resting = self.get_price(level)
            trades.extend(resting.match_order(curr_order, self.order_map))
            if resting.size != 0:
                continue
            # Level exhausted: remove it and move on to the next best one
            self.remove_price(level)
            if not self.price_map:
                break
            level = best_level()

        return trades

    def price_exists(self, price):
        return price in self.price_map

    def order_exists(self, id_num):
        return id_num in self.order_map

    def get_price(self, price):
        """Return the OrderList at price; KeyError if there is none."""
        return self.price_map[price]

    def get_order(self, id_num):
        """Return the order with this id; KeyError if it is unknown."""
        return self.order_map[id_num]

    @property
    def max(self):
        return self.max_price

    @property
    def min(self):
        return self.min_price
Exemplo n.º 7
0
class ExclusiveRangeDict(object):
    """A class like dict whose key is a range [begin, end) of integers.

    It has an attribute for each range of integers, for example:
    [10, 20) => Attribute(0),
    [20, 40) => Attribute(1),
    [40, 50) => Attribute(2),
    ...

    An instance of this class is accessed only via iter_range(begin, end).
    The instance is accessed as follows:

    1) If the given range [begin, end) is not covered by the instance,
    the range is newly created and iterated.

    2) If the given range [begin, end) exactly covers ranges in the instance,
    the ranges are iterated.
    (See test_set() in tests/range_dict_tests.py.)

    3) If the given range [begin, end) starts at and/or ends at a mid-point of
    an existing range, the existing range is split by the given range, and
    ranges in the given range are iterated.  For example, consider a case that
    [25, 45) is given to an instance of [20, 30), [30, 40), [40, 50).  In this
    case, [20, 30) is split into [20, 25) and [25, 30), and [40, 50) into
    [40, 45) and [45, 50).  Then, [25, 30), [30, 40), [40, 45) are iterated.
    (See test_split() in tests/range_dict_tests.py.)

    4) If the given range [begin, end) includes non-existing ranges in an
    instance, the gaps are filled with new ranges, and all ranges are iterated.
    For example, consider a case that [25, 50) is given to an instance of
    [30, 35) and [40, 45).  In this case, [25, 30), [35, 40) and [45, 50) are
    created in the instance, and then [25, 30), [30, 35), [35, 40), [40, 45)
    and [45, 50) are iterated.
    (See test_fill() in tests/range_dict_tests.py.)

    Internally the tree maps each range's begin to an (end, attribute) tuple.
    """
    class RangeAttribute(object):
        """Default attribute attached to a range; it carries no state."""

        def __init__(self):
            pass

        def __str__(self):
            return '<RangeAttribute>'

        def __repr__(self):
            return '<RangeAttribute>'

        def copy(self):  # pylint: disable=R0201
            """Returns a new attribute; called for each half of a split."""
            return ExclusiveRangeDict.RangeAttribute()

    def __init__(self, attr=RangeAttribute):
        """Creates an empty instance.

        Args:
          attr: A zero-argument callable that builds the attribute of a newly
              created range.  The objects it returns must provide copy(),
              which is used when a range is split.
        """
        self._tree = FastRBTree()
        self._attr = attr

    def iter_range(self, begin=None, end=None):
        """Iterates over the ranges that cover [begin, end).

        Ranges are split at |begin| and |end| and gaps inside [begin, end)
        are filled with new ranges before anything is yielded.  This is a
        generator, so none of that happens until the first item is requested.

        Args:
          begin: Inclusive lower bound.  None means the smallest key in the
              tree.
          end: Exclusive upper bound.  None means the end of the last range.

        Yields:
          (range_begin, range_end, attribute) for each range in [begin, end),
          in the order returned by the tree's itemslice().
        """
        # Compare with None explicitly: 0 is a valid boundary and must not be
        # mistaken for "not given".
        if begin is None:
            begin = self._tree.min_key()
        if end is None:
            end = self._tree.max_item()[1][0]

        # The code below assumes that self._tree has at least one element.
        if self._tree.is_empty():
            self._tree[begin] = (end, self._attr())

        # Create a beginning range (border)
        try:
            bound_begin, bound_value = self._tree.floor_item(begin)
            bound_end = bound_value[0]
            if begin >= bound_end:
                # |begin| is past the floor range: create a blank range that
                # stops at the next existing range or at |end|, whichever
                # comes first.
                try:
                    new_end, _ = self._tree.succ_item(bound_begin)
                except KeyError:
                    new_end = end
                self._tree[begin] = (min(end, new_end), self._attr())
            elif bound_begin < begin < bound_end:
                # Split the existing range.
                new_end = bound_value[0]
                new_value = bound_value[1]
                self._tree[bound_begin] = (begin, new_value.copy())
                self._tree[begin] = (new_end, new_value.copy())
            else:  # bound_begin == begin
                # Do nothing (just saying it clearly since this part is confusing)
                pass
        except KeyError:  # begin is less than the smallest element.
            # Create a blank range.
            # Note that we can assume self._tree has at least one element.
            self._tree[begin] = (min(end, self._tree.min_key()), self._attr())

        # Create an ending range (border)
        try:
            bound_begin, bound_value = self._tree.floor_item(end)
            bound_end = bound_value[0]
            if end > bound_end:
                # Create a blank range.
                new_begin = bound_end
                self._tree[new_begin] = (end, self._attr())
            elif bound_begin < end < bound_end:
                # Split the existing range.
                new_end = bound_value[0]
                new_value = bound_value[1]
                self._tree[bound_begin] = (end, new_value.copy())
                self._tree[end] = (new_end, new_value.copy())
            else:  # end == bound_begin or end == bound_end
                # Do nothing (just saying it clearly since this part is confusing)
                pass
        except KeyError:  # end is less than the smallest element.
            # It must not happen.  A blank range [begin,end) has already been
            # created even if [begin,end) is less than the smallest range.
            raise

        missing_ranges = []

        prev_end = None
        for range_begin, range_value in self._tree.itemslice(begin, end):
            range_end = range_value[0]
            # Note that we can assume that we have a range beginning with
            # |begin| and a range ending with |end| (they may be the same
            # range).  "is not None" so that a previous range ending at 0
            # still counts.
            if prev_end is not None and prev_end != range_begin:
                missing_ranges.append((prev_end, range_begin))
            prev_end = range_end

        # Fill the gaps only after the scan so the tree is not modified while
        # it is being iterated.
        for missing_begin, missing_end in missing_ranges:
            self._tree[missing_begin] = (missing_end, self._attr())

        for range_begin, range_value in self._tree.itemslice(begin, end):
            yield range_begin, range_value[0], range_value[1]

    def __str__(self):
        return str(self._tree)
Exemplo n.º 8
0
class TDigest(object):
    """Sketch of a distribution kept as weighted centroids in an ``RBTree``.

    Centroids live in ``self.C`` keyed by their mean, and ``self.n`` is the
    total weight passed to :meth:`update`.  ``delta`` scales the per-centroid
    size limit computed by :meth:`_theshold`; a compression pass runs whenever
    the number of centroids exceeds ``K / delta``.
    """

    def __init__(self, delta=0.01, K=25):
        self.C = RBTree()
        self.n = 0
        self.delta = delta
        self.K = K

    def __add__(self, other_digest):
        """Return a new digest fed with the centroids of both operands.

        The result uses this digest's ``delta`` and ``K``.  Centroids are
        replayed in the order yielded by ``pyudorandom.items``.
        """
        merged = TDigest(self.delta, self.K)
        data = list(chain(self.C.values(), other_digest.C.values()))

        # The guard keeps pyudorandom.items() from being called on an empty list.
        if data:
            for c in pyudorandom.items(data):
                merged.update(c.mean, c.count)

        return merged

    def __len__(self):
        """Return the number of centroids currently stored."""
        return len(self.C)

    def __repr__(self):
        return """<T-Digest: n=%d, centroids=%d>""" % (self.n, len(self))

    def _add_centroid(self, centroid):
        """Store ``centroid``, folding it into an existing one with the same mean."""
        mean = centroid.mean
        if mean in self.C:
            self.C[mean].update(mean, centroid.count)
        else:
            self.C.insert(mean, centroid)

    def _compute_centroid_quantile(self, centroid):
        """Return ``(count / 2 + preceding weight) / n`` for ``centroid``.

        The preceding weight is the summed ``count`` of the centroids that
        ``value_slice(-inf, centroid.mean)`` yields; whether the end key is
        included is decided by the tree implementation.
        """
        preceding = self.C.value_slice(-float('Inf'), centroid.mean)
        cumulative_sum = sum(c_i.count for c_i in preceding)
        return (centroid.count / 2. + cumulative_sum) / self.n

    def _update_centroid(self, centroid, x, w):
        """Apply ``centroid.update(x, w)`` and re-file the centroid in the tree.

        The centroid is removed under its old mean first and re-added
        afterwards, presumably because ``update`` changes its mean (the key).
        """
        self.C.pop(centroid.mean)
        centroid.update(x, w)
        self._add_centroid(centroid)

    def _find_closest_centroids(self, x):
        """Return a list with the centroid(s) whose key is nearest to ``x``.

        Two centroids are returned only when distinct floor and ceiling keys
        are exactly equidistant from ``x``.  Raises ``KeyError`` if the tree
        offers neither a floor nor a ceiling key.
        """
        tree = self.C
        try:
            above = tree.ceiling_key(x)
        except KeyError:
            # No ceiling key for x: the floor is the only candidate.
            return [tree[tree.floor_key(x)]]

        try:
            below = tree.floor_key(x)
        except KeyError:
            # No floor key for x: the ceiling found above is the only candidate.
            return [tree[above]]

        gap_below = abs(below - x)
        gap_above = abs(above - x)
        if gap_below < gap_above:
            return [tree[below]]
        if gap_below == gap_above and above != below:
            return [tree[above], tree[below]]
        return [tree[above]]

    def _theshold(self, q):
        """Return the centroid size limit ``4 * n * delta * q * (1 - q)``."""
        return 4 * self.n * self.delta * q * (1 - q)

    def update(self, x, w=1):
        """
        Update the t-digest with value x and weight w.

        The weight is added to ``self.n`` first.  The nearest centroid(s) are
        then tried in random order: a candidate is skipped when absorbing
        ``w`` would push its count above its threshold, otherwise it absorbs
        the weight.  Weight that no candidate absorbed becomes a new centroid.
        Finally the digest is compressed if it holds more than ``K / delta``
        centroids.
        """
        self.n += w

        if len(self) == 0:
            self._add_centroid(Centroid(x, w))
            return

        S = self._find_closest_centroids(x)

        while S and w > 0:
            # Every candidate is examined at most once, so remove it up front.
            c_j = S.pop(choice(range(len(S))))

            q = self._compute_centroid_quantile(c_j)
            limit = self._theshold(q)

            # This filters out the centroids that do not satisfy the second
            # part of the definition of S. See original paper by Dunning.
            if c_j.count + w > limit:
                continue

            delta_w = min(limit - c_j.count, w)
            self._update_centroid(c_j, x, delta_w)
            w -= delta_w

        if w > 0:
            self._add_centroid(Centroid(x, w))

        if len(self) > self.K / self.delta:
            self.compress()

        return

    def batch_update(self, values, w=1):
        """
        Update the t-digest with an iterable of values, then compress.

        This assumes all points have the same weight ``w``.
        """
        for x in values:
            self.update(x, w)
        self.compress()
        return

    def compress(self):
        """Rebuild the centroid tree by replaying all centroids into a scratch digest.

        Only ``self.C`` is replaced; ``self.n`` is left untouched.
        """
        scratch = TDigest(self.delta, self.K)
        for c_i in pyudorandom.items(list(self.C.values())):
            scratch.update(c_i.mean, c_i.count)
        self.C = scratch.C

    def percentile(self, p):
        """
        Computes the percentile of a specific value in [0,100].

        Raises ``ValueError`` when ``p`` lies outside ``[0, 100]``.  The first
        and last centroids answer with their mean; interior centroids are
        interpolated using half the distance between their neighbours' means.
        """
        if not (0 <= p <= 100):
            raise ValueError("p must be between 0 and 100, inclusive.")

        target = float(p) / 100.
        target *= self.n
        last = len(self) - 1
        seen = 0

        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            k = c_i.count
            if target < seen + k:
                if i == 0 or i == last:
                    return c_i.mean
                delta = (self.C.succ_item(key)[1].mean - self.C.prev_item(key)[1].mean) / 2.
                return c_i.mean + ((target - seen) / k - 0.5) * delta

            seen += k
        # target was never below the running weight (e.g. p == 100).
        return self.C.max_item()[1].mean

    def quantile(self, q):
        """
        Computes the quantile of a specific value, ie. computes F(q) where F denotes
        the CDF of the distribution.

        With a single centroid the result is ``int(q >= that centroid's key)``;
        when no centroid produces an answer in the loop, 1 is returned.
        """
        seen = 0
        total = float(self.n)

        if len(self) == 1:  # only one centroid
            return int(q >= self.C.min_key())

        last = len(self) - 1
        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            # Half-width of this centroid: towards the successor, except for
            # the last centroid, which looks back at its predecessor.
            if i == last:
                delta = (c_i.mean - self.C.prev_item(key)[1].mean) / 2.
            else:
                delta = (self.C.succ_item(key)[1].mean - c_i.mean) / 2.
            z = max(-1, (q - c_i.mean) / delta)

            if z < 1:
                return seen / total + c_i.count / total * (z + 1) / 2

            seen += c_i.count
        return 1

    def trimmed_mean(self, p1, p2):
        """
        Computes the mean of the distribution between the two percentiles p1 and p2.
        This is a modified algorithm than the one presented in the original t-Digest paper.

        Raises ``ValueError`` unless ``p1 < p2``.
        """
        if not (p1 < p2):
            raise ValueError("p1 must be between 0 and 100 and less than p2.")

        # Percentiles converted to cumulative-weight positions.
        lower = p1 / 100. * self.n
        upper = p2 / 100. * self.n

        weighted_sum = weight = seen = 0
        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            k_i = c_i.count
            reach = seen + k_i

            if lower < reach:
                nu = self.__interpolate(i, key, lower - seen) if seen < lower else 1
                weighted_sum += nu * k_i * c_i.mean
                weight += nu * k_i

            if upper < reach:
                nu = self.__interpolate(i, key, upper - seen)
                weighted_sum -= nu * k_i * c_i.mean
                weight -= nu * k_i
                break

            seen = reach

        return weighted_sum / weight

    def __interpolate(self, i, key, diff):
        """Return ``(diff / count - 0.5) * delta`` for the centroid stored at ``key``.

        ``delta`` is the distance to the single neighbour for the first and
        last centroids, and half the distance between both neighbours otherwise.
        """
        c_i = self.C[key]

        if i == 0:
            delta = self.C.succ_item(key)[1].mean - c_i.mean
        elif i == len(self) - 1:
            delta = c_i.mean - self.C.prev_item(key)[1].mean
        else:
            delta = (self.C.succ_item(key)[1].mean - self.C.prev_item(key)[1].mean) / 2.
        return (diff / c_i.count - 0.5) * delta
Exemplo n.º 9
0
class TDigest(object):
    """Sketch of a distribution kept as weighted centroids in an ``RBTree``.

    Centroids live in ``self.C`` keyed by their mean, and ``self.n`` is the
    total weight passed to :meth:`update`.  ``delta`` scales the per-centroid
    size limit computed by :meth:`_theshold`; a compression pass runs whenever
    the number of centroids exceeds ``K / delta``.

    ``merge_sorting`` selects the order in which :meth:`__add__` replays
    centroids: ``"centroid_order"`` (by mean, descending), ``"centroid_size"``
    (by count, descending) or ``None`` (order yielded by
    ``pyudorandom.items``).  Any other value leaves the centroids in
    concatenation order.
    """

    def __init__(self, delta=0.01, K=25, merge_sorting=None):
        self.C = RBTree()
        self.n = 0
        self.delta = delta
        self.K = K
        self.merge_sorting = merge_sorting

    def __add__(self, other_digest):
        """Return a new digest fed with the centroids of both operands.

        The result inherits this digest's ``delta``, ``K`` and
        ``merge_sorting``, so chained additions keep using the configured
        merge order.
        """
        data = list(chain(self.C.values(), other_digest.C.values()))
        # Propagate merge_sorting; otherwise the result silently falls back
        # to the default ordering on the next addition.
        new_digest = TDigest(self.delta, self.K, self.merge_sorting)

        if self.merge_sorting == "centroid_order":
            data = sorted(data, key=lambda c: c.mean, reverse=True)
        elif self.merge_sorting == "centroid_size":
            data = sorted(data, key=lambda c: c.count, reverse=True)

        # The guard keeps pyudorandom.items() from being called on an empty list.
        if data:
            if self.merge_sorting is None:
                ordered = pyudorandom.items(data)
            else:
                ordered = data
            for c in ordered:
                new_digest.update(c.mean, c.count)

        return new_digest

    def __len__(self):
        """Return the number of centroids currently stored."""
        return len(self.C)

    def __repr__(self):
        return """<T-Digest: n=%d, centroids=%d>""" % (self.n, len(self))

    def _add_centroid(self, centroid):
        """Store ``centroid``, folding it into an existing one with the same mean."""
        if centroid.mean not in self.C:
            self.C.insert(centroid.mean, centroid)
        else:
            self.C[centroid.mean].update(centroid.mean, centroid.count)

    def _compute_centroid_quantile(self, centroid):
        """Return ``(count / 2 + preceding weight) / n`` for ``centroid``.

        The preceding weight is the summed ``count`` of the centroids that
        ``value_slice(-inf, centroid.mean)`` yields; whether the end key is
        included is decided by the tree implementation.
        """
        denom = self.n
        cumulative_sum = sum(
            c_i.count for c_i in self.C.value_slice(-float('Inf'), centroid.mean))
        return (centroid.count / 2. + cumulative_sum) / denom

    def _update_centroid(self, centroid, x, w):
        """Apply ``centroid.update(x, w)`` and re-file the centroid in the tree.

        The centroid is removed under its old mean first and re-added
        afterwards, presumably because ``update`` changes its mean (the key).
        """
        self.C.pop(centroid.mean)
        centroid.update(x, w)
        self._add_centroid(centroid)

    def _find_closest_centroids(self, x):
        """Return a list with the centroid(s) whose key is nearest to ``x``.

        Two centroids are returned only when distinct floor and ceiling keys
        are exactly equidistant from ``x``.  Raises ``KeyError`` if the tree
        offers neither a floor nor a ceiling key.
        """
        try:
            ceil_key = self.C.ceiling_key(x)
        except KeyError:
            # No ceiling key for x: the floor is the only candidate.
            return [self.C[self.C.floor_key(x)]]

        try:
            floor_key = self.C.floor_key(x)
        except KeyError:
            # No floor key for x: reuse the ceiling key already found.
            return [self.C[ceil_key]]

        floor_gap = abs(floor_key - x)
        ceil_gap = abs(ceil_key - x)
        if floor_gap < ceil_gap:
            return [self.C[floor_key]]
        elif floor_gap == ceil_gap and ceil_key != floor_key:
            return [self.C[ceil_key], self.C[floor_key]]
        else:
            return [self.C[ceil_key]]

    def _theshold(self, q):
        """Return the centroid size limit ``4 * n * delta * q * (1 - q)``."""
        return 4 * self.n * self.delta * q * (1 - q)

    def update(self, x, w=1):
        """
        Update the t-digest with value x and weight w.

        The weight is added to ``self.n`` first.  The nearest centroid(s) are
        then tried in random order: a candidate is skipped when absorbing
        ``w`` would push its count above its threshold, otherwise it absorbs
        the weight.  Weight that no candidate absorbed becomes a new centroid.
        Finally the digest is compressed if it holds more than ``K / delta``
        centroids.
        """
        self.n += w

        if len(self) == 0:
            self._add_centroid(Centroid(x, w))
            return

        S = self._find_closest_centroids(x)

        while len(S) != 0 and w > 0:
            # Every candidate is examined at most once, so remove it up front.
            j = choice(range(len(S)))
            c_j = S.pop(j)

            q = self._compute_centroid_quantile(c_j)
            threshold = self._theshold(q)

            # This filters out the centroids that do not satisfy the second
            # part of the definition of S. See original paper by Dunning.
            if c_j.count + w > threshold:
                continue

            delta_w = min(threshold - c_j.count, w)
            self._update_centroid(c_j, x, delta_w)
            w -= delta_w

        if w > 0:
            self._add_centroid(Centroid(x, w))

        if len(self) > self.K / self.delta:
            self.compress()

        return

    def batch_update(self, values, w=1):
        """
        Update the t-digest with an iterable of values, then compress.

        This assumes all points have the same weight ``w``.
        """
        for x in values:
            self.update(x, w)
        self.compress()
        return

    def compress(self):
        """Rebuild the centroid tree by replaying all centroids into a scratch digest.

        Only ``self.C`` is replaced; ``self.n`` is left untouched.
        """
        T = TDigest(self.delta, self.K)
        C = list(self.C.values())
        for c_i in pyudorandom.items(C):
            T.update(c_i.mean, c_i.count)
        self.C = T.C

    def percentile(self, p):
        """
        Computes the percentile of a specific value in [0,100].

        Raises ``ValueError`` when ``p`` lies outside ``[0, 100]``.  The first
        and last centroids answer with their mean; interior centroids are
        interpolated using half the distance between their neighbours' means.
        """
        if not (0 <= p <= 100):
            raise ValueError("p must be between 0 and 100, inclusive.")

        t = 0
        p = float(p) / 100.
        p *= self.n
        last = len(self) - 1

        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            k = c_i.count
            if p < t + k:
                if i == 0 or i == last:
                    return c_i.mean
                delta = (self.C.succ_item(key)[1].mean - self.C.prev_item(key)[1].mean) / 2.
                return c_i.mean + ((p - t) / k - 0.5) * delta

            t += k
        # p was never below the running weight (e.g. p == 100).
        return self.C.max_item()[1].mean

    def quantile(self, q):
        """
        Computes the quantile of a specific value, ie. computes F(q) where F denotes
        the CDF of the distribution.

        With a single centroid the result is ``int(q >= that centroid's key)``;
        when no centroid produces an answer in the loop, 1 is returned.
        """
        t = 0
        N = float(self.n)

        if len(self) == 1:  # only one centroid
            return int(q >= self.C.min_key())

        last = len(self) - 1
        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            # Half-width of this centroid: towards the successor, except for
            # the last centroid, which looks back at its predecessor.
            if i == last:
                delta = (c_i.mean - self.C.prev_item(key)[1].mean) / 2.
            else:
                delta = (self.C.succ_item(key)[1].mean - c_i.mean) / 2.
            z = max(-1, (q - c_i.mean) / delta)

            if z < 1:
                return t / N + c_i.count / N * (z + 1) / 2

            t += c_i.count
        return 1

    def trimmed_mean(self, p1, p2):
        """
        Computes the mean of the distribution between the two percentiles p1 and p2.
        This is a modified algorithm than the one presented in the original t-Digest paper.

        Raises ``ValueError`` unless ``p1 < p2``.
        """
        if not (p1 < p2):
            raise ValueError("p1 must be between 0 and 100 and less than p2.")

        s = k = t = 0
        # Percentiles converted to cumulative-weight positions.
        p1 /= 100.
        p2 /= 100.
        p1 *= self.n
        p2 *= self.n
        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            k_i = c_i.count
            if p1 < t + k_i:
                if t < p1:
                    nu = self.__interpolate(i, key, p1 - t)
                else:
                    nu = 1
                s += nu * k_i * c_i.mean
                k += nu * k_i

            if p2 < t + k_i:
                nu = self.__interpolate(i, key, p2 - t)
                s -= nu * k_i * c_i.mean
                k -= nu * k_i
                break

            t += k_i

        return s / k

    def __interpolate(self, i, key, diff):
        """Return ``(diff / count - 0.5) * delta`` for the centroid stored at ``key``.

        ``delta`` is the distance to the single neighbour for the first and
        last centroids, and half the distance between both neighbours otherwise.
        """
        c_i = self.C[key]
        k_i = c_i.count

        if i == 0:
            delta = self.C.succ_item(key)[1].mean - c_i.mean
        elif i == len(self) - 1:
            delta = c_i.mean - self.C.prev_item(key)[1].mean
        else:
            delta = (self.C.succ_item(key)[1].mean - self.C.prev_item(key)[1].mean) / 2.
        return (diff / k_i - 0.5) * delta