# NOTE: the imports below are assumptions about the surrounding module; the
# snippet also references Node, Record and the mixture-component indices
# R_SDD, R_IRM, W_SDD, W_IRM (D components in total), which are defined
# elsewhere in the original project.
import collections
import csv

import numpy as np
from bintrees import FastRBTree, RBTree


class MMCRWPolicy(object):
    """Mixture-model cache policy with separate read and write components.

    Each request is explained by a mixture of *_SDD components (geometric in
    stack depth) and *_IRM components (geometric in re-reference rank); the
    mixing weights (tau) and geometric parameters (theta) are re-estimated
    online with EM.
    """

    def __init__(self, cache_entries_limit, ghost_entries_limit,
                 trace_size_limit, csv_suffix="_mmc.csv"):
        self.full_cache = FastRBTree()
        self.was_hit = None
        self.was_ghost_hit = None
        self.num_hits = 0
        self.num_requests = 0
        self.cache_entries_limit = cache_entries_limit
        self.ghost_entries_limit = ghost_entries_limit
        self.trace_size_limit = trace_size_limit
        self.trace = collections.deque()
        self.stack = RBTree()
        self.ranker = RBTree()
        self.generation = 0
        # During startup, this will act like an LRU.
        self.startup = True
        self.EM_period = 50 * int(np.ceil(np.log(trace_size_limit)))
        self.countdown_to_EM = trace_size_limit // 2
        self.tau = [0.25, 0.25, 0.25, 0.25]
        self.theta = [0.5, 0.5, 0.5, 0.5]
        self.acc_tau = [0.0, 0.0, 0.0, 0.0]
        self.acc_theta = [0.0, 0.0, 0.0, 0.0]
        self.num_in_cache = 0
        self.num_in_full_cache = 0
        self.num_reads = 0
        self.csv_suffix = csv_suffix
        self.ts_order = [
            'row', 'hit', 'ghost_hit',
            'tau_R_SDD', 'tau_R_IRM', 'tau_W_SDD', 'tau_W_IRM',
            'theta_R_SDD', 'theta_R_IRM', 'theta_W_SDD', 'theta_W_IRM',
            'depth', 'rank',
            'Z_R_SDD', 'Z_R_IRM', 'Z_W_SDD', 'Z_W_IRM', 'Z_sum',
        ]
        self.ts_datapoint = {key: None for key in self.ts_order}
        self.ts_datapoint['row'] = 0
        self.ts_file = open("csv/mmc_rw" + self.csv_suffix, "w")
        self.ts_writer = csv.writer(self.ts_file)
        self.ts_writer.writerow(self.ts_order)
        self.evict_order = ['row', 'depth', 'rank', 'value', 'opcode']
        self.evict_datapoint = {key: None for key in self.evict_order}
        self.evict_datapoint['row'] = 0
        self.evict_file = open("csv/mmc_rw_evict" + self.csv_suffix, "w")
        self.evict_writer = csv.writer(self.evict_file)
        self.evict_writer.writerow(self.evict_order)
        self.purge_order = ['row', 'depth', 'rank', 'value', 'opcode']
        self.purge_datapoint = {key: None for key in self.purge_order}
        self.purge_datapoint['row'] = 0
        self.purge_file = open("csv/mmc_rw_purge" + self.csv_suffix, "w")
        self.purge_writer = csv.writer(self.purge_file)
        self.purge_writer.writerow(self.purge_order)

    def request(self, page, opcode):
        self.num_requests += 1
        self.was_hit = False
        self.was_ghost_hit = False
        node = self.get_node(page)
        if node:
            self.was_ghost_hit = True
            if not node.is_evicted:
                self.num_hits += 1
                self.was_hit = True
            Z = self.calculate_Z(node.depth, node.rank, node.opcode)
            node.hit_count += Z[R_IRM] + Z[W_IRM]
        else:
            node = Node(self)
            node.hit_count = self.tau[R_IRM] + self.tau[W_IRM]
            node.page_key = page
            self.full_cache[page] = node
        if not self.was_hit:
            self.num_in_cache += 1
        if not self.was_ghost_hit:
            self.num_in_full_cache += 1
        else:
            if node.opcode == 'r':
                self.num_reads -= 1
        if opcode == 'r':
            self.num_reads += 1
        node.is_evicted = node.is_purged = False
        record = Record(self, node)
        self.add_trace_record(record)
        node.opcode = opcode
        if len(self.trace) > self.trace_size_limit:
            popped_record = self.trace.popleft()
            self.update_tau_and_theta_accs(record, increment=True)
            self.update_tau_and_theta_accs(popped_record, increment=False)
            self.refresh_params()
            popped_record.node.hit_count -= popped_record.Z[R_IRM]
            popped_record.node.hit_count -= popped_record.Z[W_IRM]
        node.restack()
        node.rerank()
        self.countdown_to_EM -= 1
        if self.countdown_to_EM == 0:
            self.EM_algorithm(delta=0.00001)
            self.countdown_to_EM = self.EM_period
            self.startup = False
        if (self.num_in_cache > self.cache_entries_limit or
                self.num_in_full_cache >
                self.cache_entries_limit + self.ghost_entries_limit):
            self.pageout()
        # dump_cache(self, "exp")

    def add_trace_record(self, record):
        self.ts_datapoint['row'] = self.num_requests
        self.ts_datapoint['hit'] = 1 if self.was_hit else 0
        self.ts_datapoint['ghost_hit'] = 1 if self.was_ghost_hit else 0
        self.ts_datapoint['tau_R_SDD'] = self.tau[R_SDD]
        self.ts_datapoint['tau_R_IRM'] = self.tau[R_IRM]
        self.ts_datapoint['tau_W_SDD'] = self.tau[W_SDD]
        self.ts_datapoint['tau_W_IRM'] = self.tau[W_IRM]
        self.ts_datapoint['theta_R_SDD'] = self.theta[R_SDD]
        self.ts_datapoint['theta_R_IRM'] = self.theta[R_IRM]
        self.ts_datapoint['theta_W_SDD'] = self.theta[W_SDD]
        self.ts_datapoint['theta_W_IRM'] = self.theta[W_IRM]
        self.ts_datapoint['Z_R_SDD'] = record.Z[R_SDD]
        self.ts_datapoint['Z_R_IRM'] = record.Z[R_IRM]
        self.ts_datapoint['Z_W_SDD'] = record.Z[W_SDD]
        self.ts_datapoint['Z_W_IRM'] = record.Z[W_IRM]
        self.ts_datapoint['Z_sum'] = sum(record.Z)
        self.ts_datapoint['depth'] = record.depth
        self.ts_datapoint['rank'] = record.node.rank
        self.ts_writer.writerow(
            [self.ts_datapoint[key] for key in self.ts_order])
        self.ts_file.flush()
        self.trace.append(record)

    def pageout(self):
        min_node = None
        min_node_value = None
        min_ghost = None
        min_ghost_value = None
        for depth, node in enumerate(self.stack.values()):
            node.depth_memo = depth
        for rank, node in enumerate(self.ranker.values()):
            node.recompute_expected_value(depth=node.depth_memo, rank=rank)
            value = node.expected_value
            if not node.is_evicted:
                if min_node is None or value < min_node_value:
                    min_node = node
                    min_node_value = value
            if min_ghost is None or value < min_ghost_value:
                min_ghost = node
                min_ghost_value = value
        if self.num_in_cache > self.cache_entries_limit:
            self.evict(min_node)
        if (self.num_in_full_cache >
                self.cache_entries_limit + self.ghost_entries_limit):
            self.purge(min_ghost)

    def EM_algorithm(self, delta):
        def abs_sum():
            return sum(self.tau) + sum(self.theta)

        before = delta + 4.0
        i = 0
        # We need to detect if we're in a "nonsense" local optimum. The
        # algorithm will optimize to the global maximum if we aren't in one
        # of these cases.
        if (self.startup or min(self.tau) < 0.00001
                or min(self.theta) < 0.00001):
            use_hard_Z = True
        else:
            use_hard_Z = False
        while abs(before - abs_sum()) > delta:
            before = abs_sum()
            hard_Z = ([0.25, 0.25, 0.25, 0.25]
                      if use_hard_Z and i == 0 else None)
            self.E_step(hard_Z=hard_Z)
            i += 1
            self.M_step()
            # Since we are rearranging the ranks, it's possible to get into
            # a situation where the ranks shift in a cycle such that the tau
            # delta is always exceeded. I've only seen this limit hit when
            # the trace size is very small (e.g. 10).
            if i > 50:
                break

    def E_step(self, hard_Z=None):
        """Treat self.tau and self.theta as constants."""
        for node in self.full_cache.values():
            node._hit_count = 0.0
        for record in self.trace:
            if hard_Z is None:
                if record.node.is_purged:
                    rank = record.node.rank_purge_memo
                else:
                    rank = record.node.rank
                record._Z = self.calculate_Z(record.depth, rank,
                                             record.opcode)
            else:
                record._Z = hard_Z
            record.node._hit_count += record._Z[R_IRM] + record._Z[W_IRM]
        new_ranker = RBTree()
        for node in self.full_cache.values():
            node.ranker_key = node.new_ranker_key()
            new_ranker[node.ranker_key] = node
        self.ranker = new_ranker

    def M_step(self):
        """Treat Record.Z as constant."""
        self.acc_tau = [0.0 for d in range(D)]
        self.acc_theta = [0.0 for d in range(D)]
        for record in self.trace:
            self.update_tau_and_theta_accs(record, increment=True)
        self.refresh_params()

    def calculate_Z(self, depth, rank, opcode):
        # SDD components are evaluated at the stack depth, IRM components at
        # the re-reference rank.
        H = [depth, rank, depth, rank]

        def num_on_hit(i):
            return self.tau[i] * self.theta[i] * (1 - self.theta[i]) ** H[i]

        def den_on_hit(i, j):
            acc = 0.0
            for x in [i, j]:
                acc += num_on_hit(x)
            return acc

        if opcode is None:
            num = [num_on_hit(i) for i in range(D)]
            den = sum(num)
            return [n / den for n in num]
        elif opcode == 'r':
            num = [num_on_hit(R_SDD), num_on_hit(R_IRM)]
            den = den_on_hit(R_SDD, R_IRM)
            try:
                return [num[0] / den, num[1] / den, 0.0, 0.0]
            except ZeroDivisionError:
                return [0.5, 0.5, 0.0, 0.0]
        elif opcode == 'w':
            num = [num_on_hit(W_SDD), num_on_hit(W_IRM)]
            den = den_on_hit(W_SDD, W_IRM)
            try:
                return [0.0, 0.0, num[0] / den, num[1] / den]
            except ZeroDivisionError:
                return [0.0, 0.0, 0.5, 0.5]

    def refresh_params(self):
        R = len(self.trace)
        self.tau = [self.acc_tau[d] / R for d in range(D)]
        self.theta = [0.0, 0.0, 0.0, 0.0]
        for d in range(D):
            try:
                self.theta[d] = (R * self.tau[d] /
                                 (R * self.tau[d] + self.acc_theta[d]))
            except ZeroDivisionError:
                pass

    def _update_tau_and_theta_accs(self, Z, depth, rank, increment=True):
        H = [depth, rank, depth, rank]
        if increment:
            self.acc_tau = [self.acc_tau[d] + Z[d] for d in range(D)]
            self.acc_theta = [self.acc_theta[d] + Z[d] * H[d]
                              for d in range(D)]
        else:
            self.acc_tau = [self.acc_tau[d] - Z[d] for d in range(D)]
            self.acc_theta = [max(0.0, self.acc_theta[d] - Z[d] * H[d])
                              for d in range(D)]

    def update_tau_and_theta_accs(self, record, increment=True):
        if record.node.is_purged:
            rank = record.node.rank_purge_memo
        else:
            rank = record.node.rank
        self._update_tau_and_theta_accs(record.Z, record.depth, rank,
                                        increment)

    def evict(self, node):
        self.evict_datapoint['row'] += 1
        self.evict_datapoint['depth'] = node.depth
        self.evict_datapoint['rank'] = node.rank
        self.evict_datapoint['value'] = node.expected_value
        self.evict_datapoint['opcode'] = node.opcode
        self.evict_writer.writerow(
            [self.evict_datapoint[key] for key in self.evict_order])
        self.evict_file.flush()
        self.num_in_cache -= 1
        node.is_evicted = True

    def purge(self, node):
        self.purge_datapoint['row'] += 1
        self.purge_datapoint['depth'] = node.depth
        self.purge_datapoint['rank'] = node.rank
        self.purge_datapoint['value'] = node.expected_value
        self.purge_datapoint['opcode'] = node.opcode
        self.purge_writer.writerow(
            [self.purge_datapoint[key] for key in self.purge_order])
        self.purge_file.flush()
        self.num_in_full_cache -= 1
        if node.opcode == 'r':
            self.num_reads -= 1
        node.purge()

    @property
    def cache_list(self):
        return filter(lambda node: not node.is_evicted, self.full_cache_list)

    @property
    def full_cache_list(self):
        return list(self.full_cache.values())

    def hit_rate(self):
        return float(self.num_hits) / self.num_requests

    def get_node(self, page):
        try:
            return self.full_cache[page]
        except KeyError:
            return None
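# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the per-request
# responsibility (Z) computation performed by MMCRWPolicy.calculate_Z,
# written as a standalone function over plain lists.  The component order
# [R_SDD, R_IRM, W_SDD, W_IRM] and the example inputs are assumptions made
# only for this demo; as in the class above, the *_SDD components are
# geometric in stack depth and the *_IRM components in re-reference rank.
def mixture_responsibilities(tau, theta, depth, rank, opcode):
    """Return normalised component weights for one (depth, rank, opcode)."""
    H = [depth, rank, depth, rank]  # SDD components use depth, IRM use rank.
    w = [tau[i] * theta[i] * (1 - theta[i]) ** H[i] for i in range(4)]
    if opcode == 'r':               # Reads load only the R components.
        w[2] = w[3] = 0.0
    elif opcode == 'w':             # Writes load only the W components.
        w[0] = w[1] = 0.0
    total = sum(w)
    # Mirror calculate_Z's fallback when the denominator underflows to zero.
    if total == 0.0 and opcode == 'r':
        return [0.5, 0.5, 0.0, 0.0]
    if total == 0.0 and opcode == 'w':
        return [0.0, 0.0, 0.5, 0.5]
    return [wi / total for wi in w]


if __name__ == "__main__":
    tau = [0.25, 0.25, 0.25, 0.25]   # the initial values used by MMCRWPolicy
    theta = [0.5, 0.5, 0.5, 0.5]
    # A shallow read (small depth, large rank) is attributed mostly to R_SDD;
    # a write with a good re-reference rank leans towards W_IRM.
    print(mixture_responsibilities(tau, theta, depth=1, rank=8, opcode='r'))
    print(mixture_responsibilities(tau, theta, depth=9, rank=2, opcode='w'))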
# NOTE: the imports below are assumptions about the surrounding module; the
# snippet also relies on a Centroid class (with mean, n and cumn attributes)
# defined elsewhere, and on Python 2 builtins (basestring, xrange).
from random import random

from bintrees import RBTree


class Tdigest(object):
    def __init__(self, delta=0.01, K=25, CX=1.1):
        self.delta = delta
        self.K = K
        self.CX = CX
        self.centroids = RBTree()
        self.nreset = 0
        self.reset()

    def reset(self):
        self.centroids.clear()
        self.n = 0
        self.nreset += 1
        self.last_cumulate = 0
        self.compressing = False

    def push(self, x, n=1):
        if not isinstance(x, list):
            x = [x]
        for item in x:
            self._digest(item, n)

    def percentile(self, p):
        if self.size() == 0:
            return None
        self._cumulate(True)
        cumn = self.n * p
        lower = self.centroids.min_item()[1]
        upper = self.centroids.max_item()[1]
        for c in self.centroids.values():
            if c.cumn <= cumn:
                lower = c
            else:
                upper = c
                break
        if lower == upper:
            return lower.mean
        return lower.mean + (cumn - lower.cumn) * (upper.mean - lower.mean) \
            / (upper.cumn - lower.cumn)

    def serialize(self):
        result = '%s~%s~%s~' % (self.delta, self.K, self.size())
        if self.size() == 0:
            return result
        self._cumulate(True)
        means = []
        counts = []
        for c in self.centroids.values():
            means.append(str(c.mean))
            counts.append(str(c.n))
        return '%s%s~%s' % (result, '~'.join(means), '~'.join(counts))

    @classmethod
    def deserialize(cls, serialized_str):
        if not isinstance(serialized_str, basestring):
            raise Exception(u'serialized_str must be str')
        data = serialized_str.split('~')
        t = Tdigest(delta=float(data[0]), K=int(data[1]))
        size = int(data[2])
        for i in xrange(size):
            t.push(float(data[i + 3]), int(data[size + i + 3]))
        t._cumulate(True)
        return t

    def _digest(self, x, n):
        if self.size() == 0:
            self._new_centroid(x, n, 0)
        else:
            _min = self.centroids.min_item()[1]
            _max = self.centroids.max_item()[1]
            nearest = self.find_nearest(x)
            if nearest and nearest.mean == x:
                self._addweight(nearest, x, n)
            elif nearest == _min:
                self._new_centroid(x, n, 0)
            elif nearest == _max:
                self._new_centroid(x, n, self.n)
            else:
                p = (nearest.cumn + nearest.n / 2.0) / self.n
                max_n = int(4 * self.n * self.delta * p * (1 - p))
                if max_n >= nearest.n + n:
                    self._addweight(nearest, x, n)
                else:
                    self._new_centroid(x, n, nearest.cumn)
        self._cumulate(False)
        if self.K and self.size() > self.K / self.delta:
            self.compress()

    def find_nearest(self, x):
        if self.size() == 0:
            return None
        try:
            lower = self.centroids.ceiling_item(x)[1]
        except KeyError:
            lower = None
        if lower and lower.mean == x:
            return lower
        try:
            prev = self.centroids.floor_item(x)[1]
        except KeyError:
            prev = None
        if not lower:
            return prev
        if not prev:
            return lower
        if abs(prev.mean - x) < abs(lower.mean - x):
            return prev
        else:
            return lower

    def size(self):
        return len(self.centroids)

    def compress(self):
        if self.compressing:
            return
        points = self.toList()
        self.reset()
        self.compressing = True
        for point in sorted(points, key=lambda x: random()):
            self.push(point['mean'], point['n'])
        self._cumulate(True)
        self.compressing = False

    def _cumulate(self, exact):
        if self.n == self.last_cumulate:
            return
        if not exact and self.CX and self.last_cumulate and \
                self.CX > (self.n / self.last_cumulate):
            return
        cumn = 0
        for c in self.centroids.values():
            cumn = c.cumn = cumn + c.n
        self.n = self.last_cumulate = cumn

    def toList(self):
        return [dict(mean=c.mean, n=c.n, cumn=c.cumn)
                for c in self.centroids.values()]

    def _addweight(self, nearest, x, n):
        if x != nearest.mean:
            nearest.mean += n * (x - nearest.mean) / (nearest.n + n)
        nearest.cumn += n
        nearest.n += n
        self.n += n

    def _new_centroid(self, x, n, cumn):
        c = Centroid(x, n, cumn)
        self.centroids.insert(x, c)
        self.n += n
        return c
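# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the centroid size
# bound used by Tdigest._digest above.  A centroid sitting at quantile p may
# hold at most about 4 * n * delta * p * (1 - p) points, so centroids near
# the median can grow large while those in the tails stay small, which is
# what keeps extreme percentiles finely resolved.  The n and delta values
# below are arbitrary demo inputs.
def max_centroid_weight(n, delta, p):
    return int(4 * n * delta * p * (1 - p))


if __name__ == "__main__":
    n, delta = 100000, 0.01
    for p in (0.001, 0.01, 0.25, 0.5, 0.75, 0.99, 0.999):
        print("q=%.3f  max weight=%d" % (p, max_centroid_weight(n, delta, p)))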
# NOTE: the imports below are assumptions about the surrounding module; the
# snippet also references Node, Record and dump_cache, which are defined
# elsewhere in the original project.
import collections
import csv

import numpy as np
from bintrees import FastRBTree, RBTree


class MMCPolicy(object):
    """Two-component mixture-model policy.

    Component 0 is geometric in stack depth with parameter theta[0],
    component 1 is geometric in re-reference rank with parameter theta[1],
    and the mixing weight tau and both thetas are re-estimated online
    with EM.
    """

    def __init__(self, cache_entries_limit, ghost_entries_limit,
                 trace_size_limit, csv_suffix="_mmc.csv", draw_dump=False):
        self.full_cache = FastRBTree()
        self.was_hit = None
        self.was_ghost_hit = None
        self.num_hits = 0
        self.num_requests = 0
        self.cache_entries_limit = cache_entries_limit
        self.ghost_entries_limit = ghost_entries_limit
        self.trace_size_limit = trace_size_limit
        self.trace = collections.deque()
        self.stack = RBTree()
        self.ranker = RBTree()
        self.generation = 0
        # During startup, this will act like an LRU.
        self.startup = True
        self.EM_period = 50 * int(np.ceil(np.log(trace_size_limit)))
        self.countdown_to_EM = trace_size_limit // 2
        self.tau = [0.5, 0.5]
        self.theta = [0.5, 0.5]
        self.acc_tau = [0.0]
        self.acc_theta = [0.0, 0.0]
        self.num_in_cache = 0
        self.num_in_full_cache = 0
        self.csv_suffix = csv_suffix
        self.draw_dump = draw_dump
        self.ts_order = [
            'row', 'hit', 'ghost_hit', 'tau', 'theta0', 'theta1',
            'Z', 'depth', 'rank',
        ]
        self.ts_datapoint = {key: None for key in self.ts_order}
        self.ts_datapoint['row'] = 0
        self.ts_file = open("csv/mmc" + self.csv_suffix, "w")
        self.ts_writer = csv.writer(self.ts_file)
        self.ts_writer.writerow(self.ts_order)
        self.evict_order = ['row', 'depth', 'rank', 'value', 'Z', 'tau']
        self.evict_datapoint = {key: None for key in self.evict_order}
        self.evict_datapoint['row'] = 0
        self.evict_file = open("csv/mmc_evict" + self.csv_suffix, "w")
        self.evict_writer = csv.writer(self.evict_file)
        self.evict_writer.writerow(self.evict_order)
        self.purge_order = ['row', 'depth', 'rank', 'value', 'Z']
        self.purge_datapoint = {key: None for key in self.purge_order}
        self.purge_datapoint['row'] = 0
        self.purge_file = open("csv/mmc_purge" + self.csv_suffix, "w")
        self.purge_writer = csv.writer(self.purge_file)
        self.purge_writer.writerow(self.purge_order)

    def request(self, page):
        self.num_requests += 1
        self.was_hit = False
        self.was_ghost_hit = False
        node = self.get_node(page)
        if node:
            self.was_ghost_hit = True
            if not node.is_evicted:
                self.num_hits += 1
                self.was_hit = True
            node.hit_count += 1.0 - self.calculate_Z(node.depth, node.rank)
        else:
            node = Node(self)
            node.hit_count = self.tau[1]
            node.page_key = page
            self.full_cache[page] = node
        if not self.was_hit:
            self.num_in_cache += 1
        if not self.was_ghost_hit:
            self.num_in_full_cache += 1
        node.is_evicted = node.is_purged = False
        record = Record(self, node)
        self.add_trace_record(record)
        if len(self.trace) > self.trace_size_limit:
            popped_record = self.trace.popleft()
            self.update_tau_and_theta_accs(record, increment=True)
            self.update_tau_and_theta_accs(popped_record, increment=False)
            self.refresh_params()
            popped_record.node.hit_count -= 1.0 - popped_record.Z
        node.restack()
        node.rerank()
        self.countdown_to_EM -= 1
        if self.countdown_to_EM == 0:
            self.EM_algorithm(delta=0.00001)
            self.countdown_to_EM = self.EM_period
            self.startup = False
        if (self.num_in_cache > self.cache_entries_limit or
                self.num_in_full_cache >
                self.cache_entries_limit + self.ghost_entries_limit):
            self.pageout()
        if self.draw_dump:
            dump_cache(self, self.csv_suffix)

    def add_trace_record(self, record):
        self.ts_datapoint['row'] = self.num_requests
        self.ts_datapoint['hit'] = 1 if self.was_hit else 0
        self.ts_datapoint['ghost_hit'] = 1 if self.was_ghost_hit else 0
        self.ts_datapoint['tau'] = self.tau[0]
        self.ts_datapoint['theta0'] = self.theta[0]
        self.ts_datapoint['theta1'] = self.theta[1]
        self.ts_datapoint['depth'] = record.depth
        self.ts_datapoint['rank'] = record.node.rank
        self.ts_datapoint['Z'] = record.Z
        self.ts_writer.writerow(
            [self.ts_datapoint[key] for key in self.ts_order])
        self.ts_file.flush()
        self.trace.append(record)

    def pageout(self):
        min_node = None
        min_node_value = None
        min_ghost = None
        min_ghost_value = None
        for depth, node in enumerate(self.stack.values()):
            node.depth_memo = depth
        for rank, node in enumerate(self.ranker.values()):
            node.recompute_expected_value(depth=node.depth_memo, rank=rank)
            value = node.expected_value
            if not node.is_evicted:
                if min_node is None or value < min_node_value:
                    min_node = node
                    min_node_value = value
            if min_ghost is None or value < min_ghost_value:
                min_ghost = node
                min_ghost_value = value
        if self.num_in_cache > self.cache_entries_limit:
            self.evict(min_node)
        if (self.num_in_full_cache >
                self.cache_entries_limit + self.ghost_entries_limit):
            self.purge(min_ghost)

    def EM_algorithm(self, delta):
        def abs_sum():
            return abs(self.tau[0]) + abs(self.theta[0]) + abs(self.theta[1])

        before = delta + 4.0
        i = 0
        # We need to detect if we're in a "nonsense" local optimum. The
        # algorithm will optimize to the global maximum if we aren't in one
        # of these cases.
        if (self.startup or self.tau[0] == 0.0 or self.tau[0] == 1.0
                or self.theta[0] == 0.0 or self.theta[1] == 0.0):
            use_hard_Z = True
        else:
            use_hard_Z = False
        while abs(before - abs_sum()) > delta:
            before = abs_sum()
            hard_Z = 0.5 if use_hard_Z and i == 0 else None
            self.E_step(hard_Z=hard_Z)
            i += 1
            self.M_step()
            # Since we are rearranging the ranks, it's possible to get into
            # a situation where the ranks shift in a cycle such that the tau
            # delta is always exceeded. I've only seen this limit hit when
            # the trace size is very small (e.g. 10).
            if i > 50:
                break

    def E_step(self, hard_Z=None):
        """Treat self.tau and self.theta as constants."""
        for node in self.full_cache.values():
            node._hit_count = 0.0
        for record in self.trace:
            if hard_Z is None:
                if record.node.is_purged:
                    rank = record.node.rank_purge_memo
                else:
                    rank = record.node.rank
                record._Z = self.calculate_Z(record.depth, rank)
            else:
                record._Z = hard_Z
            record.node._hit_count += (1.0 - record._Z)
        new_ranker = RBTree()
        for node in self.full_cache.values():
            node.ranker_key = node.new_ranker_key()
            new_ranker[node.ranker_key] = node
        self.ranker = new_ranker

    def M_step(self):
        """Treat Record.Z as constant."""
        self.acc_tau = [0.0]
        self.acc_theta = [0.0, 0.0]
        for record in self.trace:
            self.update_tau_and_theta_accs(record, increment=True)
        self.refresh_params()

    def calculate_Z(self, depth, rank):
        numerator = (self.tau[0] * self.theta[0] *
                     (1 - self.theta[0]) ** depth)
        denominator = (numerator + self.tau[1] * self.theta[1] *
                       (1 - self.theta[1]) ** rank)
        try:
            return float(numerator) / denominator
        except ZeroDivisionError:
            # This can happen when a node falls off the trace and rank and
            # depth become greater than the limits.
            return self.tau[0]

    def refresh_params(self):
        R = len(self.trace)
        self.tau[0] = self.acc_tau[0] / R
        self.tau[1] = 1.0 - self.tau[0]
        try:
            self.theta[0] = ((R * self.tau[0]) /
                             (R * self.tau[0] + self.acc_theta[0]))
        except ZeroDivisionError:
            self.theta[0] = 1.0 / len(self.full_cache)
        try:
            self.theta[1] = ((R * self.tau[1]) /
                             (R * self.tau[1] + self.acc_theta[1]))
        except ZeroDivisionError:
            self.theta[1] = 1.0 / len(self.full_cache)

    def _update_tau_and_theta_accs(self, Z, depth, rank, increment=True):
        if increment:
            self.acc_tau[0] += Z
            self.acc_theta[0] += Z * depth
            self.acc_theta[1] += (1.0 - Z) * rank
        else:
            self.acc_tau[0] -= Z
            self.acc_theta[0] -= Z * depth
            self.acc_theta[1] -= (1.0 - Z) * rank
            self.acc_theta = [max(0.0, acc) for acc in self.acc_theta]

    def update_tau_and_theta_accs(self, record, increment=True):
        depth = record.depth
        if record.node.is_purged:
            rank = record.node.rank_purge_memo
        else:
            rank = record.node.rank
        self._update_tau_and_theta_accs(record.Z, depth, rank, increment)

    def evict(self, node):
        self.evict_datapoint['row'] += 1
        self.evict_datapoint['depth'] = node.depth
        self.evict_datapoint['rank'] = node.rank
        self.evict_datapoint['value'] = node.expected_value
        self.evict_datapoint['Z'] = self.calculate_Z(node.depth, node.rank)
        self.evict_datapoint['tau'] = self.tau[0]
        self.evict_writer.writerow(
            [self.evict_datapoint[key] for key in self.evict_order])
        self.evict_file.flush()
        self.num_in_cache -= 1
        node.is_evicted = True

    def purge(self, node):
        self.purge_datapoint['row'] += 1
        self.purge_datapoint['depth'] = node.depth
        self.purge_datapoint['rank'] = node.rank
        self.purge_datapoint['value'] = node.expected_value
        self.purge_datapoint['Z'] = self.calculate_Z(node.depth, node.rank)
        self.purge_writer.writerow(
            [self.purge_datapoint[key] for key in self.purge_order])
        self.purge_file.flush()
        self.num_in_full_cache -= 1
        node.purge()

    @property
    def cache_list(self):
        return filter(lambda node: not node.is_evicted, self.full_cache_list)

    @property
    def full_cache_list(self):
        return list(self.full_cache.values())

    def hit_rate(self):
        return float(self.num_hits) / self.num_requests

    def get_node(self, page):
        try:
            return self.full_cache[page]
        except KeyError:
            return None
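# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): one EM pass of the
# two-component model that MMCPolicy fits online.  Component 0 is geometric
# in stack depth with parameter theta[0], component 1 is geometric in
# re-reference rank with parameter theta[1], and tau[0] is the mixing weight
# of the depth component.  The synthetic (depth, rank) trace below is an
# assumption for the demo; the class derives these values from its stack
# and ranker trees.
def em_pass(trace, tau, theta):
    # E-step: responsibility of the depth component, as in calculate_Z().
    def z(depth, rank):
        num = tau[0] * theta[0] * (1 - theta[0]) ** depth
        den = num + tau[1] * theta[1] * (1 - theta[1]) ** rank
        return num / den if den else tau[0]

    Z = [z(d, r) for d, r in trace]
    # M-step: accumulate and refresh, as in M_step()/refresh_params().
    R = float(len(trace))
    acc_tau = sum(Z)
    acc_theta = [sum(zi * d for zi, (d, _) in zip(Z, trace)),
                 sum((1.0 - zi) * r for zi, (_, r) in zip(Z, trace))]
    tau = [acc_tau / R, 1.0 - acc_tau / R]
    theta = [R * tau[0] / (R * tau[0] + acc_theta[0]),
             R * tau[1] / (R * tau[1] + acc_theta[1])]
    return tau, theta


if __name__ == "__main__":
    # Mostly shallow depths with scattered ranks should push tau[0] up and
    # fit a fairly tight stack-depth component.
    synthetic_trace = [(1, 40), (2, 35), (0, 50), (3, 20), (1, 10),
                       (2, 60), (30, 2), (25, 1), (1, 45), (0, 30)]
    tau, theta = [0.5, 0.5], [0.5, 0.5]
    for _ in range(5):
        tau, theta = em_pass(synthetic_trace, tau, theta)
    print("tau   = [%.3f, %.3f]" % tuple(tau))
    print("theta = [%.3f, %.3f]" % tuple(theta))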
# NOTE: the imports below are assumptions about the surrounding module; the
# snippet also relies on a Centroid class (with mean and count attributes and
# an update(x, w) method) defined elsewhere.
from random import choice, shuffle

from bintrees import RBTree


class TDigest(object):
    def __init__(self, delta=0.01, K=25):
        self.C = RBTree()
        self.n = 0
        self.delta = delta
        self.K = K

    def __add__(self, other_digest):
        C1 = list(self.C.values())
        C2 = list(other_digest.C.values())
        shuffle(C1)
        shuffle(C2)
        data = C1 + C2
        new_digest = TDigest(self.delta, self.K)
        for c in data:
            new_digest.update(c.mean, c.count)
        return new_digest

    def __len__(self):
        return len(self.C)

    def __repr__(self):
        return "<T-Digest: n=%d, centroids=%d>" % (self.n, len(self))

    def _add_centroid(self, centroid):
        if centroid.mean not in self.C:
            self.C.insert(centroid.mean, centroid)
        else:
            self.C[centroid.mean].update(centroid.mean, centroid.count)

    def _compute_centroid_quantile(self, centroid):
        denom = self.n
        cumulative_sum = sum(
            c_i.count
            for c_i in self.C.value_slice(-float('Inf'), centroid.mean))
        return (centroid.count / 2. + cumulative_sum) / denom

    def _update_centroid(self, centroid, x, w):
        self.C.pop(centroid.mean)
        centroid.update(x, w)
        self._add_centroid(centroid)

    def _find_closest_centroids(self, x):
        try:
            ceil_key = self.C.ceiling_key(x)
        except KeyError:
            floor_key = self.C.floor_key(x)
            return [self.C[floor_key]]
        try:
            floor_key = self.C.floor_key(x)
        except KeyError:
            ceil_key = self.C.ceiling_key(x)
            return [self.C[ceil_key]]
        if abs(floor_key - x) < abs(ceil_key - x):
            return [self.C[floor_key]]
        elif abs(floor_key - x) == abs(ceil_key - x) and ceil_key != floor_key:
            return [self.C[ceil_key], self.C[floor_key]]
        else:
            return [self.C[ceil_key]]

    def _threshold(self, q):
        return 4 * self.n * self.delta * q * (1 - q)

    def update(self, x, w=1):
        """Update the t-digest with value x and weight w."""
        self.n += w
        if len(self) == 0:
            self._add_centroid(Centroid(x, w))
            return
        S = self._find_closest_centroids(x)
        while len(S) != 0 and w > 0:
            j = choice(list(range(len(S))))
            c_j = S[j]
            q = self._compute_centroid_quantile(c_j)
            # This filters out the centroids that do not satisfy the second
            # part of the definition of S. See the original paper by Dunning.
            if c_j.count + w > self._threshold(q):
                S.pop(j)
                continue
            delta_w = min(self._threshold(q) - c_j.count, w)
            self._update_centroid(c_j, x, delta_w)
            w -= delta_w
            S.pop(j)
        if w > 0:
            self._add_centroid(Centroid(x, w))
        if len(self) > self.K / self.delta:
            self.compress()

    def batch_update(self, values, w=1):
        """Update the t-digest with an iterable of values, all of which are
        assumed to have the same weight."""
        for x in values:
            self.update(x, w)
        self.compress()

    def compress(self):
        T = TDigest(self.delta, self.K)
        C = list(self.C.values())
        shuffle(C)
        for c_i in C:
            T.update(c_i.mean, c_i.count)
        self.C = T.C

    def percentile(self, q):
        """Compute the percentile for q in [0, 1], i.e. F^{-1}(q) where
        F^{-1} denotes the inverse CDF of the distribution."""
        if not (0 <= q <= 1):
            raise ValueError("q must be between 0 and 1, inclusive.")
        t = 0
        q *= self.n
        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            k = c_i.count
            if q < t + k:
                if i == 0 or i == len(self) - 1:
                    return c_i.mean
                delta = (self.C.succ_item(key)[1].mean -
                         self.C.prev_item(key)[1].mean) / 2.
                return c_i.mean + ((q - t) / k - 0.5) * delta
            t += k
        return self.C.max_item()[1].mean

    def quantile(self, q):
        """Compute the quantile of a specific value, i.e. F(q) where F
        denotes the CDF of the distribution."""
        t = 0
        N = float(self.n)
        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            if i == len(self) - 1:
                delta = (c_i.mean - self.C.prev_item(key)[1].mean) / 2.
            else:
                delta = (self.C.succ_item(key)[1].mean - c_i.mean) / 2.
            z = max(-1, (q - c_i.mean) / delta)
            if z < 1:
                return t / N + c_i.count / N * (z + 1) / 2
            t += c_i.count
        return 1

    def trimmed_mean(self, q1, q2):
        """Compute the mean of the distribution between the two percentiles
        q1 and q2 (as fractions). This is a modified version of the algorithm
        presented in the original t-digest paper."""
        if not (q1 < q2):
            raise ValueError("q1 must be less than q2.")
        s = k = t = 0
        q1 *= self.n
        q2 *= self.n
        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            k_i = c_i.count
            if q1 < t + k_i:
                if i == 0:
                    delta = self.C.succ_item(key)[1].mean - c_i.mean
                elif i == len(self) - 1:
                    delta = c_i.mean - self.C.prev_item(key)[1].mean
                else:
                    delta = (self.C.succ_item(key)[1].mean -
                             self.C.prev_item(key)[1].mean) / 2.
                nu = ((q1 - t) / k_i - 0.5) * delta
                s += nu * k_i * c_i.mean
                k += nu * k_i
            if q2 < t + k_i:
                return s / k
            t += k_i
        return s / k
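# ----------------------------------------------------------------------------
# Usage sketch (not part of the original module).  The Centroid class this
# TDigest expects is not included in the snippet, so a minimal stand-in with
# the attributes and update() behaviour the digest relies on is defined here
# purely for the demo; it shadows any earlier Centroid definition and is an
# assumption, not the project's real implementation.
class Centroid(object):
    def __init__(self, mean, count):
        self.mean = mean
        self.count = count

    def update(self, x, weight):
        # Fold (x, weight) into this centroid as a running weighted mean.
        self.count += weight
        self.mean += weight * (x - self.mean) / self.count


if __name__ == "__main__":
    from random import gauss
    td = TDigest()
    td.batch_update(gauss(0.0, 1.0) for _ in range(5000))
    # This variant's percentile() takes a fraction in [0, 1].
    print("median ~ %.3f" % td.percentile(0.5))
    print("p90    ~ %.3f" % td.percentile(0.9))
    print("F(0)   ~ %.3f" % td.quantile(0.0))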
# NOTE: the imports below are assumptions about the surrounding module.
import collections
import csv
import sys
import time

from bintrees import FastRBTree


class MINPolicy(object):
    """Belady's MIN (clairvoyant) policy: the whole trace is scanned up front
    so each page's next use is known, and the page whose next use lies
    furthest in the future is evicted."""

    def __init__(self, cache_size_limit, trace, csv_suffix=".csv"):
        self.cache_size_limit = cache_size_limit
        self.cache = {}
        self.hits = 0.0
        self.requests = 0.0
        self.ts_order = ['row', 'hit']
        self.ts_datapoint = {key: None for key in self.ts_order}
        self.ts_datapoint['row'] = 0
        self.ts_file = open("csv/min" + csv_suffix, "w")
        self.ts_writer = csv.writer(self.ts_file)
        self.ts_writer.writerow(self.ts_order)
        self.clairvoyance = FastRBTree()
        self.precog = FastRBTree()
        last_time = time.time()
        for i, page_opcode in enumerate(trace):
            if time.time() > last_time + 0.1:
                last_time = time.time()
                print '1', i, '\r',
                sys.stdout.flush()
            page, _ = page_opcode
            try:
                self.precog[page].append(i)
            except KeyError:
                self.precog[page] = collections.deque()
                self.precog[page].append(i)
        # Give every page a distinct "never used again" time past the end of
        # the trace.
        known_max = i
        known_max += 2
        for times in self.precog.values():
            times.append(known_max)
            known_max += 1
        print
        print 'Done loading.'

    def hit_rate(self):
        return self.hits / self.requests

    def request(self, page):
        self.requests += 1
        if page in self.cache:
            was_hit = True
            self.hits += 1
        else:
            was_hit = False
        self.cache[page] = self.precog[page].popleft()
        # This happens on startup.
        if self.cache[page] < self.requests:
            self.cache[page] = self.precog[page].popleft()
        self.clairvoyance[self.cache[page]] = page
        self.ts_datapoint['row'] += 1
        self.ts_datapoint['hit'] = 1 if was_hit else 0
        self.ts_writer.writerow(
            [self.ts_datapoint[key] for key in self.ts_order])
        self.ts_file.flush()
        if len(self.cache) > self.cache_size_limit:
            next_use, page = self.clairvoyance.pop_max()
            del self.cache[page]
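# ----------------------------------------------------------------------------
# Usage sketch (not part of the original module).  MINPolicy needs the whole
# trace up front so it can look up each page's next use; the tiny synthetic
# trace of (page, opcode) pairs below and the existing "csv/" output
# directory are assumptions made only for this demo.
if __name__ == "__main__":
    synthetic_trace = [
        ('a', 'r'), ('b', 'r'), ('c', 'w'), ('a', 'r'), ('d', 'r'),
        ('b', 'w'), ('a', 'r'), ('c', 'r'), ('d', 'w'), ('a', 'r'),
    ]
    policy = MINPolicy(cache_size_limit=2, trace=synthetic_trace)
    for page, _ in synthetic_trace:
        policy.request(page)
    print('MIN hit rate: %.2f' % policy.hit_rate())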
# NOTE: the imports below are assumptions about the surrounding module; the
# snippet also references ProcessMeta, runtime, Session, RemoteCloud,
# NodePool, NodeInfo, Tasklet, retry and full_traceback, which are defined
# elsewhere in the original project.
import itertools
import logging
from datetime import datetime, timedelta

from bintrees import FastRBTree


class Master(object):
    __metaclass__ = ProcessMeta

    def __init__(self, node_timeout):
        self._logger = logging.getLogger(self.__class__.__name__)
        self._nodes = {}
        self._sessions = {}
        self._sessions_by_owner = {}
        self._keepalive_queue = FastRBTree()
        self._priority_queue = FastRBTree()
        self._node_timeout = node_timeout
        self._culling_timer = runtime.greenpool.spawn(self._cull_dead_nodes)

    def get_session(self, name, owner=None, dep_server=None, work_dir=None,
                    worker_count=None, init=None):
        try:
            session = self._sessions[name]
            session.dep_cache.set_dependency_server(dep_server)
            return session
        except KeyError:
            if owner is None:
                raise ValueError("An owner must be provided for new sessions")
            if work_dir is None:
                raise ValueError(
                    "Valid working directory required to create a new session")
            if dep_server is None:
                raise ValueError(
                    "Dependency server must be provided to create a new session")
            session = Session(name, owner, dep_server, worker_count,
                              self._spawn_workers, work_dir, init)
            self._sessions[name] = session
            self._sessions_by_owner.setdefault(owner, {})[name] = session
            return RemoteCloud(name, owner, session.hub, session.created_on,
                               len(session.workers), self)

    def _spawn_workers(self, name, owner, worker_count, init):
        all_nodes = itertools.imap(lambda nd: nd.itervalues(),
                                   self._priority_queue.values())
        all_nodes = itertools.chain.from_iterable(all_nodes)
        node_pool = NodePool(all_nodes, name, self._logger)
        node_pool_size = len(node_pool)
        if worker_count is None:
            worker_count = node_pool_size
        self._logger.info("Creating session %s:%s with %d workers",
                          owner, name, worker_count)
        # We can only ever have as many workers as there are processors in
        # the cluster.
        if worker_count > node_pool_size:
            self._logger.warning(
                "Session %s: requested worker count %d will be capped to %d",
                name, worker_count, node_pool_size)
            worker_count = node_pool_size
        workers = []
        while len(workers) < worker_count and len(node_pool) > 0:
            results = node_pool.spawn_workers(worker_count - len(workers),
                                              init=init)
            for nproc, result in results:
                try:
                    worker_batch = result.get()
                    workers.extend(worker_batch)
                except Exception as ex:
                    self._logger.error(
                        "Session %s: failed to spawn workers on node %s "
                        "due to error:\n%s",
                        name, nproc, full_traceback(ex))
        return workers

    def shutdown_session(self, name):
        session = self._sessions.pop(name)
        owner_sessions = self._sessions_by_owner[session.owner]
        del owner_sessions[name]
        # Carry out the shutdown operation in the background.
        Tasklet.spawn(session.shutdown)

    def node_update(self, node_proc, cpu_count, cpu_usage, ram_total,
                    ram_usage):
        # Remove the node from the queues if it is already registered.
        if node_proc in self._nodes:
            node = self._nodes[node_proc]
            self._dequeue(node)
        else:
            # Create a new node info if it doesn't exist yet.
            node = NodeInfo(node_proc, cpu_count)
            self._nodes[node_proc] = node
        # Update load based on a simple formula of tenancy and resource usage.
        node.update(cpu_usage + ram_usage, cpu_usage, ram_total, ram_usage)
        self._logger.debug("Received ping %s", node)
        # Enqueue the node again.
        self._enqueue(node)

    def node_info(self):
        return self._nodes.values()

    def shutdown(self):
        """Initiate cluster-wide shutdown."""
        self._logger.warn("Shutting down cluster")
        self._culling_timer.kill()
        for node in self._nodes.values():
            self._logger.info("Shutting down node %s", node.proc)
            retry(lambda: node.proc.shutdown(), logger=self._logger)

    def _cull_dead_nodes(self):
        """Remove nodes that have stopped responding so that they cannot be
        included in new sessions."""
        while True:
            dead_nodes = list(
                self._keepalive_queue[:datetime.now() -
                                      timedelta(seconds=self._node_timeout)]
                .values())
            dead_node_count = len(dead_nodes)
            if dead_node_count > 0:
                self._logger.info(
                    "Culling %d nodes that are no longer responding.",
                    dead_node_count)
                for node_dicts in dead_nodes:
                    for node in node_dicts.values():
                        self._logger.info("Deleting dead node %s", node.proc)
                        self._delete_node(node)
            else:
                self._logger.info("No dead nodes.")
            runtime.sleep(self._node_timeout)

    def _delete_node(self, node):
        del self._nodes[node.proc]
        self._dequeue(node)

    def _enqueue(self, node):
        self._add_to_queue(self._keepalive_queue, node, node.last_ping)
        self._add_to_queue(self._priority_queue, node, node.load)

    def _dequeue(self, node):
        self._delete_from_queue(self._keepalive_queue, node.proc,
                                node.last_ping)
        self._delete_from_queue(self._priority_queue, node.proc, node.load)

    @staticmethod
    def _add_to_queue(queue, node, key):
        queue.setdefault(key, {})[node.proc] = node

    @staticmethod
    def _delete_from_queue(queue, node_id, key):
        kq_nodes = queue.get(key)
        del kq_nodes[node_id]
        if not len(kq_nodes):
            queue.discard(key)
# A second variant of the TDigest class above: centroids are re-fed in
# pseudo-random order via pyudorandom instead of random.shuffle, and
# percentile()/trimmed_mean() take percentiles in [0, 100] rather than
# fractions in [0, 1].
# NOTE: the imports below are assumptions about the surrounding module; a
# Centroid class is again assumed to be defined elsewhere.
from itertools import chain
from random import choice

import pyudorandom
from bintrees import RBTree


class TDigest(object):
    def __init__(self, delta=0.01, K=25):
        self.C = RBTree()
        self.n = 0
        self.delta = delta
        self.K = K

    def __add__(self, other_digest):
        data = list(chain(self.C.values(), other_digest.C.values()))
        new_digest = TDigest(self.delta, self.K)
        if len(data) > 0:
            for c in pyudorandom.items(data):
                new_digest.update(c.mean, c.count)
        return new_digest

    def __len__(self):
        return len(self.C)

    def __repr__(self):
        return "<T-Digest: n=%d, centroids=%d>" % (self.n, len(self))

    def _add_centroid(self, centroid):
        if centroid.mean not in self.C:
            self.C.insert(centroid.mean, centroid)
        else:
            self.C[centroid.mean].update(centroid.mean, centroid.count)

    def _compute_centroid_quantile(self, centroid):
        denom = self.n
        cumulative_sum = sum(
            c_i.count
            for c_i in self.C.value_slice(-float('Inf'), centroid.mean))
        return (centroid.count / 2. + cumulative_sum) / denom

    def _update_centroid(self, centroid, x, w):
        self.C.pop(centroid.mean)
        centroid.update(x, w)
        self._add_centroid(centroid)

    def _find_closest_centroids(self, x):
        try:
            ceil_key = self.C.ceiling_key(x)
        except KeyError:
            floor_key = self.C.floor_key(x)
            return [self.C[floor_key]]
        try:
            floor_key = self.C.floor_key(x)
        except KeyError:
            ceil_key = self.C.ceiling_key(x)
            return [self.C[ceil_key]]
        if abs(floor_key - x) < abs(ceil_key - x):
            return [self.C[floor_key]]
        elif abs(floor_key - x) == abs(ceil_key - x) and ceil_key != floor_key:
            return [self.C[ceil_key], self.C[floor_key]]
        else:
            return [self.C[ceil_key]]

    def _threshold(self, q):
        return 4 * self.n * self.delta * q * (1 - q)

    def update(self, x, w=1):
        """Update the t-digest with value x and weight w."""
        self.n += w
        if len(self) == 0:
            self._add_centroid(Centroid(x, w))
            return
        S = self._find_closest_centroids(x)
        while len(S) != 0 and w > 0:
            j = choice(list(range(len(S))))
            c_j = S[j]
            q = self._compute_centroid_quantile(c_j)
            # This filters out the centroids that do not satisfy the second
            # part of the definition of S. See the original paper by Dunning.
            if c_j.count + w > self._threshold(q):
                S.pop(j)
                continue
            delta_w = min(self._threshold(q) - c_j.count, w)
            self._update_centroid(c_j, x, delta_w)
            w -= delta_w
            S.pop(j)
        if w > 0:
            self._add_centroid(Centroid(x, w))
        if len(self) > self.K / self.delta:
            self.compress()

    def batch_update(self, values, w=1):
        """Update the t-digest with an iterable of values, all of which are
        assumed to have the same weight."""
        for x in values:
            self.update(x, w)
        self.compress()

    def compress(self):
        T = TDigest(self.delta, self.K)
        C = list(self.C.values())
        for c_i in pyudorandom.items(C):
            T.update(c_i.mean, c_i.count)
        self.C = T.C

    def percentile(self, p):
        """Compute the percentile for p in [0, 100]."""
        if not (0 <= p <= 100):
            raise ValueError("p must be between 0 and 100, inclusive.")
        t = 0
        p = float(p) / 100.
        p *= self.n
        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            k = c_i.count
            if p < t + k:
                if i == 0 or i == len(self) - 1:
                    return c_i.mean
                delta = (self.C.succ_item(key)[1].mean -
                         self.C.prev_item(key)[1].mean) / 2.
                return c_i.mean + ((p - t) / k - 0.5) * delta
            t += k
        return self.C.max_item()[1].mean

    def quantile(self, q):
        """Compute the quantile of a specific value, i.e. F(q) where F
        denotes the CDF of the distribution."""
        t = 0
        N = float(self.n)
        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            if i == len(self) - 1:
                delta = (c_i.mean - self.C.prev_item(key)[1].mean) / 2.
            else:
                delta = (self.C.succ_item(key)[1].mean - c_i.mean) / 2.
            z = max(-1, (q - c_i.mean) / delta)
            if z < 1:
                return t / N + c_i.count / N * (z + 1) / 2
            t += c_i.count
        return 1

    def trimmed_mean(self, p1, p2):
        """Compute the mean of the distribution between the two percentiles
        p1 and p2 (in [0, 100]). This is a modified version of the algorithm
        presented in the original t-digest paper."""
        if not (p1 < p2):
            raise ValueError("p1 must be between 0 and 100 and less than p2.")
        s = k = t = 0
        p1 /= 100.
        p2 /= 100.
        p1 *= self.n
        p2 *= self.n
        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            k_i = c_i.count
            if p1 < t + k_i:
                if i == 0:
                    delta = self.C.succ_item(key)[1].mean - c_i.mean
                elif i == len(self) - 1:
                    delta = c_i.mean - self.C.prev_item(key)[1].mean
                else:
                    delta = (self.C.succ_item(key)[1].mean -
                             self.C.prev_item(key)[1].mean) / 2.
                nu = ((p1 - t) / k_i - 0.5) * delta
                s += nu * k_i * c_i.mean
                k += nu * k_i
            if p2 < t + k_i:
                return s / k
            t += k_i
        return s / k