Пример #1
0
class MMCRWPolicy(object):
    """Mixture-model cache replacement policy, read/write aware.

    Requests are modeled as a four-component mixture: read/write crossed
    with a stack-depth model (SDD) and an independent-reference model
    (IRM), indexed by the module constants R_SDD, R_IRM, W_SDD, W_IRM.
    The mixture weights ``tau`` and geometric parameters ``theta`` are
    re-fit periodically by an EM algorithm over a sliding trace of the
    most recent requests.  Evicted pages are retained as "ghost" entries
    so their history keeps informing the model until purged.  Diagnostic
    time series are streamed to CSV files under ``csv/``.
    """

    def __init__(self,
                 cache_entries_limit,
                 ghost_entries_limit,
                 trace_size_limit,
                 csv_suffix="_mmc.csv"):
        # All known pages (resident + ghost), keyed by page id.
        self.full_cache = FastRBTree()
        self.was_hit = None
        self.was_ghost_hit = None
        self.num_hits = 0
        self.num_requests = 0
        self.cache_entries_limit = cache_entries_limit
        self.ghost_entries_limit = ghost_entries_limit
        self.trace_size_limit = trace_size_limit
        # Sliding window of Record objects driving the EM fit.
        self.trace = collections.deque()
        self.stack = RBTree()   # LRU stack; iteration order gives depth.
        self.ranker = RBTree()  # nodes ordered by estimated IRM popularity.
        self.generation = 0
        # During startup, this will act like an LRU.
        self.startup = True
        self.EM_period = 50 * int(np.ceil(np.log(trace_size_limit)))
        self.countdown_to_EM = trace_size_limit // 2
        # Mixture weights and geometric decay parameters, one per
        # component [R_SDD, R_IRM, W_SDD, W_IRM].
        self.tau = [0.25, 0.25, 0.25, 0.25]
        self.theta = [0.5, 0.5, 0.5, 0.5]
        # Running sufficient statistics for the incremental M-step.
        self.acc_tau = [0.0, 0.0, 0.0, 0.0]
        self.acc_theta = [0.0, 0.0, 0.0, 0.0]
        self.num_in_cache = 0
        self.num_in_full_cache = 0
        self.num_reads = 0
        self.csv_suffix = csv_suffix

        # Per-request time-series CSV.
        self.ts_order = [
            'row', 'hit', 'ghost_hit', 'tau_R_SDD', 'tau_R_IRM', 'tau_W_SDD',
            'tau_W_IRM', 'theta_R_SDD', 'theta_R_IRM', 'theta_W_SDD',
            'theta_W_IRM', 'depth', 'rank', 'Z_R_SDD', 'Z_R_IRM', 'Z_W_SDD',
            'Z_W_IRM', 'Z_sum'
        ]
        self.ts_datapoint = {key: None for key in self.ts_order}
        self.ts_datapoint['row'] = 0
        self.ts_file = open("csv/mmc_rw" + self.csv_suffix, "w")
        self.ts_writer = csv.writer(self.ts_file)
        self.ts_writer.writerow(self.ts_order)

        # Per-eviction CSV.
        self.evict_order = ['row', 'depth', 'rank', 'value', 'opcode']
        self.evict_datapoint = {key: None for key in self.evict_order}
        self.evict_datapoint['row'] = 0
        self.evict_file = open("csv/mmc_rw_evict" + self.csv_suffix, "w")
        self.evict_writer = csv.writer(self.evict_file)
        self.evict_writer.writerow(self.evict_order)

        # Per-purge (ghost removal) CSV.
        self.purge_order = ['row', 'depth', 'rank', 'value', 'opcode']
        self.purge_datapoint = {key: None for key in self.purge_order}
        self.purge_datapoint['row'] = 0
        self.purge_file = open("csv/mmc_rw_purge" + self.csv_suffix, "w")
        self.purge_writer = csv.writer(self.purge_file)
        self.purge_writer.writerow(self.purge_order)

    def request(self, page, opcode):
        """Process one request for *page* with opcode 'r' or 'w'.

        Updates hit statistics, the sliding trace, the model parameters
        (incrementally, and periodically via full EM), and triggers
        eviction/purging when capacity limits are exceeded.
        """
        self.num_requests += 1
        self.was_hit = False
        self.was_ghost_hit = False
        node = self.get_node(page)
        if node:
            self.was_ghost_hit = True
            if not node.is_evicted:
                self.num_hits += 1
                self.was_hit = True
            # Credit the IRM components' responsibility as a fractional hit.
            Z = self.calculate_Z(node.depth, node.rank, node.opcode)
            node.hit_count += Z[R_IRM] + Z[W_IRM]
        else:
            node = Node(self)
            # Prior expectation for a never-seen page.
            node.hit_count = self.tau[R_IRM] + self.tau[W_IRM]
            node.page_key = page
            self.full_cache[page] = node

        if not self.was_hit:
            self.num_in_cache += 1

        if not self.was_ghost_hit:
            self.num_in_full_cache += 1
        else:
            # The node's opcode is about to be overwritten; retire its old
            # contribution to the read count.
            if node.opcode == 'r':
                self.num_reads -= 1

        if opcode == 'r':
            self.num_reads += 1

        node.is_evicted = node.is_purged = False
        record = Record(self, node)
        self.add_trace_record(record)
        node.opcode = opcode

        if len(self.trace) > self.trace_size_limit:
            # Slide the window: add the new record's statistics, remove the
            # oldest record's, then refresh tau/theta from the accumulators.
            popped_record = self.trace.popleft()
            self.update_tau_and_theta_accs(record, increment=True)
            self.update_tau_and_theta_accs(popped_record, increment=False)
            self.refresh_params()
            popped_record.node.hit_count -= popped_record.Z[R_IRM]
            popped_record.node.hit_count -= popped_record.Z[W_IRM]

        node.restack()
        node.rerank()

        self.countdown_to_EM -= 1
        if self.countdown_to_EM == 0:
            self.EM_algorithm(delta=0.00001)
            self.countdown_to_EM = self.EM_period
            self.startup = False

        if (self.num_in_cache > self.cache_entries_limit
                or self.num_in_full_cache >
                self.cache_entries_limit + self.ghost_entries_limit):
            self.pageout()
        #dump_cache(self, "exp")

    def add_trace_record(self, record):
        """Append *record* to the trace and log one time-series CSV row."""
        self.ts_datapoint['row'] = self.num_requests
        if self.was_hit:
            self.ts_datapoint['hit'] = 1
        else:
            self.ts_datapoint['hit'] = 0

        if self.was_ghost_hit:
            self.ts_datapoint['ghost_hit'] = 1
        else:
            self.ts_datapoint['ghost_hit'] = 0

        self.ts_datapoint['tau_R_SDD'] = self.tau[R_SDD]
        self.ts_datapoint['tau_R_IRM'] = self.tau[R_IRM]
        self.ts_datapoint['tau_W_SDD'] = self.tau[W_SDD]
        self.ts_datapoint['tau_W_IRM'] = self.tau[W_IRM]

        self.ts_datapoint['theta_R_SDD'] = self.theta[R_SDD]
        self.ts_datapoint['theta_R_IRM'] = self.theta[R_IRM]
        self.ts_datapoint['theta_W_SDD'] = self.theta[W_SDD]
        self.ts_datapoint['theta_W_IRM'] = self.theta[W_IRM]

        self.ts_datapoint['Z_R_SDD'] = record.Z[R_SDD]
        self.ts_datapoint['Z_R_IRM'] = record.Z[R_IRM]
        self.ts_datapoint['Z_W_SDD'] = record.Z[W_SDD]
        self.ts_datapoint['Z_W_IRM'] = record.Z[W_IRM]

        self.ts_datapoint['Z_sum'] = sum(record.Z)

        self.ts_datapoint['depth'] = record.depth
        self.ts_datapoint['rank'] = record.node.rank

        self.ts_writer.writerow(
            [self.ts_datapoint[key] for key in self.ts_order])
        self.ts_file.flush()
        self.trace.append(record)

    def pageout(self):
        """Evict and/or purge the node(s) with the lowest expected value."""
        min_node = None
        min_node_value = None
        min_ghost = None
        min_ghost_value = None

        # Memoize each node's current depth (LRU stack position).
        for depth, node in enumerate(self.stack.values()):
            node.depth_memo = depth

        # Scan in rank order, tracking the cheapest resident node (eviction
        # candidate) and the cheapest node overall (purge candidate).
        for rank, node in enumerate(self.ranker.values()):
            node.recompute_expected_value(depth=node.depth_memo, rank=rank)
            value = node.expected_value
            if not node.is_evicted:
                if min_node is None or value < min_node_value:
                    min_node = node
                    min_node_value = value
            if min_ghost is None or value < min_ghost_value:
                min_ghost = node
                min_ghost_value = value

        if self.num_in_cache > self.cache_entries_limit:
            self.evict(min_node)

        if (self.num_in_full_cache >
                self.cache_entries_limit + self.ghost_entries_limit):
            self.purge(min_ghost)

    def EM_algorithm(self, delta):
        """Run EM until the parameter sum changes by less than *delta*."""
        def abs_sum():
            return sum(self.tau) + sum(self.theta)

        before = delta + 4.0
        i = 0
        # We need to detect if we're in a "nonsense" local optimum. The
        # algorithm will optimize to the global maximum if we aren't in one of
        # these cases.
        if (self.startup or min(self.tau) < 0.00001
                or min(self.theta) < 0.00001):
            use_hard_Z = True
        else:
            use_hard_Z = False

        while abs(before - abs_sum()) > delta:
            before = abs_sum()
            # On the first iteration of a "rescue" run, force uniform
            # responsibilities to escape the degenerate optimum.
            hard_Z = [0.25, 0.25, 0.25, 0.25
                      ] if use_hard_Z and i == 0 else None
            self.E_step(hard_Z=hard_Z)
            i += 1
            self.M_step()
            # Since we are rearranging the ranks, it's possible that we can
            # get into a situation where the ranks shift in a cycle such
            # that the tau delta is always exeeded. I've only seen this limit
            # hit when the trace size is very small (e.g. 10).
            if i > 50:
                break

    def E_step(self, hard_Z=None):
        """Treat self.tau and self.theta as constants."""
        for node in self.full_cache.values():
            node._hit_count = 0.0

        for record in self.trace:
            if hard_Z is None:
                if record.node.is_purged:
                    rank = record.node.rank_purge_memo
                else:
                    rank = record.node.rank
                record._Z = self.calculate_Z(record.depth, rank, record.opcode)
            else:
                record._Z = hard_Z
            record.node._hit_count += record._Z[R_IRM] + record._Z[W_IRM]

        # Responsibilities changed, so re-sort the popularity ranking.
        new_ranker = RBTree()
        for node in self.full_cache.values():
            node.ranker_key = node.new_ranker_key()
            new_ranker[node.ranker_key] = node
        self.ranker = new_ranker

    def M_step(self):
        """Treat Record.Z as constant."""
        self.acc_tau = [0.0 for d in range(D)]
        self.acc_theta = [0.0 for d in range(D)]
        for record in self.trace:
            self.update_tau_and_theta_accs(record, increment=True)
        self.refresh_params()

    def calculate_Z(self, depth, rank, opcode):
        """Return per-component responsibilities for a hit at
        (*depth*, *rank*) with the given *opcode*.

        For opcode 'r' ('w') only the read (write) components share the
        probability mass; ``opcode=None`` spreads it over all four.
        """
        Z = [0.0 for d in range(D)]
        # SDD components are parameterized by stack depth, IRM by rank.
        H = [depth, rank, depth, rank]

        def num_on_hit(i):
            return (self.tau[i] * self.theta[i] * (1 - self.theta[i])**H[i])

        def den_on_hit(i, j):
            acc = 0.0
            for x in [i, j]:
                acc += num_on_hit(x)
            return acc

        if opcode is None:
            num = [0.0 for d in range(D)]
            for i in range(D):
                num[i] = num_on_hit(i)
            den = sum(num)
            try:
                return [n / den for n in num]
            except ZeroDivisionError:
                # Degenerate parameters: fall back to uniform, matching the
                # per-opcode branches below.
                return [0.25, 0.25, 0.25, 0.25]
        elif opcode == 'r':
            num = [num_on_hit(R_SDD), num_on_hit(R_IRM)]
            den = den_on_hit(R_SDD, R_IRM)
            try:
                return [num[0] / den, num[1] / den, 0.0, 0.0]
            except ZeroDivisionError:
                return [0.5, 0.5, 0.0, 0.0]
        elif opcode == 'w':
            num = [num_on_hit(W_SDD), num_on_hit(W_IRM)]
            den = den_on_hit(W_SDD, W_IRM)
            try:
                return [0.0, 0.0, num[0] / den, num[1] / den]
            except ZeroDivisionError:
                return [0.0, 0.0, 0.5, 0.5]

    def refresh_params(self):
        """Recompute tau and theta from the running accumulators."""
        R = len(self.trace)
        self.tau = [self.acc_tau[d] / R for d in range(D)]
        self.theta = [0.0, 0.0, 0.0, 0.0]
        for d in range(D):
            try:
                self.theta[d] = (R * self.tau[d] /
                                 (R * self.tau[d] + self.acc_theta[d]))
            except ZeroDivisionError:
                # Component has no mass; leave theta[d] at 0.0.
                pass

    def _update_tau_and_theta_accs(self, Z, depth, rank, increment=True):
        """Add (or subtract) one record's sufficient statistics."""
        H = [depth, rank, depth, rank]
        if increment:
            self.acc_tau = [self.acc_tau[d] + Z[d] for d in range(D)]
            self.acc_theta = [
                self.acc_theta[d] + Z[d] * H[d] for d in range(D)
            ]
        else:
            self.acc_tau = [self.acc_tau[d] - Z[d] for d in range(D)]
            # Clamp at zero to absorb floating-point drift.
            self.acc_theta = [
                max(0.0, self.acc_theta[d] - Z[d] * H[d]) for d in range(D)
            ]

    def update_tau_and_theta_accs(self, record, increment=True):
        """Fold *record* into (or out of) the accumulators."""
        if record.node.is_purged:
            rank = record.node.rank_purge_memo
        else:
            rank = record.node.rank

        self._update_tau_and_theta_accs(record.Z, record.depth, rank,
                                        increment)

    def evict(self, node):
        """Evict *node* from the resident cache; it stays as a ghost."""
        self.evict_datapoint['row'] += 1
        self.evict_datapoint['depth'] = node.depth
        self.evict_datapoint['rank'] = node.rank
        self.evict_datapoint['value'] = node.expected_value
        self.evict_datapoint['opcode'] = node.opcode
        self.evict_writer.writerow(
            [self.evict_datapoint[key] for key in self.evict_order])
        self.evict_file.flush()
        self.num_in_cache -= 1
        node.is_evicted = True

    def purge(self, node):
        """Drop *node* entirely (ghost removal) and log it."""
        self.purge_datapoint['row'] += 1
        self.purge_datapoint['depth'] = node.depth
        self.purge_datapoint['rank'] = node.rank
        self.purge_datapoint['value'] = node.expected_value
        self.purge_datapoint['opcode'] = node.opcode
        self.purge_writer.writerow(
            [self.purge_datapoint[key] for key in self.purge_order])
        self.purge_file.flush()
        self.num_in_full_cache -= 1
        if node.opcode == 'r':
            self.num_reads -= 1
        node.purge()

    @property
    def cache_list(self):
        # Resident (non-evicted) nodes only.
        return filter(lambda node: not node.is_evicted, self.full_cache_list)

    @property
    def full_cache_list(self):
        # All nodes, resident and ghost.
        return list(self.full_cache.values())

    def hit_rate(self):
        """Return the overall hit rate seen so far."""
        return float(self.num_hits) / self.num_requests

    def get_node(self, page):
        """Return the node for *page*, or None if unknown."""
        try:
            node = self.full_cache[page]
            return node
        except KeyError:
            return None
Пример #2
0
class Tdigest(object):
    """Streaming t-digest for approximate quantile estimation.

    Centroids are kept in an RBTree keyed by mean.  ``delta`` bounds the
    relative centroid size (accuracy/compression trade-off), ``K`` caps the
    centroid count at roughly K/delta before compression, and ``CX``
    throttles how often cumulative counts are recomputed.
    """

    def __init__(self, delta=0.01, K=25, CX=1.1):
        self.delta = delta
        self.K = K
        self.CX = CX
        self.centroids = RBTree()  # mean -> Centroid
        self.nreset = 0            # number of resets (incl. compressions)
        self.reset()

    def reset(self):
        """Clear all centroids and counters."""
        self.centroids.clear()
        self.n = 0
        self.nreset += 1
        self.last_cumulate = 0
        self.compressing = False

    def push(self, x, n=1):
        """Add value *x* (or each value of a list) with weight *n*."""
        if not isinstance(x, list):
            x = [x]
        for item in x:
            self._digest(item, n)

    def percentile(self, p):
        """Return the approximate p-quantile (p in [0, 1]), or None if empty.

        Linearly interpolates between the two centroids bracketing the
        cumulative weight n*p.
        """
        if self.size() == 0:
            return None
        self._cumulate(True)
        cumn = self.n * p
        lower = self.centroids.min_item()[1]
        upper = self.centroids.max_item()[1]
        for c in self.centroids.values():
            if c.cumn <= cumn:
                lower = c
            else:
                upper = c
                break
        if lower == upper:
            return lower.mean
        return lower.mean + (cumn - lower.cumn) * (upper.mean - lower.mean) / \
            (upper.cumn - lower.cumn)

    def serialize(self):
        """Encode the digest as '~'-separated text (see deserialize)."""
        result = '%s~%s~%s~' % (self.delta, self.K, self.size())
        if self.size() == 0:
            return result
        self._cumulate(True)
        means = []
        counts = []
        for c in self.centroids.values():
            means.append(str(c.mean))
            counts.append(str(c.n))
        return '%s%s~%s' % (result, '~'.join(means), '~'.join(counts))

    @classmethod
    def deserialize(cls, serialized_str):
        """Rebuild a digest from serialize() output.

        NOTE(review): uses ``basestring`` and ``xrange`` — Python 2 only;
        would raise NameError on Python 3.  Confirm target runtime.
        """
        if not isinstance(serialized_str, basestring):
            raise Exception(u'serialized_str must be str')
        data = serialized_str.split('~')
        t = Tdigest(delta=float(data[0]), K=int(data[1]))
        size = int(data[2])
        for i in xrange(size):
            t.push(float(data[i + 3]), int(data[size + i + 3]))
        t._cumulate(True)
        return t

    def _digest(self, x, n):
        """Insert one (value, weight) point, merging into the nearest
        centroid when the t-digest size bound permits."""
        if self.size() == 0:
            self._new_centroid(x, n, 0)
        else:
            _min = self.centroids.min_item()[1]
            _max = self.centroids.max_item()[1]
            nearest = self.find_nearest(x)
            if nearest and nearest.mean == x:
                self._addweight(nearest, x, n)
            elif nearest == _min:
                # Extremes always get their own centroid to keep tail accuracy.
                self._new_centroid(x, n, 0)
            elif nearest == _max:
                self._new_centroid(x, n, self.n)
            else:
                # Size bound 4*n*delta*q*(1-q) from the t-digest definition.
                p = (nearest.cumn + nearest.n / 2.0) / self.n
                max_n = int(4 * self.n * self.delta * p * (1 - p))
                if max_n >= nearest.n + n:
                    self._addweight(nearest, x, n)
                else:
                    self._new_centroid(x, n, nearest.cumn)
        self._cumulate(False)
        if self.K and self.size() > self.K / self.delta:
            self.compress()

    def find_nearest(self, x):
        """Return the centroid whose mean is closest to *x* (None if empty)."""
        if self.size() == 0:
            return None
        try:
            lower = self.centroids.ceiling_item(x)[1]
        except KeyError:
            lower = None

        if lower and lower.mean == x:
            return lower

        try:
            prev = self.centroids.floor_item(x)[1]
        except KeyError:
            prev = None

        if not lower:
            return prev
        if not prev:
            return lower
        if abs(prev.mean - x) < abs(lower.mean - x):
            return prev
        else:
            return lower

    def size(self):
        """Return the number of centroids."""
        return len(self.centroids)

    def compress(self):
        """Re-insert all centroids in random order to shrink the digest."""
        if self.compressing:
            return
        points = self.toList()
        self.reset()
        self.compressing = True
        # sorted() with a random key is a shuffle of the points.
        for point in sorted(points, key=lambda x: random()):
            self.push(point['mean'], point['n'])
        self._cumulate(True)
        self.compressing = False

    def _cumulate(self, exact):
        """Refresh cumulative counts; skipped when inexact and growth since
        the last pass is below the CX factor."""
        if self.n == self.last_cumulate:
            return
        if not exact and self.CX and self.last_cumulate and \
                self.CX > (self.n / self.last_cumulate):
            return
        cumn = 0
        for c in self.centroids.values():
            cumn = c.cumn = cumn + c.n
        self.n = self.last_cumulate = cumn

    def toList(self):
        """Return centroids as a list of plain dicts (mean, n, cumn)."""
        return [dict(mean=c.mean, n=c.n, cumn=c.cumn) for
                c in self.centroids.values()]

    def _addweight(self, nearest, x, n):
        """Merge weight *n* at *x* into centroid *nearest* in place."""
        if x != nearest.mean:
            nearest.mean += n * (x - nearest.mean) / (nearest.n + n)
        nearest.cumn += n
        nearest.n += n
        self.n += n

    def _new_centroid(self, x, n, cumn):
        """Create and insert a fresh centroid at *x* with weight *n*."""
        c = Centroid(x, n, cumn)
        self.centroids.insert(x, c)
        self.n += n
        return c
Пример #3
0
class MMCPolicy(object):
    """Two-component mixture-model cache replacement policy.

    Requests are modeled as a mixture of a stack-depth model (SDD,
    component 0) and an independent-reference model (IRM, component 1).
    The mixture weight ``tau`` and geometric parameters ``theta`` are
    re-fit periodically with an EM algorithm over a sliding trace of
    recent requests.  Evicted pages are kept as "ghost" entries so their
    statistics keep informing the model.  Diagnostics stream to CSV
    files under ``csv/``.
    """

    def __init__(self, cache_entries_limit, ghost_entries_limit,
                 trace_size_limit, csv_suffix="_mmc.csv", draw_dump=False):
        # All known pages (resident + ghost), keyed by page id.
        self.full_cache = FastRBTree()
        self.was_hit = None
        self.was_ghost_hit = None
        self.num_hits = 0
        self.num_requests = 0
        self.cache_entries_limit = cache_entries_limit
        self.ghost_entries_limit = ghost_entries_limit
        self.trace_size_limit = trace_size_limit
        # Sliding window of Record objects driving the EM fit.
        self.trace = collections.deque()
        self.stack = RBTree()   # LRU stack; iteration order gives depth.
        self.ranker = RBTree()  # nodes ordered by estimated IRM popularity.
        self.generation = 0
        # During startup, this will act like an LRU.
        self.startup = True
        self.EM_period = 50 * int(np.ceil(np.log(trace_size_limit)))
        self.countdown_to_EM = trace_size_limit // 2
        # tau[0]/theta[0] belong to the SDD component, tau[1]/theta[1]
        # to the IRM component; tau[1] is kept as 1 - tau[0].
        self.tau = [0.5, 0.5]
        self.theta = [0.5, 0.5]
        # Running sufficient statistics for the incremental M-step.
        self.acc_tau = [0.0]
        self.acc_theta = [0.0, 0.0]
        self.num_in_cache = 0
        self.num_in_full_cache = 0
        self.csv_suffix = csv_suffix
        self.draw_dump = draw_dump

        # Per-request time-series CSV.
        self.ts_order = [
                'row', 'hit', 'ghost_hit', 'tau',
                'theta0', 'theta1', 'Z', 'depth', 'rank']
        self.ts_datapoint = {key: None for key in self.ts_order}
        self.ts_datapoint['row'] = 0
        self.ts_file = open("csv/mmc" + self.csv_suffix, "w")
        self.ts_writer = csv.writer(self.ts_file)
        self.ts_writer.writerow(self.ts_order)

        # Per-eviction CSV.
        self.evict_order = [
                'row', 'depth', 'rank', 'value', 'Z', 'tau']
        self.evict_datapoint = {key: None for key in self.evict_order}
        self.evict_datapoint['row'] = 0
        self.evict_file = open("csv/mmc_evict" + self.csv_suffix, "w")
        self.evict_writer = csv.writer(self.evict_file)
        self.evict_writer.writerow(self.evict_order)

        # Per-purge (ghost removal) CSV.
        self.purge_order = ['row', 'depth', 'rank', 'value', 'Z']
        self.purge_datapoint = {key: None for key in self.purge_order}
        self.purge_datapoint['row'] = 0
        self.purge_file = open("csv/mmc_purge" + self.csv_suffix, "w")
        self.purge_writer = csv.writer(self.purge_file)
        self.purge_writer.writerow(self.purge_order)

    def request(self, page):
        """Process one request for *page*.

        Updates hit statistics, the sliding trace, the model parameters
        (incrementally, and periodically via full EM), and triggers
        eviction/purging when capacity limits are exceeded.
        """
        self.num_requests += 1
        self.was_hit = False
        self.was_ghost_hit = False
        node = self.get_node(page)
        if node:
            self.was_ghost_hit = True
            if not node.is_evicted:
                self.num_hits += 1
                self.was_hit = True
            # 1 - Z is the IRM responsibility; credit it as a fractional hit.
            node.hit_count += 1.0 - self.calculate_Z(node.depth, node.rank)
        else:
            node = Node(self)
            # Prior IRM expectation for a never-seen page.
            node.hit_count = self.tau[1]
            node.page_key = page
            self.full_cache[page] = node

        if not self.was_hit:
            self.num_in_cache += 1
        if not self.was_ghost_hit:
            self.num_in_full_cache += 1

        node.is_evicted = node.is_purged = False
        record = Record(self, node)
        self.add_trace_record(record)

        if len(self.trace) > self.trace_size_limit:
            # Slide the window: add the new record's statistics, remove the
            # oldest record's, then refresh tau/theta from the accumulators.
            popped_record = self.trace.popleft()
            self.update_tau_and_theta_accs(record, increment=True)
            self.update_tau_and_theta_accs(popped_record, increment=False)
            self.refresh_params()
            popped_record.node.hit_count -= 1.0 - popped_record.Z

        node.restack()
        node.rerank()

        self.countdown_to_EM -= 1
        if self.countdown_to_EM == 0:
            self.EM_algorithm(delta=0.00001)
            self.countdown_to_EM = self.EM_period
            self.startup = False

        if (
          self.num_in_cache > self.cache_entries_limit or
          self.num_in_full_cache >
          self.cache_entries_limit + self.ghost_entries_limit
        ):
            self.pageout()
        if self.draw_dump:
            dump_cache(self, self.csv_suffix)

    def add_trace_record(self, record):
        """Append *record* to the trace and log one time-series CSV row."""
        self.ts_datapoint['row'] = self.num_requests
        if self.was_hit:
            self.ts_datapoint['hit'] = 1
        else:
            self.ts_datapoint['hit'] = 0

        if self.was_ghost_hit:
            self.ts_datapoint['ghost_hit'] = 1
        else:
            self.ts_datapoint['ghost_hit'] = 0

        self.ts_datapoint['tau'] = self.tau[0]
        self.ts_datapoint['theta0'] = self.theta[0]
        self.ts_datapoint['theta1'] = self.theta[1]
        depth = record.depth
        self.ts_datapoint['depth'] = depth
        self.ts_datapoint['rank'] = record.node.rank
        self.ts_datapoint['Z'] = record.Z
        self.ts_writer.writerow(
                [self.ts_datapoint[key] for key in self.ts_order])
        self.ts_file.flush()
        self.trace.append(record)

    def pageout(self):
        """Evict and/or purge the node(s) with the lowest expected value."""
        min_node = None
        min_node_value = None
        min_ghost = None
        min_ghost_value = None

        # Memoize each node's current depth (LRU stack position).
        for depth, node in enumerate(self.stack.values()):
            node.depth_memo = depth

        # Scan in rank order, tracking the cheapest resident node (eviction
        # candidate) and the cheapest node overall (purge candidate).
        for rank, node in enumerate(self.ranker.values()):
            node.recompute_expected_value(depth=node.depth_memo, rank=rank)
            value = node.expected_value
            if not node.is_evicted:
                if min_node is None or value < min_node_value:
                    min_node = node
                    min_node_value = value
            if min_ghost is None or value < min_ghost_value:
                min_ghost = node
                min_ghost_value = value

        if self.num_in_cache > self.cache_entries_limit:
            self.evict(min_node)

        if (
          self.num_in_full_cache >
          self.cache_entries_limit + self.ghost_entries_limit
        ):
            self.purge(min_ghost)

    def EM_algorithm(self, delta):
        """Run EM until the parameter sum changes by less than *delta*."""
        def abs_sum():
            return abs(self.tau[0]) + abs(self.theta[0]) + abs(self.theta[1])
        before = delta + 4.0
        i = 0
        # We need to detect if we're in a "nonsense" local optimum. The
        # algorithm will optimize to the global maximum if we aren't in one of
        # these cases.
        # BUG FIX: the last condition previously repeated theta[0]; it must
        # check theta[1] so a degenerate IRM component also triggers the
        # hard-Z rescue (mirrors min(self.theta) in the R/W variant).
        if (self.startup or
            self.tau[0] == 0.0 or
            self.tau[0] == 1.0 or
            self.theta[0] == 0.0 or
            self.theta[1] == 0.0
        ):
            use_hard_Z = True
        else:
            use_hard_Z = False

        while abs(before - abs_sum()) > delta:
            before = abs_sum()
            # On the first iteration of a "rescue" run, force uniform
            # responsibilities to escape the degenerate optimum.
            hard_Z = 0.5 if use_hard_Z and i == 0 else None
            self.E_step(hard_Z=hard_Z)
            i += 1
            self.M_step()
            # Since we are rearranging the ranks, it's possible that we can
            # get into a situation where the ranks shift in a cycle such
            # that the tau delta is always exeeded. I've only seen this limit
            # hit when the trace size is very small (e.g. 10).
            if i > 50:
                break

    def E_step(self, hard_Z=None):
        """Treat self.tau and self.theta as constants."""
        for node in self.full_cache.values():
            node._hit_count = 0.0

        for record in self.trace:
            if hard_Z is None:
                if record.node.is_purged:
                    rank = record.node.rank_purge_memo
                else:
                    rank = record.node.rank
                record._Z = self.calculate_Z(record.depth, rank)
            else:
                record._Z = hard_Z
            record.node._hit_count += (1.0 - record._Z)

        # Responsibilities changed, so re-sort the popularity ranking.
        new_ranker = RBTree()
        for node in self.full_cache.values():
            node.ranker_key = node.new_ranker_key()
            new_ranker[node.ranker_key] = node
        self.ranker = new_ranker

    def M_step(self):
        """Treat Record.Z as constant."""
        self.acc_tau = [0.0]
        self.acc_theta = [0.0, 0.0]
        for record in self.trace:
            self.update_tau_and_theta_accs(record, increment=True)
        self.refresh_params()

    def calculate_Z(self, depth, rank):
        """Return the SDD responsibility for a hit at (*depth*, *rank*).

        The IRM responsibility is the complement, 1 - Z.
        """
        numerator = (
                self.tau[0] *
                self.theta[0] *
                (1 - self.theta[0])**depth)
        denominator = (
                numerator +
                self.tau[1] *
                self.theta[1] *
                (1 - self.theta[1])**rank)
        try:
            return float(numerator) / denominator
        except ZeroDivisionError:
            # This can happen when a node falls off the trace and rank and
            # depth become greater than the limits.
            return self.tau[0]

    def refresh_params(self):
        """Recompute tau and theta from the running accumulators."""
        R = len(self.trace)
        self.tau[0] = self.acc_tau[0] / R
        self.tau[1] = 1.0 - self.tau[0]

        try:
            self.theta[0] = ((R * self.tau[0]) /
                             (R * self.tau[0] + self.acc_theta[0]))
        except ZeroDivisionError:
            self.theta[0] = 1.0 / len(self.full_cache)

        try:
            self.theta[1] = ((R * self.tau[1]) /
                             (R * self.tau[1] + self.acc_theta[1]))
        except ZeroDivisionError:
            self.theta[1] = 1.0 / len(self.full_cache)

    def _update_tau_and_theta_accs(self, Z, depth, rank, increment=True):
        """Add (or subtract) one record's sufficient statistics."""
        if increment:
            self.acc_tau[0] += Z
            self.acc_theta[0] += Z * depth
            self.acc_theta[1] += (1.0 - Z) * rank
        else:
            self.acc_tau[0] -= Z
            self.acc_theta[0] -= Z * depth
            self.acc_theta[1] -= (1.0 - Z) * rank
            # Clamp at zero to absorb floating-point drift.
            self.acc_theta = [max(0.0, acc) for acc in self.acc_theta]

    def update_tau_and_theta_accs(self, record, increment=True):
        """Fold *record* into (or out of) the accumulators."""
        depth = record.depth

        if record.node.is_purged:
            rank = record.node.rank_purge_memo
        else:
            rank = record.node.rank

        self._update_tau_and_theta_accs(record.Z, depth, rank, increment)

    def evict(self, node):
        """Evict *node* from the resident cache; it stays as a ghost."""
        self.evict_datapoint['row'] += 1
        self.evict_datapoint['depth'] = node.depth
        self.evict_datapoint['rank'] = node.rank
        self.evict_datapoint['value'] = node.expected_value
        self.evict_datapoint['Z'] = self.calculate_Z(
                node.depth, node.rank)
        self.evict_datapoint['tau'] = self.tau[0]
        self.evict_writer.writerow(
                [self.evict_datapoint[key] for key in self.evict_order])
        self.evict_file.flush()
        self.num_in_cache -= 1
        node.is_evicted = True

    def purge(self, node):
        """Drop *node* entirely (ghost removal) and log it."""
        self.purge_datapoint['row'] += 1
        self.purge_datapoint['depth'] = node.depth
        self.purge_datapoint['rank'] = node.rank
        self.purge_datapoint['value'] = node.expected_value
        self.purge_datapoint['Z'] = self.calculate_Z(
                node.depth, node.rank)
        self.purge_writer.writerow(
                [self.purge_datapoint[key] for key in self.purge_order])
        self.purge_file.flush()
        self.num_in_full_cache -= 1
        node.purge()

    @property
    def cache_list(self):
        # Resident (non-evicted) nodes only.
        return filter(lambda node: not node.is_evicted, self.full_cache_list)

    @property
    def full_cache_list(self):
        # All nodes, resident and ghost.
        return list(self.full_cache.values())

    def hit_rate(self):
        """Return the overall hit rate seen so far."""
        return float(self.num_hits) / self.num_requests

    def get_node(self, page):
        """Return the node for *page*, or None if unknown."""
        try:
            node = self.full_cache[page]
            return node
        except KeyError:
            return None
Пример #4
0
class TDigest(object):
    """T-digest for streaming approximate quantiles (Dunning's algorithm).

    Centroids live in an RBTree keyed by mean.  ``delta`` controls
    accuracy (smaller -> more centroids), ``K`` scales the compression
    trigger at K/delta centroids.
    """

    def __init__(self, delta=0.01, K=25):
        self.C = RBTree()  # mean -> Centroid
        self.n = 0         # total weight
        self.delta = delta
        self.K = K

    def __add__(self, other_digest):
        """Merge two digests into a new one by re-inserting all centroids
        in shuffled order."""
        C1 = list(self.C.values())
        C2 = list(other_digest.C.values())
        shuffle(C1)
        shuffle(C2)
        data = C1 + C2
        new_digest = TDigest(self.delta, self.K)
        for c in data:
            new_digest.update(c.mean, c.count)

        return new_digest

    def __len__(self):
        return len(self.C)

    def __repr__(self):
        return """<T-Digest: n=%d, centroids=%d>""" % (self.n, len(self))

    def _add_centroid(self, centroid):
        # Merge into an existing centroid at the same mean, if any.
        if centroid.mean not in self.C:
            self.C.insert(centroid.mean, centroid)
        else:
            self.C[centroid.mean].update(centroid.mean, centroid.count)

    def _compute_centroid_quantile(self, centroid):
        """Return the quantile (in [0, 1]) at the midpoint of *centroid*."""
        denom = self.n
        cumulative_sum = sum(
            c_i.count for c_i in self.C.value_slice(-float('Inf'), centroid.mean))
        return (centroid.count / 2. + cumulative_sum) / denom

    def _update_centroid(self, centroid, x, w):
        # Re-key the centroid since updating shifts its mean.
        self.C.pop(centroid.mean)
        centroid.update(x, w)
        self._add_centroid(centroid)

    def _find_closest_centroids(self, x):
        """Return the centroid(s) whose mean is nearest *x* (two on a tie)."""
        try:
            ceil_key = self.C.ceiling_key(x)
        except KeyError:
            # x is above all centroids; the floor is nearest.
            floor_key = self.C.floor_key(x)
            return [self.C[floor_key]]

        try:
            floor_key = self.C.floor_key(x)
        except KeyError:
            # x is below all centroids; the ceiling is nearest.
            ceil_key = self.C.ceiling_key(x)
            return [self.C[ceil_key]]

        if abs(floor_key - x) < abs(ceil_key - x):
            return [self.C[floor_key]]
        elif abs(floor_key - x) == abs(ceil_key - x) and (ceil_key != floor_key):
            return [self.C[ceil_key], self.C[floor_key]]
        else:
            return [self.C[ceil_key]]

    def _theshold(self, q):
        # NOTE(review): name is a typo of "_threshold"; kept to avoid
        # breaking any external callers.  Size bound 4*n*delta*q*(1-q)
        # from the t-digest definition.
        return 4 * self.n * self.delta * q * (1 - q)

    def update(self, x, w=1):
        """
        Update the t-digest with value x and weight w.

        """
        self.n += w

        if len(self) == 0:
            self._add_centroid(Centroid(x, w))
            return

        S = self._find_closest_centroids(x)

        while len(S) != 0 and w > 0:
            # Pick a random candidate to avoid bias between tied centroids.
            j = choice(list(range(len(S))))
            c_j = S[j]

            q = self._compute_centroid_quantile(c_j)

            # This filters the out centroids that do not satisfy the second part
            # of the definition of S. See original paper by Dunning. 
            if c_j.count + w > self._theshold(q):
                S.pop(j)
                continue

            delta_w = min(self._theshold(q) - c_j.count, w)
            self._update_centroid(c_j, x, delta_w)
            w -= delta_w
            S.pop(j)

        if w > 0:
            # Leftover weight becomes a fresh centroid.
            self._add_centroid(Centroid(x, w))

        if len(self) > self.K / self.delta:
            self.compress()

        return

    def batch_update(self, values, w=1):
        """
        Update the t-digest with an iterable of values. This assumes all points have the 
        same weight.
        """
        for x in values:
            self.update(x, w)
        self.compress()
        return

    def compress(self):
        """Shrink the digest by re-inserting all centroids in random order."""
        T = TDigest(self.delta, self.K)
        C = list(self.C.values())
        shuffle(C)
        for c_i in C:
            T.update(c_i.mean, c_i.count)
        self.C = T.C

    def percentile(self, q):
        """ 
        Computes the percentile of a specific value in [0,1], ie. computes F^{-1}(q) where F^{-1} denotes
        the inverse CDF of the distribution. 

        """
        if not (0 <= q <= 1):
            raise ValueError("q must be between 0 and 1, inclusive.")

        t = 0
        q *= self.n  # work in cumulative-weight space

        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            k = c_i.count
            if q < t + k:
                # Extremes return the centroid mean directly; interior
                # centroids interpolate over half the neighbor gap.
                if i == 0:
                    return c_i.mean
                elif i == len(self) - 1:
                    return c_i.mean
                else:
                    delta = (self.C.succ_item(key)[1].mean - self.C.prev_item(key)[1].mean) / 2.
                return c_i.mean + ((q - t) / k - 0.5) * delta

            t += k
        return self.C.max_item()[1].mean

    def quantile(self, q):
        """ 
        Computes the quantile of a specific value, ie. computes F(q) where F denotes
        the CDF of the distribution. 

        """
        t = 0
        N = float(self.n)

        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            # Half the gap to the neighboring centroid on the scan side.
            if i == len(self) - 1:
                delta = (c_i.mean - self.C.prev_item(key)[1].mean) / 2.
            else:
                delta = (self.C.succ_item(key)[1].mean - c_i.mean) / 2.
            z = max(-1, (q - c_i.mean) / delta)

            if z < 1:
                return t / N + c_i.count / N * (z + 1) / 2

            t += c_i.count
        return 1

    def trimmed_mean(self, q1, q2):
        """
        Computes the mean of the distribution between the two percentiles q1 and q2.
        This is a modified algorithm than the one presented in the original t-Digest paper. 

        """
        if not (q1 < q2):
            raise ValueError("q must be between 0 and 1, inclusive.")

        s = k = t = 0
        q1 *= self.n
        q2 *= self.n
        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            k_i = c_i.count
            if q1 < t + k_i:
                if i == 0:
                    delta = self.C.succ_item(key)[1].mean - c_i.mean
                elif i == len(self) - 1:
                    delta = c_i.mean - self.C.prev_item(key)[1].mean
                else:
                    delta = (self.C.succ_item(key)[1].mean - self.C.prev_item(key)[1].mean) / 2.
                nu = ((q1 - t) / k_i - 0.5) * delta
                s += nu * k_i * c_i.mean
                k += nu * k_i

            if q2 < t + k_i:
                return s/k
            t += k_i

        return s/k
Пример #5
0
class MMCRWPolicy(object):
    """Mixture-model cache (MMC) policy with separate read/write models.

    Each request is softly attributed (responsibilities Z) to one of four
    latent components -- R_SDD, R_IRM, W_SDD, W_IRM: stack-depth (recency)
    and independent-reference (popularity) models for reads and writes.
    ``tau`` holds the mixture weights and ``theta`` the geometric
    parameters of each component; both are re-fit by an EM pass every
    ``EM_period`` requests over a sliding window of the most recent
    ``trace_size_limit`` requests.  Pages with the lowest expected value
    under the fitted model are evicted first, then eventually purged from
    the ghost list.  Diagnostics are streamed to CSV files under csv/.
    """

    def __init__(self, cache_entries_limit, ghost_entries_limit,
                 trace_size_limit, csv_suffix="_mmc.csv"):
        # All tracked pages: real cache entries plus ghost entries.
        self.full_cache = FastRBTree()
        self.was_hit = None
        self.was_ghost_hit = None
        self.num_hits = 0
        self.num_requests = 0
        self.cache_entries_limit = cache_entries_limit
        self.ghost_entries_limit = ghost_entries_limit
        self.trace_size_limit = trace_size_limit
        # Sliding window of Record objects the EM algorithm fits against.
        self.trace = collections.deque()
        # Recency order (stack depth) and popularity order (rank).
        self.stack = RBTree()
        self.ranker = RBTree()
        self.generation = 0
        # During startup, this will act like an LRU.
        self.startup = True
        self.EM_period = 50 * int(np.ceil(np.log(trace_size_limit)))
        self.countdown_to_EM = trace_size_limit // 2
        # Mixture weights and geometric parameters, indexed by component
        # (R_SDD, R_IRM, W_SDD, W_IRM).
        self.tau = [0.25, 0.25, 0.25, 0.25]
        self.theta = [0.5, 0.5, 0.5, 0.5]
        # Incrementally maintained sufficient statistics for the M-step.
        self.acc_tau = [0.0, 0.0, 0.0, 0.0]
        self.acc_theta = [0.0, 0.0, 0.0, 0.0]
        self.num_in_cache = 0
        self.num_in_full_cache = 0
        self.num_reads = 0
        self.csv_suffix = csv_suffix

        # Per-request time-series diagnostics.
        self.ts_order = [
                'row', 'hit', 'ghost_hit',
                'tau_R_SDD', 'tau_R_IRM', 'tau_W_SDD', 'tau_W_IRM',
                'theta_R_SDD', 'theta_R_IRM', 'theta_W_SDD', 'theta_W_IRM',
                'depth', 'rank',
                'Z_R_SDD', 'Z_R_IRM', 'Z_W_SDD', 'Z_W_IRM', 'Z_sum'
            ]
        self.ts_datapoint = {key: None for key in self.ts_order}
        self.ts_datapoint['row'] = 0
        self.ts_file = open("csv/mmc_rw" + self.csv_suffix, "w")
        self.ts_writer = csv.writer(self.ts_file)
        self.ts_writer.writerow(self.ts_order)

        # One row per eviction from the real cache.
        self.evict_order = [
                'row', 'depth', 'rank', 'value', 'opcode']
        self.evict_datapoint = {key: None for key in self.evict_order}
        self.evict_datapoint['row'] = 0
        self.evict_file = open("csv/mmc_rw_evict" + self.csv_suffix, "w")
        self.evict_writer = csv.writer(self.evict_file)
        self.evict_writer.writerow(self.evict_order)

        # One row per purge from the ghost list.
        self.purge_order = ['row', 'depth', 'rank', 'value', 'opcode']
        self.purge_datapoint = {key: None for key in self.purge_order}
        self.purge_datapoint['row'] = 0
        self.purge_file = open("csv/mmc_rw_purge" + self.csv_suffix, "w")
        self.purge_writer = csv.writer(self.purge_file)
        self.purge_writer.writerow(self.purge_order)

    def request(self, page, opcode):
        """Process one request for ``page`` with opcode 'r' or 'w'.

        Updates hit statistics, the sliding trace window, the recency
        stack and popularity ranker, periodically runs EM, and triggers
        eviction/purge when the cache or ghost limits are exceeded.
        """
        self.num_requests += 1
        self.was_hit = False
        self.was_ghost_hit = False
        node = self.get_node(page)
        if node:
            self.was_ghost_hit = True
            if not node.is_evicted:
                self.num_hits += 1
                self.was_hit = True
            # Credit the IRM components with this node's responsibility.
            Z = self.calculate_Z(node.depth, node.rank, node.opcode)
            node.hit_count += Z[R_IRM] + Z[W_IRM]
        else:
            node = Node(self)
            # A fresh page starts with the prior IRM mass.
            node.hit_count = self.tau[R_IRM] + self.tau[W_IRM]
            node.page_key = page
            self.full_cache[page] = node

        if not self.was_hit:
            self.num_in_cache += 1

        if not self.was_ghost_hit:
            self.num_in_full_cache += 1
        else:
            # The node's previous opcode is being overwritten below.
            if node.opcode == 'r':
                self.num_reads -= 1

        if opcode == 'r':
            self.num_reads += 1

        node.is_evicted = node.is_purged = False
        record = Record(self, node)
        self.add_trace_record(record)
        node.opcode = opcode

        if len(self.trace) > self.trace_size_limit:
            # Slide the window: add the newest record's statistics and
            # retire the oldest record's contribution.
            popped_record = self.trace.popleft()
            self.update_tau_and_theta_accs(record, increment=True)
            self.update_tau_and_theta_accs(popped_record, increment=False)
            self.refresh_params()
            popped_record.node.hit_count -= popped_record.Z[R_IRM]
            popped_record.node.hit_count -= popped_record.Z[W_IRM]

        node.restack()
        node.rerank()

        self.countdown_to_EM -= 1
        if self.countdown_to_EM == 0:
            self.EM_algorithm(delta=0.00001)
            self.countdown_to_EM = self.EM_period
            self.startup = False

        if (
          self.num_in_cache > self.cache_entries_limit or
          self.num_in_full_cache >
          self.cache_entries_limit + self.ghost_entries_limit
        ):
            self.pageout()

    def add_trace_record(self, record):
        """Append ``record`` to the trace window and emit a CSV row of
        the current parameters and this record's responsibilities."""
        self.ts_datapoint['row'] = self.num_requests
        if self.was_hit:
            self.ts_datapoint['hit'] = 1
        else:
            self.ts_datapoint['hit'] = 0

        if self.was_ghost_hit:
            self.ts_datapoint['ghost_hit'] = 1
        else:
            self.ts_datapoint['ghost_hit'] = 0

        self.ts_datapoint['tau_R_SDD'] = self.tau[R_SDD]
        self.ts_datapoint['tau_R_IRM'] = self.tau[R_IRM]
        self.ts_datapoint['tau_W_SDD'] = self.tau[W_SDD]
        self.ts_datapoint['tau_W_IRM'] = self.tau[W_IRM]

        self.ts_datapoint['theta_R_SDD'] = self.theta[R_SDD]
        self.ts_datapoint['theta_R_IRM'] = self.theta[R_IRM]
        self.ts_datapoint['theta_W_SDD'] = self.theta[W_SDD]
        self.ts_datapoint['theta_W_IRM'] = self.theta[W_IRM]

        self.ts_datapoint['Z_R_SDD'] = record.Z[R_SDD]
        self.ts_datapoint['Z_R_IRM'] = record.Z[R_IRM]
        self.ts_datapoint['Z_W_SDD'] = record.Z[W_SDD]
        self.ts_datapoint['Z_W_IRM'] = record.Z[W_IRM]

        self.ts_datapoint['Z_sum'] = sum(record.Z)

        self.ts_datapoint['depth'] = record.depth
        self.ts_datapoint['rank'] = record.node.rank

        self.ts_writer.writerow(
                [self.ts_datapoint[key] for key in self.ts_order])
        self.ts_file.flush()
        self.trace.append(record)

    def pageout(self):
        """Evict the lowest-valued cached node and/or purge the
        lowest-valued tracked node, depending on which limit tripped."""
        min_node = None
        min_node_value = None
        min_ghost = None
        min_ghost_value = None

        # Memoize depths so recompute_expected_value sees a consistent
        # stack snapshot while we iterate the ranker.
        for depth, node in enumerate(self.stack.values()):
            node.depth_memo = depth

        for rank, node in enumerate(self.ranker.values()):
            node.recompute_expected_value(depth=node.depth_memo, rank=rank)
            value = node.expected_value
            if not node.is_evicted:
                if min_node is None or value < min_node_value:
                    min_node = node
                    min_node_value = value
            if min_ghost is None or value < min_ghost_value:
                min_ghost = node
                min_ghost_value = value

        if self.num_in_cache > self.cache_entries_limit:
            self.evict(min_node)

        if (
          self.num_in_full_cache >
          self.cache_entries_limit + self.ghost_entries_limit
        ):
            self.purge(min_ghost)

    def EM_algorithm(self, delta):
        """Iterate E/M steps until the parameter vector moves by less
        than ``delta`` (or 50 iterations)."""
        def abs_sum():
            return sum(self.tau) + sum(self.theta)
        before = delta + 4.0
        i = 0
        # We need to detect if we're in a "nonsense" local optimum. The
        # algorithm will optimize to the global maximum if we aren't in one of
        # these cases.
        if (self.startup or
            min(self.tau) < 0.00001 or
            min(self.theta) < 0.00001
        ):
            use_hard_Z = True
        else:
            use_hard_Z = False

        while abs(before - abs_sum()) > delta:
            before = abs_sum()
            # A uniform hard assignment on the first pass kicks the fit
            # out of degenerate corners.
            hard_Z = [0.25, 0.25, 0.25, 0.25] if use_hard_Z and i == 0 else None
            self.E_step(hard_Z=hard_Z)
            i += 1
            self.M_step()
            # Since we are rearranging the ranks, it's possible that we can
            # get into a situation where the ranks shift in a cycle such
            # that the tau delta is always exeeded. I've only seen this limit
            # hit when the trace size is very small (e.g. 10).
            if i > 50:
                break

    def E_step(self, hard_Z=None):
        """Treat self.tau and self.theta as constants and recompute the
        responsibilities of every record (then rebuild the ranker)."""
        for node in self.full_cache.values():
            node._hit_count = 0.0

        for record in self.trace:
            if hard_Z is None:
                # Purged nodes keep the rank they had when purged.
                if record.node.is_purged:
                    rank = record.node.rank_purge_memo
                else:
                    rank = record.node.rank
                record._Z = self.calculate_Z(record.depth, rank, record.opcode)
            else:
                record._Z = hard_Z
            record.node._hit_count += record._Z[R_IRM] + record._Z[W_IRM]

        # Re-rank every node under the fresh responsibilities.
        new_ranker = RBTree()
        for node in self.full_cache.values():
            node.ranker_key = node.new_ranker_key()
            new_ranker[node.ranker_key] = node
        self.ranker = new_ranker

    def M_step(self):
        """Treat Record.Z as constant and refit tau/theta from scratch."""
        self.acc_tau = [0.0 for d in range(D)]
        self.acc_theta = [0.0 for d in range(D)]
        for record in self.trace:
            self.update_tau_and_theta_accs(record, increment=True)
        self.refresh_params()

    def calculate_Z(self, depth, rank, opcode):
        """Return the responsibility vector [R_SDD, R_IRM, W_SDD, W_IRM]
        for an access at ``depth``/``rank``.

        With a known opcode only that opcode's two components share the
        mass; with ``opcode is None`` all four components compete.
        """
        Z = [0.0 for d in range(D)]
        # SDD components are parameterized by stack depth, IRM by rank.
        H = [depth, rank, depth, rank]

        def num_on_hit(i):
            return (self.tau[i] *
                    self.theta[i] *
                    (1 - self.theta[i])**H[i])

        def den_on_hit(i, j):
            acc = 0.0
            for x in [i, j]:
                acc += num_on_hit(x)
            return acc

        if opcode is None:
            num = [0.0 for d in range(D)]
            for i in range(D):
                num[i] = num_on_hit(i)
            den = sum(num)
            return [n / den for n in num]
        # Fix: compare string literals with ==, not 'is' (identity of
        # interned strings is an implementation detail and a
        # SyntaxWarning on modern CPython).
        elif opcode == 'r':
            num = [num_on_hit(R_SDD), num_on_hit(R_IRM)]
            den = den_on_hit(R_SDD, R_IRM)
            try:
                return [num[0] / den, num[1] / den, 0.0, 0.0]
            except ZeroDivisionError:
                # Both read components vanished; split evenly.
                return [0.5, 0.5, 0.0, 0.0]
        elif opcode == 'w':
            num = [num_on_hit(W_SDD), num_on_hit(W_IRM)]
            den = den_on_hit(W_SDD, W_IRM)
            try:
                return [0.0, 0.0, num[0] / den, num[1] / den]
            except ZeroDivisionError:
                return [0.0, 0.0, 0.5, 0.5]

    def refresh_params(self):
        """Recompute tau and theta from the current accumulators."""
        R = len(self.trace)
        self.tau = [self.acc_tau[d] / R for d in range(D)]
        self.theta = [0.0, 0.0, 0.0, 0.0]
        for d in range(D):
            try:
                self.theta[d] = (R * self.tau[d] /
                                 (R * self.tau[d] + self.acc_theta[d]))
            except ZeroDivisionError:
                # Component carries no mass; leave its theta at 0.
                pass

    def _update_tau_and_theta_accs(self, Z, depth, rank, increment=True):
        """Add (or subtract) one record's contribution to the
        sufficient-statistic accumulators."""
        H = [depth, rank, depth, rank]
        if increment:
            self.acc_tau = [self.acc_tau[d] + Z[d] for d in range(D)]
            self.acc_theta = [self.acc_theta[d] + Z[d] * H[d] for d in range(D)]
        else:
            self.acc_tau = [self.acc_tau[d] - Z[d] for d in range(D)]
            # Clamp: ranks may have shifted since the record was added.
            self.acc_theta = [max(0.0, self.acc_theta[d] - Z[d] * H[d])
                              for d in range(D)]

    def update_tau_and_theta_accs(self, record, increment=True):
        """Accumulator update for ``record``, using the purge-time rank
        for nodes that have been purged since."""
        if record.node.is_purged:
            rank = record.node.rank_purge_memo
        else:
            rank = record.node.rank

        self._update_tau_and_theta_accs(record.Z, record.depth, rank, increment)

    def evict(self, node):
        """Drop ``node`` from the real cache (it stays a ghost) and log
        the eviction."""
        self.evict_datapoint['row'] += 1
        self.evict_datapoint['depth'] = node.depth
        self.evict_datapoint['rank'] = node.rank
        self.evict_datapoint['value'] = node.expected_value
        self.evict_datapoint['opcode'] = node.opcode
        self.evict_writer.writerow(
                [self.evict_datapoint[key] for key in self.evict_order])
        self.evict_file.flush()
        self.num_in_cache -= 1
        node.is_evicted = True

    def purge(self, node):
        """Remove ``node`` from the ghost list entirely and log it."""
        self.purge_datapoint['row'] += 1
        self.purge_datapoint['depth'] = node.depth
        self.purge_datapoint['rank'] = node.rank
        self.purge_datapoint['value'] = node.expected_value
        self.purge_datapoint['opcode'] = node.opcode
        self.purge_writer.writerow(
                [self.purge_datapoint[key] for key in self.purge_order])
        self.purge_file.flush()
        self.num_in_full_cache -= 1
        if node.opcode == 'r':
            self.num_reads -= 1
        node.purge()

    @property
    def cache_list(self):
        # Nodes still resident in the real cache (not evicted).
        return filter(lambda node: not node.is_evicted, self.full_cache_list)

    @property
    def full_cache_list(self):
        # All tracked nodes, including ghosts.
        return list(self.full_cache.values())

    def hit_rate(self):
        """Fraction of requests so far that hit the real cache."""
        return float(self.num_hits) / self.num_requests

    def get_node(self, page):
        """Return the tracked node for ``page``, or None if unseen."""
        try:
            node = self.full_cache[page]
            return node
        except KeyError:
            return None
Пример #6
0
class MINPolicy(object):
    def __init__(self, cache_size_limit, trace, csv_suffix=".csv"):
        self.cache_size_limit = cache_size_limit
        self.cache = {}
        self.hits = 0.0
        self.requests = 0.0
        self.ts_order = ['row', 'hit']
        self.ts_datapoint = {key: None for key in self.ts_order}
        self.ts_datapoint['row'] = 0
        self.ts_file = open("csv/min" + csv_suffix, "w")
        self.ts_writer = csv.writer(self.ts_file)
        self.ts_writer.writerow(self.ts_order)

        self.clairvoyance = FastRBTree()

        self.precog = FastRBTree()
        last_time = time.time()
        for i, page_opcode in enumerate(trace):
            if time.time() > last_time + 0.1:
                last_time = time.time()
                print '1', i, '\r',
            sys.stdout.flush()
            page, _ = page_opcode
            try:
                self.precog[page].append(i)
            except KeyError:
                self.precog[page] = collections.deque()
                self.precog[page].append(i)

            known_max = i
        known_max += 2
        for times in self.precog.values():
            times.append(known_max)
            known_max += 1
        print
        print 'Done loading.'

    def hit_rate(self):
        return self.hits / self.requests

    def request(self, page):
        self.requests += 1
        if page in self.cache:
            was_hit = True
            self.hits += 1
        else:
            was_hit = False

        self.cache[page] = self.precog[page].popleft()
        # This happens on startup.
        if self.cache[page] < self.requests:
            self.cache[page] = self.precog[page].popleft()
        self.clairvoyance[self.cache[page]] = page
        self.ts_datapoint['row'] += 1

        if was_hit:
            self.ts_datapoint['hit'] = 1
        else:
            self.ts_datapoint['hit'] = 0

        self.ts_writer.writerow(
                [self.ts_datapoint[key] for key in self.ts_order])
        self.ts_file.flush()

        if len(self.cache) > self.cache_size_limit:
            next_use, page = self.clairvoyance.pop_max()
            del self.cache[page]
Пример #7
0
class Master(object):
    """Cluster master: tracks worker nodes, spawns sessions of workers
    across them, and culls nodes that stop sending keepalive pings.

    Nodes live in two FastRBTree queues: ``_keepalive_queue`` keyed by
    last-ping time (for culling) and ``_priority_queue`` keyed by load
    (for worker placement).  Both map key -> {node_proc: NodeInfo}.
    """
    __metaclass__ = ProcessMeta

    def __init__(self, node_timeout):
        self._logger = logging.getLogger(self.__class__.__name__)

        self._nodes = {}
        self._sessions = {}
        self._sessions_by_owner = {}
        self._keepalive_queue = FastRBTree()
        self._priority_queue = FastRBTree()
        self._node_timeout = node_timeout
        # Background greenlet that periodically removes dead nodes.
        self._culling_timer = runtime.greenpool.spawn(self._cull_dead_nodes)

    def get_session(self, name, owner=None, dep_server=None, work_dir=None, worker_count=None, init=None):
        """Return the existing session ``name`` or create a new one.

        Creating a session requires owner, work_dir and dep_server.

        NOTE(review): returns the Session itself on a cache hit but a
        RemoteCloud wrapper on creation -- callers apparently tolerate
        both; confirm this asymmetry is intentional.
        """
        try:
            session = self._sessions[name]
            session.dep_cache.set_dependency_server(dep_server)
            return session
        except KeyError:
            if owner is None:
                raise ValueError("An owner must be provided for new sessions")

            if work_dir is None:
                raise ValueError("Valid working directory required to create a new session")

            if dep_server is None:
                raise ValueError("Dependency server must be provided to create a new session")

            session = Session(name, owner, dep_server, worker_count, self._spawn_workers, work_dir, init)

            self._sessions[name] = session
            self._sessions_by_owner.setdefault(owner, {})[name] = session

            return RemoteCloud(name, owner, session.hub, session.created_on, len(session.workers), self)

    def _spawn_workers(self, name, owner, worker_count, init):
        """Spawn up to ``worker_count`` workers for a session across the
        current node pool; returns the list of spawned workers."""
        # Flatten {load -> {proc -> node}} into a single node iterator.
        all_nodes = itertools.imap(lambda nd: nd.itervalues(), self._priority_queue.values())
        all_nodes = itertools.chain.from_iterable(all_nodes)
        node_pool = NodePool(all_nodes, name, self._logger)
        node_pool_size = len(node_pool)
        if worker_count is None:
            worker_count = node_pool_size
        self._logger.info("Creating session %s:%s with %d workers", owner, name, worker_count)
        # We can only ever have as many workers as there are processors in the cluster
        if worker_count > node_pool_size:
            self._logger.warning("Session %s: requested worker count %d will be capped to %d",
                                 name, worker_count, node_pool_size)

            worker_count = node_pool_size

        workers = []

        # Keep retrying on the remaining nodes until we have enough
        # workers or the pool is exhausted (failed nodes drop out).
        while len(workers) < worker_count and (len(node_pool) > 0):
            results = node_pool.spawn_workers(worker_count - len(workers), init=init)
            for nproc, result in results:
                try:
                    worker_batch = result.get()
                    workers.extend(worker_batch)
                except Exception as ex:
                    self._logger.error("Session %s: failed to spawn workers on node %s due to error:\n%s",
                                       name, nproc, full_traceback(ex))

        return workers

    def shutdown_session(self, name):
        """Unregister session ``name`` and shut it down asynchronously.

        Raises KeyError if the session does not exist.
        """
        session = self._sessions.pop(name)
        owner_sessions = self._sessions_by_owner[session.owner]
        del owner_sessions[name]

        # Carry out the shutdown operation in the background
        Tasklet.spawn(session.shutdown)

    def node_update(self, node_proc, cpu_count, cpu_usage, ram_total, ram_usage):
        """Handle a keepalive ping: refresh the node's stats and re-queue
        it under its new last-ping time and load."""
        # Remove the node from the queues if it is already registered
        if node_proc in self._nodes:
            node = self._nodes[node_proc]
            self._dequeue(node)
        else:
            # Create a new node info if it doesn't exist yet
            node = NodeInfo(node_proc, cpu_count)
            self._nodes[node_proc] = node

        # Update load based on a simple formula of tenancy and resource usage
        node.update(cpu_usage + ram_usage, cpu_usage, ram_total, ram_usage)

        self._logger.debug("Received ping %s", node)

        # Enqueue the node again
        self._enqueue(node)

    def node_info(self):
        """Return the NodeInfo objects for all registered nodes."""
        return self._nodes.values()

    def shutdown(self):
        """
        Initiate cluster wide shutdown
        """
        self._logger.warn("Shutting down cluster")
        self._culling_timer.kill()

        for node in self._nodes.values():
            self._logger.info("Shutting down node %s", node.proc)
            retry(lambda: node.proc.shutdown(), logger=self._logger)

    def _cull_dead_nodes(self):
        """
        Remove the node so that it cannot be included in new sessions.

        Runs forever (spawned as a greenlet in __init__), waking every
        ``_node_timeout`` seconds to delete nodes whose last ping is
        older than the timeout.
        """
        while True:
            # Key-slice of the keepalive tree: everything pinged before
            # (now - timeout) is considered dead.
            dead_nodes = list(self._keepalive_queue[:datetime.now() - timedelta(seconds=self._node_timeout)].values())
            dead_node_count = len(dead_nodes)
            if dead_node_count > 0:
                self._logger.info("Culling %d nodes that are no longer responding.", dead_node_count)
                for node_dicts in dead_nodes:
                    for node in node_dicts.values():
                        self._logger.info("Deleting dead node %s", node.proc)
                        self._delete_node(node)
            else:
                self._logger.info("No dead nodes.")

            runtime.sleep(self._node_timeout)

    def _delete_node(self, node):
        # Forget the node entirely: registry plus both queues.
        del self._nodes[node.proc]

        self._dequeue(node)

    def _enqueue(self, node):
        # A node is always present in both queues under its current keys.
        self._add_to_queue(self._keepalive_queue, node, node.last_ping)
        self._add_to_queue(self._priority_queue, node, node.load)

    def _dequeue(self, node):
        self._delete_from_queue(self._keepalive_queue, node.proc, node.last_ping)
        self._delete_from_queue(self._priority_queue, node.proc, node.load)

    @staticmethod
    def _add_to_queue(queue, node, key):
        # queue maps key -> {node_proc: node}; several nodes may share a key.
        queue.setdefault(key, {})[node.proc] = node

    @staticmethod
    def _delete_from_queue(queue, node_id, key):
        # NOTE(review): queue.get(key) returns None when key is absent,
        # which would make the del below raise TypeError -- presumably
        # callers only dequeue nodes that were enqueued under ``key``;
        # confirm.
        kq_nodes = queue.get(key)
        del kq_nodes[node_id]
        if not len(kq_nodes):
            queue.discard(key)
Пример #8
0
class MMCPolicy(object):
    """Two-component mixture-model cache (MMC) replacement policy.

    Each request is softly attributed (responsibility Z) to either a
    stack-depth (recency) component or an independent-reference
    (popularity) component.  ``tau`` holds the mixture weights and
    ``theta`` the geometric parameters; both are re-fit by an EM pass
    every ``EM_period`` requests over a sliding window of the most
    recent ``trace_size_limit`` requests.  Pages with the lowest
    expected value under the fitted model are evicted first, then
    eventually purged from the ghost list.  Diagnostics are streamed to
    CSV files under csv/.
    """

    def __init__(self,
                 cache_entries_limit,
                 ghost_entries_limit,
                 trace_size_limit,
                 csv_suffix="_mmc.csv",
                 draw_dump=False):
        # All tracked pages: real cache entries plus ghost entries.
        self.full_cache = FastRBTree()
        self.was_hit = None
        self.was_ghost_hit = None
        self.num_hits = 0
        self.num_requests = 0
        self.cache_entries_limit = cache_entries_limit
        self.ghost_entries_limit = ghost_entries_limit
        self.trace_size_limit = trace_size_limit
        # Sliding window of Record objects the EM algorithm fits against.
        self.trace = collections.deque()
        # Recency order (stack depth) and popularity order (rank).
        self.stack = RBTree()
        self.ranker = RBTree()
        self.generation = 0
        # During startup, this will act like an LRU.
        self.startup = True
        self.EM_period = 50 * int(np.ceil(np.log(trace_size_limit)))
        self.countdown_to_EM = trace_size_limit // 2
        # Mixture weights [SDD, IRM] and geometric parameters.
        self.tau = [0.5, 0.5]
        self.theta = [0.5, 0.5]
        # Incrementally maintained sufficient statistics for the M-step.
        self.acc_tau = [0.0]
        self.acc_theta = [0.0, 0.0]
        self.num_in_cache = 0
        self.num_in_full_cache = 0
        self.csv_suffix = csv_suffix
        self.draw_dump = draw_dump

        # Per-request time-series diagnostics.
        self.ts_order = [
            'row', 'hit', 'ghost_hit', 'tau', 'theta0', 'theta1', 'Z', 'depth',
            'rank'
        ]
        self.ts_datapoint = {key: None for key in self.ts_order}
        self.ts_datapoint['row'] = 0
        self.ts_file = open("csv/mmc" + self.csv_suffix, "w")
        self.ts_writer = csv.writer(self.ts_file)
        self.ts_writer.writerow(self.ts_order)

        # One row per eviction from the real cache.
        self.evict_order = ['row', 'depth', 'rank', 'value', 'Z', 'tau']
        self.evict_datapoint = {key: None for key in self.evict_order}
        self.evict_datapoint['row'] = 0
        self.evict_file = open("csv/mmc_evict" + self.csv_suffix, "w")
        self.evict_writer = csv.writer(self.evict_file)
        self.evict_writer.writerow(self.evict_order)

        # One row per purge from the ghost list.
        self.purge_order = ['row', 'depth', 'rank', 'value', 'Z']
        self.purge_datapoint = {key: None for key in self.purge_order}
        self.purge_datapoint['row'] = 0
        self.purge_file = open("csv/mmc_purge" + self.csv_suffix, "w")
        self.purge_writer = csv.writer(self.purge_file)
        self.purge_writer.writerow(self.purge_order)

    def request(self, page):
        """Process one request for ``page``: update statistics, the
        sliding trace window and orderings, periodically run EM, and
        evict/purge when over the limits."""
        self.num_requests += 1
        self.was_hit = False
        self.was_ghost_hit = False
        node = self.get_node(page)
        if node:
            self.was_ghost_hit = True
            if not node.is_evicted:
                self.num_hits += 1
                self.was_hit = True
            # 1 - Z is the IRM responsibility of this access.
            node.hit_count += 1.0 - self.calculate_Z(node.depth, node.rank)
        else:
            node = Node(self)
            # A fresh page starts with the prior IRM mass.
            node.hit_count = self.tau[1]
            node.page_key = page
            self.full_cache[page] = node

        if not self.was_hit:
            self.num_in_cache += 1
        if not self.was_ghost_hit:
            self.num_in_full_cache += 1

        node.is_evicted = node.is_purged = False
        record = Record(self, node)
        self.add_trace_record(record)

        if len(self.trace) > self.trace_size_limit:
            # Slide the window: add the newest record's statistics and
            # retire the oldest record's contribution.
            popped_record = self.trace.popleft()
            self.update_tau_and_theta_accs(record, increment=True)
            self.update_tau_and_theta_accs(popped_record, increment=False)
            self.refresh_params()
            popped_record.node.hit_count -= 1.0 - popped_record.Z

        node.restack()
        node.rerank()

        self.countdown_to_EM -= 1
        if self.countdown_to_EM == 0:
            self.EM_algorithm(delta=0.00001)
            self.countdown_to_EM = self.EM_period
            self.startup = False

        if (self.num_in_cache > self.cache_entries_limit
                or self.num_in_full_cache >
                self.cache_entries_limit + self.ghost_entries_limit):
            self.pageout()
        if self.draw_dump:
            dump_cache(self, self.csv_suffix)

    def add_trace_record(self, record):
        """Append ``record`` to the trace window and emit a CSV row of
        the current parameters and this record's responsibility."""
        self.ts_datapoint['row'] = self.num_requests
        if self.was_hit:
            self.ts_datapoint['hit'] = 1
        else:
            self.ts_datapoint['hit'] = 0

        if self.was_ghost_hit:
            self.ts_datapoint['ghost_hit'] = 1
        else:
            self.ts_datapoint['ghost_hit'] = 0

        self.ts_datapoint['tau'] = self.tau[0]
        self.ts_datapoint['theta0'] = self.theta[0]
        self.ts_datapoint['theta1'] = self.theta[1]
        depth = record.depth
        self.ts_datapoint['depth'] = depth
        self.ts_datapoint['rank'] = record.node.rank
        self.ts_datapoint['Z'] = record.Z
        self.ts_writer.writerow(
            [self.ts_datapoint[key] for key in self.ts_order])
        self.ts_file.flush()
        self.trace.append(record)

    def pageout(self):
        """Evict the lowest-valued cached node and/or purge the
        lowest-valued tracked node, depending on which limit tripped."""
        min_node = None
        min_node_value = None
        min_ghost = None
        min_ghost_value = None

        # Memoize depths so recompute_expected_value sees a consistent
        # stack snapshot while we iterate the ranker.
        for depth, node in enumerate(self.stack.values()):
            node.depth_memo = depth

        for rank, node in enumerate(self.ranker.values()):
            node.recompute_expected_value(depth=node.depth_memo, rank=rank)
            value = node.expected_value
            if not node.is_evicted:
                if min_node is None or value < min_node_value:
                    min_node = node
                    min_node_value = value
            if min_ghost is None or value < min_ghost_value:
                min_ghost = node
                min_ghost_value = value

        if self.num_in_cache > self.cache_entries_limit:
            self.evict(min_node)

        if (self.num_in_full_cache >
                self.cache_entries_limit + self.ghost_entries_limit):
            self.purge(min_ghost)

    def EM_algorithm(self, delta):
        """Iterate E/M steps until the parameter vector moves by less
        than ``delta`` (or 50 iterations)."""
        def abs_sum():
            return abs(self.tau[0]) + abs(self.theta[0]) + abs(self.theta[1])

        before = delta + 4.0
        i = 0
        # We need to detect if we're in a "nonsense" local optimum. The
        # algorithm will optimize to the global maximum if we aren't in one of
        # these cases.
        # Fix: the original tested theta[0] twice; the second test is
        # meant to cover theta[1], mirroring the checks on tau.
        if (self.startup or self.tau[0] == 0.0 or self.tau[0] == 1.0
                or self.theta[0] == 0.0 or self.theta[1] == 0.0):
            use_hard_Z = True
        else:
            use_hard_Z = False

        while abs(before - abs_sum()) > delta:
            before = abs_sum()
            # A uniform hard assignment on the first pass kicks the fit
            # out of degenerate corners.
            hard_Z = 0.5 if use_hard_Z and i == 0 else None
            self.E_step(hard_Z=hard_Z)
            i += 1
            self.M_step()
            # Since we are rearranging the ranks, it's possible that we can
            # get into a situation where the ranks shift in a cycle such
            # that the tau delta is always exeeded. I've only seen this limit
            # hit when the trace size is very small (e.g. 10).
            if i > 50:
                break

    def E_step(self, hard_Z=None):
        """Treat self.tau and self.theta as constants and recompute the
        responsibility of every record (then rebuild the ranker)."""
        for node in self.full_cache.values():
            node._hit_count = 0.0

        for record in self.trace:
            if hard_Z is None:
                # Purged nodes keep the rank they had when purged.
                if record.node.is_purged:
                    rank = record.node.rank_purge_memo
                else:
                    rank = record.node.rank
                record._Z = self.calculate_Z(record.depth, rank)
            else:
                record._Z = hard_Z
            record.node._hit_count += (1.0 - record._Z)

        # Re-rank every node under the fresh responsibilities.
        new_ranker = RBTree()
        for node in self.full_cache.values():
            node.ranker_key = node.new_ranker_key()
            new_ranker[node.ranker_key] = node
        self.ranker = new_ranker

    def M_step(self):
        """Treat Record.Z as constant and refit tau/theta from scratch."""
        self.acc_tau = [0.0]
        self.acc_theta = [0.0, 0.0]
        for record in self.trace:
            self.update_tau_and_theta_accs(record, increment=True)
        self.refresh_params()

    def calculate_Z(self, depth, rank):
        """Return the SDD responsibility of an access at ``depth`` and
        ``rank``; the IRM responsibility is 1 - Z."""
        numerator = (self.tau[0] * self.theta[0] * (1 - self.theta[0])**depth)
        denominator = (numerator + self.tau[1] * self.theta[1] *
                       (1 - self.theta[1])**rank)
        try:
            return float(numerator) / denominator
        except ZeroDivisionError:
            # This can happen when a node falls off the trace and rank and
            # depth become greater than the limits.
            return self.tau[0]

    def refresh_params(self):
        """Recompute tau and theta from the current accumulators."""
        R = len(self.trace)
        self.tau[0] = self.acc_tau[0] / R
        self.tau[1] = 1.0 - self.tau[0]

        try:
            self.theta[0] = ((R * self.tau[0]) /
                             (R * self.tau[0] + self.acc_theta[0]))
        except ZeroDivisionError:
            # No SDD mass at all; fall back to a uniform guess.
            self.theta[0] = 1.0 / len(self.full_cache)

        try:
            self.theta[1] = ((R * self.tau[1]) /
                             (R * self.tau[1] + self.acc_theta[1]))
        except ZeroDivisionError:
            self.theta[1] = 1.0 / len(self.full_cache)

    def _update_tau_and_theta_accs(self, Z, depth, rank, increment=True):
        """Add (or subtract) one record's contribution to the
        sufficient-statistic accumulators."""
        if increment:
            self.acc_tau[0] += Z
            self.acc_theta[0] += Z * depth
            self.acc_theta[1] += (1.0 - Z) * rank
        else:
            self.acc_tau[0] -= Z
            self.acc_theta[0] -= Z * depth
            self.acc_theta[1] -= (1.0 - Z) * rank
            # Clamp: ranks may have shifted since the record was added.
            self.acc_theta = [max(0.0, acc) for acc in self.acc_theta]

    def update_tau_and_theta_accs(self, record, increment=True):
        """Accumulator update for ``record``, using the purge-time rank
        for nodes that have been purged since."""
        depth = record.depth

        if record.node.is_purged:
            rank = record.node.rank_purge_memo
        else:
            rank = record.node.rank

        self._update_tau_and_theta_accs(record.Z, depth, rank, increment)

    def evict(self, node):
        """Drop ``node`` from the real cache (it stays a ghost) and log
        the eviction."""
        self.evict_datapoint['row'] += 1
        self.evict_datapoint['depth'] = node.depth
        self.evict_datapoint['rank'] = node.rank
        self.evict_datapoint['value'] = node.expected_value
        self.evict_datapoint['Z'] = self.calculate_Z(node.depth, node.rank)
        self.evict_datapoint['tau'] = self.tau[0]
        self.evict_writer.writerow(
            [self.evict_datapoint[key] for key in self.evict_order])
        self.evict_file.flush()
        self.num_in_cache -= 1
        node.is_evicted = True

    def purge(self, node):
        """Remove ``node`` from the ghost list entirely and log it."""
        self.purge_datapoint['row'] += 1
        self.purge_datapoint['depth'] = node.depth
        self.purge_datapoint['rank'] = node.rank
        self.purge_datapoint['value'] = node.expected_value
        self.purge_datapoint['Z'] = self.calculate_Z(node.depth, node.rank)
        self.purge_writer.writerow(
            [self.purge_datapoint[key] for key in self.purge_order])
        self.purge_file.flush()
        self.num_in_full_cache -= 1
        node.purge()

    @property
    def cache_list(self):
        # Nodes still resident in the real cache (not evicted).
        return filter(lambda node: not node.is_evicted, self.full_cache_list)

    @property
    def full_cache_list(self):
        # All tracked nodes, including ghosts.
        return list(self.full_cache.values())

    def hit_rate(self):
        """Fraction of requests so far that hit the real cache."""
        return float(self.num_hits) / self.num_requests

    def get_node(self, page):
        """Return the tracked node for ``page``, or None if unseen."""
        try:
            node = self.full_cache[page]
            return node
        except KeyError:
            return None
Пример #9
0
class TDigest(object):
    """Streaming t-digest sketch for approximate quantiles.

    Centroids are held in an RBTree keyed by their mean. ``delta`` is the
    accuracy parameter and ``K`` scales the compression trigger: the digest
    is compressed once it holds more than ``K / delta`` centroids (see
    ``update``).
    """

    def __init__(self, delta=0.01, K=25):
        self.C = RBTree()  # centroid tree: mean -> Centroid
        self.n = 0  # total weight ingested (sum of centroid counts)
        self.delta = delta  # accuracy parameter (smaller => more centroids)
        self.K = K  # compression factor

    def __add__(self, other_digest):
        """Return a new digest merging this one with *other_digest*.

        Centroids from both digests are replayed, in pseudo-random order,
        into a fresh digest built with this digest's parameters.
        """
        combined = list(chain(self.C.values(), other_digest.C.values()))
        merged = TDigest(self.delta, self.K)
        if combined:
            for centroid in pyudorandom.items(combined):
                merged.update(centroid.mean, centroid.count)
        return merged

    def __len__(self):
        return len(self.C)

    def __repr__(self):
        return """<T-Digest: n=%d, centroids=%d>""" % (self.n, len(self))

    def _add_centroid(self, centroid):
        if centroid.mean not in self.C:
            self.C.insert(centroid.mean, centroid)
        else:
            self.C[centroid.mean].update(centroid.mean, centroid.count)

    def _compute_centroid_quantile(self, centroid):
        denom = self.n
        cumulative_sum = sum(
            c_i.count
            for c_i in self.C.value_slice(-float('Inf'), centroid.mean))
        return (centroid.count / 2. + cumulative_sum) / denom

    def _update_centroid(self, centroid, x, w):
        """Fold (x, w) into *centroid* and re-key it in the tree.

        The tree is keyed by centroid mean and ``update`` moves the mean,
        so the centroid must be popped under its old key first and
        re-inserted afterwards -- the order of these three calls matters.
        """
        self.C.pop(centroid.mean)
        centroid.update(x, w)
        self._add_centroid(centroid)

    def _find_closest_centroids(self, x):
        try:
            ceil_key = self.C.ceiling_key(x)
        except KeyError:
            floor_key = self.C.floor_key(x)
            return [self.C[floor_key]]

        try:
            floor_key = self.C.floor_key(x)
        except KeyError:
            ceil_key = self.C.ceiling_key(x)
            return [self.C[ceil_key]]

        if abs(floor_key - x) < abs(ceil_key - x):
            return [self.C[floor_key]]
        elif abs(floor_key - x) == abs(ceil_key -
                                       x) and (ceil_key != floor_key):
            return [self.C[ceil_key], self.C[floor_key]]
        else:
            return [self.C[ceil_key]]

    def _theshold(self, q):
        return 4 * self.n * self.delta * q * (1 - q)

    def update(self, x, w=1):
        """
        Update the t-digest with value x and weight w.

        Spreads w over the centroid(s) closest to x, respecting each
        centroid's size threshold; any weight no centroid can absorb
        becomes a new centroid. Randomised via `choice`, so results are
        not deterministic run-to-run.
        """
        self.n += w

        # Very first point: it becomes its own centroid.
        if len(self) == 0:
            self._add_centroid(Centroid(x, w))
            return

        S = self._find_closest_centroids(x)

        # Distribute w over candidates in random order until either the
        # weight is exhausted or no candidate can take more.
        while len(S) != 0 and w > 0:
            j = choice(list(range(len(S))))
            c_j = S[j]

            q = self._compute_centroid_quantile(c_j)

            # This filters out the centroids that do not satisfy the second
            # part of the definition of S. See original paper by Dunning.
            if c_j.count + w > self._theshold(q):
                S.pop(j)
                continue

            # Give this centroid as much of w as its threshold allows.
            delta_w = min(self._theshold(q) - c_j.count, w)
            self._update_centroid(c_j, x, delta_w)
            w -= delta_w
            S.pop(j)

        # Leftover weight that no nearby centroid could absorb.
        if w > 0:
            self._add_centroid(Centroid(x, w))

        # Bound the centroid count (see class docstring).
        if len(self) > self.K / self.delta:
            self.compress()

        return

    def batch_update(self, values, w=1):
        """
        Update the t-digest with an iterable of values. This assumes all points have the 
        same weight.
        """
        for x in values:
            self.update(x, w)
        self.compress()
        return

    def compress(self):
        """Shrink the digest by replaying every centroid into a fresh one.

        Replay order comes from pyudorandom.items (randomised
        re-insertion); the rebuilt centroid tree then replaces self.C.
        """
        rebuilt = TDigest(self.delta, self.K)
        for centroid in pyudorandom.items(list(self.C.values())):
            rebuilt.update(centroid.mean, centroid.count)
        self.C = rebuilt.C

    def percentile(self, p):
        """ 
        Computes the percentile of a specific value in [0,100].

        Walks centroids in mean order accumulating weight t; once the
        target weight p*n falls inside a centroid, interpolates linearly
        between the neighbouring centroid means. Boundary centroids
        return their mean directly.
        """
        if not (0 <= p <= 100):
            raise ValueError("p must be between 0 and 100, inclusive.")

        t = 0
        # Convert the percentile into a target cumulative weight.
        p = float(p) / 100.
        p *= self.n

        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            k = c_i.count
            if p < t + k:
                if i == 0:
                    return c_i.mean
                elif i == len(self) - 1:
                    return c_i.mean
                else:
                    # Half the gap between the two neighbouring means.
                    delta = (self.C.succ_item(key)[1].mean -
                             self.C.prev_item(key)[1].mean) / 2.
                return c_i.mean + ((p - t) / k - 0.5) * delta

            t += k
        # Target weight beyond every centroid: the largest mean.
        return self.C.max_item()[1].mean

    def quantile(self, q):
        """ 
        Computes the quantile of a specific value, ie. computes F(q) where F denotes
        the CDF of the distribution. 

        NOTE(review): despite the name, *q* is a data value and the return
        is the cumulative fraction at q (the inverse of percentile()).
        """
        t = 0
        N = float(self.n)

        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            # delta: half-distance to the neighbouring centroid mean
            # (the span this centroid is taken to cover).
            if i == len(self) - 1:
                delta = (c_i.mean - self.C.prev_item(key)[1].mean) / 2.
            else:
                delta = (self.C.succ_item(key)[1].mean - c_i.mean) / 2.
            # z: position of q inside this centroid's span, clamped at -1.
            z = max(-1, (q - c_i.mean) / delta)

            if z < 1:
                return t / N + c_i.count / N * (z + 1) / 2

            t += c_i.count
        # q lies above every centroid's span.
        return 1

    def trimmed_mean(self, p1, p2):
        """
        Computes the mean of the distribution between the two percentiles p1 and p2.
        This is a modified algorithm than the one presented in the original t-Digest paper. 

        """
        # NOTE(review): only p1 < p2 is enforced; values outside [0, 100]
        # pass through despite what the error message says -- confirm
        # whether a range check was intended.
        if not (p1 < p2):
            raise ValueError("p1 must be between 0 and 100 and less than p2.")

        # s: weighted sum, k: accumulated weight, t: cumulative count.
        s = k = t = 0
        # Convert percentiles to target cumulative weights.
        p1 /= 100.
        p2 /= 100.
        p1 *= self.n
        p2 *= self.n
        for i, key in enumerate(self.C.keys()):
            c_i = self.C[key]
            k_i = c_i.count
            if p1 < t + k_i:
                # delta: local spacing between neighbouring centroid means
                # (one-sided at the ends).
                if i == 0:
                    delta = self.C.succ_item(key)[1].mean - c_i.mean
                elif i == len(self) - 1:
                    delta = c_i.mean - self.C.prev_item(key)[1].mean
                else:
                    delta = (self.C.succ_item(key)[1].mean -
                             self.C.prev_item(key)[1].mean) / 2.
                nu = ((p1 - t) / k_i - 0.5) * delta
                s += nu * k_i * c_i.mean
                k += nu * k_i

            # NOTE(review): if p2 falls in a centroid before p1 does,
            # k is still 0 here and this divides by zero -- verify the
            # intended behaviour for narrow [p1, p2] ranges.
            if p2 < t + k_i:
                return s / k
            t += k_i

        return s / k