def test_popitem_ties(self):
    """With all-equal priorities, popitem must drain every key, returning 0 each time."""
    heap = HeapDict()
    for key in range(N):
        heap[key] = 0
    for _ in range(N):
        _, priority = heap.popitem()
        self.assertEqual(priority, 0)
        self.check_invariants(heap)
def __init__(self, k, epsilon, delta, seed=None):
    """
    Setup a new count-min sketch with parameters k, epsilon, and delta.

    The parameters epsilon and delta control the accuracy of the estimates
    of the sketch. Cormode and Muthukrishnan prove that for an item i with
    count a_i, the estimate from the sketch a_hat_i will satisfy the
    relation

        a_hat_i <= a_i + epsilon * ||a||_1

    with probability at least 1 - delta, where a is the vector of all
    counts and ||x||_1 is the L1 norm of a vector x.

    Parameters
    ----------
    k : int
        A positive integer that sets the number of top items counted
    epsilon : float
        A value in the unit interval that sets the precision of the sketch
    delta : float
        A value in the unit interval that sets the precision of the sketch
    seed : int, optional
        Seed for the random generator that draws the hash-function
        parameters.  Defaults to 1729 so independently built sketches use
        identical hash functions and stay mergeable.

    Examples
    --------
    >>> s = FreqSketch(40, 0.005, 10**-7)

    Raises
    ------
    ValueError
        If k is not a positive integer, or epsilon or delta are not in
        the unit interval.
    """
    # BUG FIX: `seed or 1729` silently discarded a caller-supplied seed
    # of 0; only substitute the default when seed is actually None.
    seed = 1729 if seed is None else seed
    random.seed(seed)
    if k < 1:
        raise ValueError("k must be a positive integer")
    if epsilon <= 0 or epsilon >= 1:
        raise ValueError("epsilon must be between 0 and 1, exclusive")
    if delta <= 0 or delta >= 1:
        raise ValueError("delta must be between 0 and 1, exclusive")
    self.k = k
    # Standard count-min sizing: width = ceil(e / epsilon) and
    # depth = ceil(ln(1 / delta)) yield the (epsilon, delta) guarantee.
    self.width = int(math.ceil(math.exp(1) / epsilon))
    self.depth = int(math.ceil(math.log(1 / delta)))
    self.hash_function_params = [
        _generate_hash_function_params() for _ in range(self.depth)
    ]
    # Flat row-major array of depth * width integer counters.
    self.count = array.array('i', itertools.repeat(0, self.depth * self.width))
    # Heap of the current top-k keys, ordered by estimated count.
    self.heap = HeapDict()
def __init__(self, graph: Graph, start, key=None):
    """
    Create a new DijkstraIterator object.

    :param graph: the graph to iterate over
    :param start: the first node to visit
    :param key: a function of one argument used to extract a comparison
        key to determine which node to visit first in the case of a tie
        (the "smallest" element)
    :raises ValueError: if start is not defined in graph
    """
    super().__init__(graph, start, key=key)
    # Tentative distances: every node begins unreachable (infinity),
    # then the source is overwritten with distance 0.
    worklist = HeapDict()
    for node in graph.nodes():
        worklist[node] = math.inf
    worklist[start] = 0
    self._worklist = worklist
def make_data():
    """Return a populated HeapDict, its (key, value) pairs sorted by value descending, and a mirror dict."""
    heap = HeapDict()
    mirror = {}
    pairs = []
    for _ in range(N):
        key, value = random.random(), random.random()
        pairs.append((key, value))
        heap[key] = value
        mirror[key] = value
    pairs.sort(key=lambda kv: kv[1], reverse=True)
    return heap, pairs, mirror
def __init__(self, k, epsilon, delta, seed=None):
    """
    Setup a new count-min sketch with parameters k, epsilon, and delta.

    The parameters epsilon and delta control the accuracy of the estimates
    of the sketch. Cormode and Muthukrishnan prove that for an item i with
    count a_i, the estimate from the sketch a_hat_i will satisfy the
    relation

        a_hat_i <= a_i + epsilon * ||a||_1

    with probability at least 1 - delta, where a is the vector of all
    counts and ||x||_1 is the L1 norm of a vector x.

    Parameters
    ----------
    k : int
        A positive integer that sets the number of top items counted
    epsilon : float
        A value in the unit interval that sets the precision of the sketch
    delta : float
        A value in the unit interval that sets the precision of the sketch
    seed : int, optional
        Seed for the random generator that draws the hash-function
        parameters.  Defaults to 1729 so independently built sketches use
        identical hash functions and stay mergeable.

    Examples
    --------
    >>> s = FreqSketch(40, 0.005, 10**-7)

    Raises
    ------
    ValueError
        If k is not a positive integer, or epsilon or delta are not in
        the unit interval.
    """
    # BUG FIX: `seed or 1729` silently discarded a caller-supplied seed
    # of 0; only substitute the default when seed is actually None.
    seed = 1729 if seed is None else seed
    random.seed(seed)
    if k < 1:
        raise ValueError("k must be a positive integer")
    if epsilon <= 0 or epsilon >= 1:
        raise ValueError("epsilon must be between 0 and 1, exclusive")
    if delta <= 0 or delta >= 1:
        raise ValueError("delta must be between 0 and 1, exclusive")
    self.k = k
    # Standard count-min sizing: width = ceil(e / epsilon) and
    # depth = ceil(ln(1 / delta)) yield the (epsilon, delta) guarantee.
    self.width = int(math.ceil(math.exp(1) / epsilon))
    self.depth = int(math.ceil(math.log(1 / delta)))
    self.hash_function_params = [
        _generate_hash_function_params() for _ in range(self.depth)
    ]
    # Flat row-major array of depth * width integer counters.
    self.count = array.array('i', itertools.repeat(0, self.depth * self.width))
    # Heap of the current top-k keys, ordered by estimated count.
    self.heap = HeapDict()
class DijkstraIterator(WeightedGraphIterator):
    """
    Iterate over the nodes of a graph based on their distance from the
    given start node using Dijkstra's shortest path algorithm.
    """

    def __init__(self, graph: Graph, start, key=None):
        """
        Create a new DijkstraIterator object.

        :param graph: the graph to iterate over
        :param start: the first node to visit
        :param key: a function of one argument used to extract a comparison
            key to determine which node to visit first in the case of a tie
            (the "smallest" element)
        :raises ValueError: if start is not defined in graph
        """
        super().__init__(graph, start, key=key)
        # Worklist of unvisited nodes keyed by tentative distance:
        # every node starts at infinity, then the source is set to 0.
        self._worklist = HeapDict()
        for u in graph.nodes():
            self._worklist[u] = math.inf
        self._worklist[start] = 0

    # NOTE(review): the original annotation was `Optional[Node]`, but the
    # method always returns a (node, distance) 2-tuple — including the
    # empty-worklist case, which yields (None, math.inf).  String
    # annotation used so no extra typing import is required at runtime.
    def _visit_next(self) -> 'Tuple[Optional[Node], float]':
        """
        Pop the unvisited node with the smallest tentative distance and
        relax the edges to its still-unvisited neighbors.

        :return: a ``(node, distance)`` pair, or ``(None, math.inf)`` once
            the worklist is exhausted
        """
        try:
            # popitem() yields the entry with the minimum value
            # (smallest tentative distance).
            (u, d_u) = self._worklist.popitem()
        except KeyError:
            # Worklist is empty: every reachable node has been visited.
            return None, math.inf
        # Sorting neighbors by the tie-breaking key fixes the relaxation
        # order; it does not change the resulting shortest distances.
        neighbors = list(self._graph.neighbors(u))
        neighbors.sort(key=self._key)
        for v in neighbors:
            if v in self._worklist:  # only relax nodes not yet visited
                d_v = self._worklist[v]
                l_uv = self._graph.weight(u, v)
                if d_v > d_u + l_uv:
                    # Found a shorter path to v through u.
                    self._worklist[v] = d_u + l_uv
        return u, d_u
class FreqSketch(object):
    """Count-min sketch that also tracks the top-k most frequent items.

    The sketch proper is a depth x width array of counters; a HeapDict of
    at most k entries keeps the keys with the largest estimated counts so
    they can be reported without scanning the whole key space.
    """

    def __init__(self, k, epsilon, delta, seed=None):
        """
        Setup a new count-min sketch with parameters k, epsilon, and delta.

        The parameters epsilon and delta control the accuracy of the
        estimates of the sketch. Cormode and Muthukrishnan prove that for
        an item i with count a_i, the estimate from the sketch a_hat_i
        will satisfy the relation

            a_hat_i <= a_i + epsilon * ||a||_1

        with probability at least 1 - delta, where a is the vector of all
        counts and ||x||_1 is the L1 norm of a vector x.

        Parameters
        ----------
        k : int
            A positive integer that sets the number of top items counted
        epsilon : float
            A value in the unit interval that sets the precision of the
            sketch
        delta : float
            A value in the unit interval that sets the precision of the
            sketch
        seed : int, optional
            Seed for the random generator that draws the hash-function
            parameters.  Defaults to 1729 so independently built sketches
            use identical hash functions and stay mergeable.

        Examples
        --------
        >>> s = FreqSketch(40, 0.005, 10**-7)

        Raises
        ------
        ValueError
            If k is not a positive integer, or epsilon or delta are not
            in the unit interval.
        """
        # BUG FIX: `seed or 1729` silently discarded a caller-supplied
        # seed of 0; only substitute the default when seed is None.
        seed = 1729 if seed is None else seed
        random.seed(seed)
        if k < 1:
            raise ValueError("k must be a positive integer")
        if epsilon <= 0 or epsilon >= 1:
            raise ValueError("epsilon must be between 0 and 1, exclusive")
        if delta <= 0 or delta >= 1:
            raise ValueError("delta must be between 0 and 1, exclusive")
        self.k = k
        # Standard count-min sizing: width = ceil(e / epsilon) and
        # depth = ceil(ln(1 / delta)) yield the (epsilon, delta) guarantee.
        self.width = int(math.ceil(math.exp(1) / epsilon))
        self.depth = int(math.ceil(math.log(1 / delta)))
        self.hash_function_params = [
            _generate_hash_function_params() for _ in range(self.depth)
        ]
        # Flat row-major array of depth * width integer counters.
        self.count = array.array('i', itertools.repeat(0, self.depth * self.width))
        # Maps key -> [estimate, key]; HeapDict orders entries by value,
        # so the smallest estimate is always on top for cheap eviction.
        self.heap = HeapDict()

    def hash_index(self, row, column):
        """Return the flat-array offset of counter (row, column)."""
        return self.width * row + column

    def _check_compatibility(self, other):
        """Check if another FreqSketch is compatible with this one for merge.

        Compatibility requires same width, depth, and hash_functions.

        Raises
        ------
        ValueError
            If the sketches differ in dimensions or hash parameters.
        """
        if self.width != other.width or self.depth != other.depth:
            raise ValueError("FreqSketch dimensions do not match.")
        if self.hash_function_params != other.hash_function_params:
            raise ValueError("FreqSketch hashes do not match")

    def increment(self, key):
        """
        Increments the sketch for the item with name of key.

        Parameters
        ----------
        key : string
            The item to update the value of in the sketch

        Examples
        --------
        >>> s = FreqSketch(40, 0.005, 10**-7)
        >>> s.increment('http://www.cnn.com/')
        """
        self.update(key, 1)

    def _hash_function(self, x, params):
        """Hash the integer x into [0, width) using the (a, b) pair in params."""
        a, b = params
        return (a * x + b) % BIG_PRIME % self.width

    def _update_sketch(self, key, increment):
        """Add increment to key's counter in every row of the sketch."""
        for row, hash_function_params in enumerate(self.hash_function_params):
            column = self._hash_function(abs(hash(key)), hash_function_params)
            self.count[self.hash_index(row, column)] += increment

    def update(self, key, increment):
        """
        Updates the sketch for the item with name of key by the amount
        specified in increment.

        Parameters
        ----------
        key : string
            The item to update the value of in the sketch
        increment : integer
            The amount to update the sketch by for the given key

        Examples
        --------
        >>> s = FreqSketch(40, 0.005, 10**-7)
        >>> s.update('http://www.cnn.com/', 1)
        """
        self._update_sketch(key, increment)
        self.update_heap(key)

    def update_heap(self, key):
        """
        Updates the class's heap that keeps track of the top k items for a
        given key.

        For the given key, it either adds the key or updates its estimate,
        if its current estimate is larger than the smallest element in the
        heap (or if the heap is not already full).

        Parameters
        ----------
        key : string
            The item to check against the heap
        """
        estimate = self.get(key)
        # smallest element is found by peekitem()
        if len(self.heap) < self.k or estimate >= self.heap.peekitem()[1][0]:
            self.heap[key] = [estimate, key]
            # Inserting a new key may push the heap past k entries;
            # evict the current minimum to restore the bound.
            if len(self.heap) > self.k:
                self.heap.popitem()

    def get(self, key):
        """
        Fetches the sketch estimate for the given key.

        Parameters
        ----------
        key : string
            The item to produce an estimate for

        Returns
        -------
        estimate : int
            The best estimate of the count for the given key based on the
            sketch (the minimum counter across all rows)

        Examples
        --------
        >>> s = FreqSketch(40, 0.005, 10**-7)
        >>> s.update('http://www.cnn.com/', 1)
        >>> s.get('http://www.cnn.com/')
        1
        """
        # BUG FIX: sys.maxint does not exist in Python 3 (AttributeError);
        # sys.maxsize is the portable seed for the running minimum.
        value = sys.maxsize
        for row, hash_function_params in enumerate(self.hash_function_params):
            column = self._hash_function(abs(hash(key)), hash_function_params)
            value = min(self.count[self.hash_index(row, column)], value)
        return value

    def frequent_items(self):
        """
        Returns the most frequent items.  These are the frequent items
        from the heap, mapped to their current sketch estimates.
        """
        return {key: self.get(key) for key in self.heap}

    def iterate_values(self, value_iterator):
        """Makes FreqSketch usable with PySpark mapPartitions().

        An RDD's mapPartitions method takes a function that consumes an
        iterator of records and spits out an iterable for the next RDD
        downstream.

        Parameters
        ----------
        value_iterator : iterator
            Produces the values whose frequency is to be counted.
        """
        for value in value_iterator:
            self.increment(value)
        # Yield the populated sketch itself as the single partition result.
        yield self

    @staticmethod
    def initial_accumulator_value():
        """ Initial value used with aggregate function. """
        return dict()

    @staticmethod
    def merge_accumulator_value(acc, value):
        """
        Add an accumulator and a value, for use with aggregate.

        Parameters
        ----------
        acc : dict
            An accumulator of frequent values.
        value : FreqSketch
            Contains a set of frequent values to merge with acc.

        Returns
        -------
        out : dict
            An accumulator of frequency values.
        """
        return FreqSketch.merge_accumulators(acc, value.frequent_items())

    @staticmethod
    def merge_accumulators(acc1, acc2):
        """
        Merge two accumulators, for use with aggregate.

        Parameters
        ----------
        acc1 : dict
            One set of frequent values to merge
        acc2 : dict
            The other set of frequent values.

        Returns
        -------
        out : dict
            An accumulator of frequency values.  The result is one
            accumulator with all values in both accumulators.  Where there
            are results in each accumulator, they are summed.

        Notes
        -----
        If the dictionaries contain keys of float('nan') then this will
        not work.  To begin with, dictionaries treat different instances
        of float('nan') as distinct so there may be many keys that look
        alike.  Even if you use the singleton np.nan, spark serialization
        does not seem to preserve this property.  It is recommended that
        the caller transform NaN into None before doing frequency counts
        to work around this limitation.
        """
        ans = dict(acc1)
        for key in acc2:
            ans[key] = ans[key] + acc2[key] if key in ans else acc2[key]
        return ans
def test_peekitem_empty(self):
    """peekitem() on an empty HeapDict must raise KeyError."""
    empty = HeapDict()
    with self.assertRaises(KeyError):
        empty.peekitem()
class FreqSketch(object):
    """Count-min sketch that also tracks the top-k most frequent items.

    The sketch proper is a depth x width array of counters; a HeapDict of
    at most k entries keeps the keys with the largest estimated counts so
    they can be reported without scanning the whole key space.
    """

    def __init__(self, k, epsilon, delta, seed=None):
        """
        Setup a new count-min sketch with parameters k, epsilon, and delta.

        The parameters epsilon and delta control the accuracy of the
        estimates of the sketch. Cormode and Muthukrishnan prove that for
        an item i with count a_i, the estimate from the sketch a_hat_i
        will satisfy the relation

            a_hat_i <= a_i + epsilon * ||a||_1

        with probability at least 1 - delta, where a is the vector of all
        counts and ||x||_1 is the L1 norm of a vector x.

        Parameters
        ----------
        k : int
            A positive integer that sets the number of top items counted
        epsilon : float
            A value in the unit interval that sets the precision of the
            sketch
        delta : float
            A value in the unit interval that sets the precision of the
            sketch
        seed : int, optional
            Seed for the random generator that draws the hash-function
            parameters.  Defaults to 1729 so independently built sketches
            use identical hash functions and stay mergeable.

        Examples
        --------
        >>> s = FreqSketch(40, 0.005, 10**-7)

        Raises
        ------
        ValueError
            If k is not a positive integer, or epsilon or delta are not
            in the unit interval.
        """
        # BUG FIX: `seed or 1729` silently discarded a caller-supplied
        # seed of 0; only substitute the default when seed is None.
        seed = 1729 if seed is None else seed
        random.seed(seed)
        if k < 1:
            raise ValueError("k must be a positive integer")
        if epsilon <= 0 or epsilon >= 1:
            raise ValueError("epsilon must be between 0 and 1, exclusive")
        if delta <= 0 or delta >= 1:
            raise ValueError("delta must be between 0 and 1, exclusive")
        self.k = k
        # Standard count-min sizing: width = ceil(e / epsilon) and
        # depth = ceil(ln(1 / delta)) yield the (epsilon, delta) guarantee.
        self.width = int(math.ceil(math.exp(1) / epsilon))
        self.depth = int(math.ceil(math.log(1 / delta)))
        self.hash_function_params = [
            _generate_hash_function_params() for _ in range(self.depth)
        ]
        # Flat row-major array of depth * width integer counters.
        self.count = array.array('i', itertools.repeat(0, self.depth * self.width))
        # Maps key -> [estimate, key]; HeapDict orders entries by value,
        # so the smallest estimate is always on top for cheap eviction.
        self.heap = HeapDict()

    def hash_index(self, row, column):
        """Return the flat-array offset of counter (row, column)."""
        return self.width * row + column

    def _check_compatibility(self, other):
        """Check if another FreqSketch is compatible with this one for merge.

        Compatibility requires same width, depth, and hash_functions.

        Raises
        ------
        ValueError
            If the sketches differ in dimensions or hash parameters.
        """
        if self.width != other.width or self.depth != other.depth:
            raise ValueError("FreqSketch dimensions do not match.")
        if self.hash_function_params != other.hash_function_params:
            raise ValueError("FreqSketch hashes do not match")

    def increment(self, key):
        """
        Increments the sketch for the item with name of key.

        Parameters
        ----------
        key : string
            The item to update the value of in the sketch

        Examples
        --------
        >>> s = FreqSketch(40, 0.005, 10**-7)
        >>> s.increment('http://www.cnn.com/')
        """
        self.update(key, 1)

    def _hash_function(self, x, params):
        """Hash the integer x into [0, width) using the (a, b) pair in params."""
        a, b = params
        return (a * x + b) % BIG_PRIME % self.width

    def _update_sketch(self, key, increment):
        """Add increment to key's counter in every row of the sketch."""
        for row, hash_function_params in enumerate(self.hash_function_params):
            column = self._hash_function(abs(hash(key)), hash_function_params)
            self.count[self.hash_index(row, column)] += increment

    def update(self, key, increment):
        """
        Updates the sketch for the item with name of key by the amount
        specified in increment.

        Parameters
        ----------
        key : string
            The item to update the value of in the sketch
        increment : integer
            The amount to update the sketch by for the given key

        Examples
        --------
        >>> s = FreqSketch(40, 0.005, 10**-7)
        >>> s.update('http://www.cnn.com/', 1)
        """
        self._update_sketch(key, increment)
        self.update_heap(key)

    def update_heap(self, key):
        """
        Updates the class's heap that keeps track of the top k items for a
        given key.

        For the given key, it either adds the key or updates its estimate,
        if its current estimate is larger than the smallest element in the
        heap (or if the heap is not already full).

        Parameters
        ----------
        key : string
            The item to check against the heap
        """
        estimate = self.get(key)
        # smallest element is found by peekitem()
        if len(self.heap) < self.k or estimate >= self.heap.peekitem()[1][0]:
            self.heap[key] = [estimate, key]
            # Inserting a new key may push the heap past k entries;
            # evict the current minimum to restore the bound.
            if len(self.heap) > self.k:
                self.heap.popitem()

    def get(self, key):
        """
        Fetches the sketch estimate for the given key.

        Parameters
        ----------
        key : string
            The item to produce an estimate for

        Returns
        -------
        estimate : int
            The best estimate of the count for the given key based on the
            sketch (the minimum counter across all rows)

        Examples
        --------
        >>> s = FreqSketch(40, 0.005, 10**-7)
        >>> s.update('http://www.cnn.com/', 1)
        >>> s.get('http://www.cnn.com/')
        1
        """
        # BUG FIX: sys.maxint does not exist in Python 3 (AttributeError);
        # sys.maxsize is the portable seed for the running minimum.
        value = sys.maxsize
        for row, hash_function_params in enumerate(self.hash_function_params):
            column = self._hash_function(abs(hash(key)), hash_function_params)
            value = min(self.count[self.hash_index(row, column)], value)
        return value

    def frequent_items(self):
        """
        Returns the most frequent items.  These are the frequent items
        from the heap, mapped to their current sketch estimates.
        """
        return {key: self.get(key) for key in self.heap}

    def iterate_values(self, value_iterator):
        """Makes FreqSketch usable with PySpark mapPartitions().

        An RDD's mapPartitions method takes a function that consumes an
        iterator of records and spits out an iterable for the next RDD
        downstream.

        Parameters
        ----------
        value_iterator : iterator
            Produces the values whose frequency is to be counted.
        """
        for value in value_iterator:
            self.increment(value)
        # Yield the populated sketch itself as the single partition result.
        yield self

    @staticmethod
    def initial_accumulator_value():
        """ Initial value used with aggregate function. """
        return dict()

    @staticmethod
    def merge_accumulator_value(acc, value):
        """
        Add an accumulator and a value, for use with aggregate.

        Parameters
        ----------
        acc : dict
            An accumulator of frequent values.
        value : FreqSketch
            Contains a set of frequent values to merge with acc.

        Returns
        -------
        out : dict
            An accumulator of frequency values.
        """
        return FreqSketch.merge_accumulators(acc, value.frequent_items())

    @staticmethod
    def merge_accumulators(acc1, acc2):
        """
        Merge two accumulators, for use with aggregate.

        Parameters
        ----------
        acc1 : dict
            One set of frequent values to merge
        acc2 : dict
            The other set of frequent values.

        Returns
        -------
        out : dict
            An accumulator of frequency values.  The result is one
            accumulator with all values in both accumulators.  Where there
            are results in each accumulator, they are summed.

        Notes
        -----
        If the dictionaries contain keys of float('nan') then this will
        not work.  To begin with, dictionaries treat different instances
        of float('nan') as distinct so there may be many keys that look
        alike.  Even if you use the singleton np.nan, spark serialization
        does not seem to preserve this property.  It is recommended that
        the caller transform NaN into None before doing frequency counts
        to work around this limitation.
        """
        ans = dict(acc1)
        for key in acc2:
            ans[key] = ans[key] + acc2[key] if key in ans else acc2[key]
        return ans