def update_statistics():
    """Rebuild a count-min sketch from ``H`` and print the 20 largest flows.

    Reads the module-level ``N``, ``H`` and ``distinctFlows``.  Every key of
    ``H`` is added to a fresh sketch with the sum of the first three entries
    of its value; the estimates for all flows in ``distinctFlows`` and ``H``
    are then ranked and the top 20 are printed, followed by a total count.
    """
    global N, H

    sketch_depth = 10
    sketch_width = 40000
    hashers = [hash_function(row) for row in range(sketch_depth)]
    sketch = CountMinSketch(sketch_depth, sketch_width, hashers, M=N)

    for fp_key in H:
        counters = H[fp_key]
        # Same summation order as before: entry 1, then entry 2, then entry 0.
        sketch.add(fp_key, counters[1] + counters[2] + counters[0])

    # Presumably clears the terminal before the table is redrawn.
    system('clear')

    # A flow that appears in both collections is listed twice, once per source.
    estimates = [(flow, sketch.query(flow)) for flow in distinctFlows]
    estimates += [(flow, sketch.query(flow)) for flow in H]

    ranked = sorted(estimates, key=lambda pair: pair[1], reverse=True)
    for entry in ranked[:20]:
        # Single-argument print: identical output as a statement or a call.
        print(entry)

    print("Total flows:" + str(len(distinctFlows) + len(H)))
def test_simple_usage(self):
    # Add the same key repeatedly, then check the per-key estimate, an
    # unseen key, and the overall length of the sketch.
    expected_total = 1000
    cms = CountMinSketch(10, 5)

    for _ in xrange(expected_total):
        cms.add("a")

    self.assertEqual(cms.query("a"), expected_total)
    self.assertEqual(cms.query("b"), 0)
    self.assertEqual(len(cms), expected_total)
class History(object):
    """Time-aggregated item counts kept in a family of count-min sketches.

    The structure holds:

    * ``present`` -- counts for the time unit currently in progress;
    * ``cm_sketch_list`` -- ``n`` sketches at doubling time resolutions;
    * ``aggregate_score`` -- a running sketch A that is updated in sync with
      the list, so that ``query`` does not have to sum over every
      resolution.
    """

    def __init__(self, n, m, d):
        """Create an empty history.

        Args:
            n: number of CM sketches kept in ``cm_sketch_list``.
            m: size of the array for each hash function.
            d: number of hash functions.
        """
        # time counter (updated once for each new time unit)
        self.t = 0
        self.n = n
        self.m = m
        self.d = d

        # present is a count-min sketch containing sub-unit time counts.
        self.present = CMSketch(m, d)

        # ready tells query() whether the aggregate score alone can be used
        # (True) or whether the present has to be added to it (False).
        self.ready = False

        # CM sketch tracking the aggregate weighted score
        #   A = sum{j = 1 to log T} (M^j / 2^j)
        # (the present is added separately); initialized to zero.
        self.aggregate_score = CMSketch(m, d)

        # n count-min sketches; we retain resolutions 1, 2, 4, ..., 2^n.
        self.cm_sketch_list = [CMSketch(m, d) for _ in range(n)]

    def update_present_only(self, datum):
        """Count one occurrence of ``datum`` in the present sketch only.

        This is a sub-unit update: the time counter and the aggregated
        sketches are not touched, and ``ready`` is cleared so that ``query``
        adds the present to the aggregate score.
        """
        self.ready = False
        self.present.add(datum, 1)

    @staticmethod
    def _find_l(t):
        """Return the largest ``l`` such that ``2 ** l`` divides ``t``.

        Returns 0 for ``t == 0``.  Takes at most log t iterations.
        """
        l = 0
        if t == 0:
            return l
        while t % 2 == 0:
            l += 1
            # Floor division keeps t an exact integer; true division would
            # turn it into a float and lose precision for very large t.
            t //= 2
        return l

    # This structure maintains n CM sketches, M0, M1, ..., Mn:
    #   M0 always holds [t - 1, t] where t is the current time
    #   M1 always holds [t - tmod2 - 2, t - tmod2]
    #   ...
    #   Mn always holds [t - tmod(2^n) - 2^n, t - tmod(2^n)]
    # For t = 8, for example:
    #   M0: [7, 8]
    #   M1: [6, 8]
    #   M2: [4, 8]
    #   M3: [0, 8]
    #   rest: 0
    def aggregate_unit(self, data_block):
        """Fold one complete time unit of data into the history.

        Implements algorithm 2 from the paper.

        Args:
            data_block: iterable holding the data that arrived in a single
                time unit; every appearance of an item counts as 1.
        """
        # update time once per unit
        self.t += 1

        # M_bar: the data_block converted into a CM sketch
        accumulator = CMSketch(self.m, self.d)

        # The data_block is the present, so the present is rebuilt from it
        # while the accumulator is filled.
        self.present = CMSketch(self.m, self.d)
        for data in data_block:
            accumulator.add(data, 1)
            self.present.add(data, 1)
            self.ready = False

        # Update sketches 0 .. l, where l is the largest i with
        # t mod 2^i == 0, capped at the number of sketches we keep.
        for i in range(min(self._find_l(self.t) + 1, self.n)):
            # M_bar - M^i
            difference = sketch_sum(
                accumulator,
                sketch_scalar_product(self.cm_sketch_list[i], -1))
            # A = A + (1/2)^i * (M_bar - M^i)
            self.aggregate_score = sketch_sum(
                self.aggregate_score,
                sketch_scalar_product(difference, pow(0.5, i)))
            # Keep the current accumulator for slot i, then fold the old
            # slot contents into the accumulator for the next round.
            snapshot = deepcopy(accumulator)
            accumulator = sketch_sum(accumulator, self.cm_sketch_list[i])
            self.cm_sketch_list[i] = snapshot

        # now we're ready to use the aggregated CM-sketch values
        self.ready = True
        # reset the present now that we're done with one time block
        self.present = CMSketch(self.m, self.d)

    def query_slow(self, x):
        """Return the score of ``x`` by summing over every resolution.

        The score is the present count plus, for each sketch ``i`` in
        ``cm_sketch_list``, its count for ``x`` weighted by ``0.5 ** i``.
        """
        return self.present.query(x) + sum(
            pow(0.5, i) * self.cm_sketch_list[i].query(x)
            for i in range(self.n))

    def query(self, x):
        """Return the score of ``x`` using the aggregate-score sketch.

        When ``ready`` is set only the aggregate score is used; otherwise
        the present count is added to it.
        """
        if self.ready:
            return self.aggregate_score.query(x)
        else:
            # only if we're not ready
            return self.present.query(x) + self.aggregate_score.query(x)
def test_zero_at_start(self):
    # A freshly built sketch must report zero for keys of many kinds.
    cms = CountMinSketch(10, 5)
    probes = (0, 1, -1, tuple, tuple(), "", "yeah", object())
    for probe in probes:
        self.assertEqual(cms.query(probe), 0)
def test_add_greater_than_one(self):
    # A single add() with an explicit count is reflected by query().
    amount = 123
    cms = CountMinSketch(10, 5)
    cms.add("a", amount)
    self.assertEqual(cms.query("a"), amount)
def test_syntax_sugar(self):
    # Subscript access must agree with query(), before and after an add().
    cms = CountMinSketch(10, 5)
    key = "a"

    self.assertEqual(cms.query(key), cms[key])

    cms.add(key)
    self.assertEqual(cms.query(key), cms[key])