def _make_sketch(kmer_counts_dict: defaultdict) -> CountMinSketch:
    # Read the dictionary into a compressed data structure to allow deleting kmer_counts_dict
    NUM_ROWS = 8
    kmer_counts = CountMinSketch(NUM_ROWS)
    for kmer, count in kmer_counts_dict.items():
        kmer_counts.update(kmer, count)
    return kmer_counts
def runner(input_file_name, config):
    # read the list of integers
    # with open(input_file_name) as f:
    #     lst = list(map(lambda l: int(l.strip()), f.readlines()))
    sketch = CountMinSketch(config["m"], config["d"])
    lst = AxProf.zipfGenerator(config["n"], config["skew"], int(time.time() * 1000) % 2**32)
    # measure the running time
    startTime = time.time()
    for num in lst:
        sketch.add(num)
    endTime = time.time()
    actual_map = actual_count(lst)
    error_map = {}
    for num in set(lst):
        error_map[num] = abs(actual_map[num] - sketch[num])
    return {
        "input": lst,
        "acc": error_map,
        "time": config["m"] * config["d"],  # endTime - startTime,
        "space": 0,
    }
def sketch_scalar_product(M, c):
    result_sketch = CMSketch(M.m, M.d)
    for i in range(M.d):
        for j in range(M.m):
            new_val = c * M.val_at(i, j)
            result_sketch.update(i, j, new_val)
    return result_sketch
def runner(input_file_name, config):
    # read the list of integers
    with open(input_file_name) as f:
        lst = list(map(lambda l: int(l.strip()), f.readlines()))
    m = math.ceil(math.e / config["eps"])
    d = math.ceil(math.log(1 / config["delta"]))
    # print(m, d)
    sketch = CountMinSketch(m, d)
    # measure the running time
    startTime = time.time()
    for num in lst:
        sketch.add(num)
    endTime = time.time()
    result = {}
    for num in lst:
        result[num] = sketch[num]
    return {
        "acc": result,
        "time": endTime - startTime,
        "space": 0,
    }
def __init__(self, delta, epsilon):
    self.m = int(math.ceil(math.exp(1) / epsilon))
    self.d = int(math.ceil(math.log(1 / delta)))
    CountMinSketch.__init__(self, self.m, self.d)
    self.bitarray = np.zeros((self.nbr_slices, self.bits_per_slice), dtype=np.int32)
    self.make_hashes = generate_hashfunctions(self.bits_per_slice, self.nbr_slices)
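A quick, hand-checked example of the width/depth formulas used in this constructor (a sketch, not from the original code): with m = ceil(e/epsilon) counters per row and d = ceil(ln(1/delta)) hash rows, a Count-Min sketch overestimates any count by at most epsilon*N with probability at least 1 - delta, where N is the total of all added counts.

import math

epsilon, delta = 0.01, 0.05
m = int(math.ceil(math.e / epsilon))     # 272 counters per hash row
d = int(math.ceil(math.log(1 / delta)))  # 3 hash rows
assert (m, d) == (272, 3)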
def test_counts_overestimate(self):
    text = open(__file__).read()
    counter = Counter(text)
    sketch = CountMinSketch(10, 5)
    for x in text:
        sketch.add(x)
    for x in set(text):
        self.assertGreaterEqual(sketch[x], counter[x])
def test_simple_usage(self):
    N = 1000
    sketch = CountMinSketch(10, 5)
    for _ in xrange(N):
        sketch.add("a")
    self.assertEqual(sketch.query("a"), N)
    self.assertEqual(sketch.query("b"), 0)
    self.assertEqual(len(sketch), N)
def simulate_sfcmon(df, num_chunks):
    print("#################### PROCESSING CHUNKS #########################")
    num_packets = df.shape[0]
    print("# Number of packets = {}".format(num_packets))
    chunks = split(df, num_packets / num_chunks)
    results = {"packets": set(), "size": set()}
    for c in chunks:
        print("# Chunk Data: {}; {}".format(c.shape, c.index))
        sketch_packets = CountMinSketch(5436, 5)  # table size=5436, hash functions=5
        sketch_size = CountMinSketch(5436, 5)  # table size=5436, hash functions=5
        count_packets = 0
        count_size = 0
        for row in zip(c["SrcIP"], c["Size"]):
            flow_id = row[0]
            sketch_packets.add(flow_id)
            sketch_size.add(flow_id, value=row[1])
            count_packets += 1
            count_size += row[1]
            if count_packets > num_rows_to_start:
                hh_threshold_packets = count_packets * bound
                hh_threshold_size = count_size * bound
                if sketch_packets[flow_id] > hh_threshold_packets:
                    results["packets"].add(flow_id)
                if sketch_size[flow_id] > hh_threshold_size:
                    results["size"].add(flow_id)
    return results
def show_statistics():
    while True:
        H1 = copy.deepcopy(H)
        distinctFlows1 = copy.deepcopy(distinctFlows)
        N1 = copy.deepcopy(N)
        depth = 10
        width = 40000
        hash_functions = [hash_function(i) for i in range(depth)]
        sketch1 = CountMinSketch(depth, width, hash_functions, M=N1)
        for fp_key in H1:
            ef = H1[fp_key][0]
            rf = H1[fp_key][1]
            df = H1[fp_key][2]
            sketch1.add(fp_key, rf + df + ef)
        time.sleep(1)
        sketch = CountMinSketch(depth, width, hash_functions, M=N)
        for fp_key in H:
            ef = H[fp_key][0]
            rf = H[fp_key][1]
            df = H[fp_key][2]
            sketch.add(fp_key, rf + df + ef)
        top_flows = get_top_flows(sketch, sketch1)
        system('clear')
        print "          flow rate"
        for flow in top_flows:
            print "#" + str(flow[0]) + " :: " + str(flow[1]) + "b/s"
def sketch_sum(M1, M2):
    if M1.m != M2.m:
        print "Sketches don't align on hashtable length.\n"
        return
    elif M1.d != M2.d:
        print "Sketches don't align on # of hashtables.\n"
        return
    else:
        result_sketch = CMSketch(M2.m, M2.d)
        for i in range(M2.d):
            for j in range(M2.m):
                new_val = M1.val_at(i, j) + M2.val_at(i, j)
                result_sketch.update(i, j, new_val)
        return result_sketch
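A hypothetical check of the linearity these helpers rely on (assumes the same CMSketch class with add/query used elsewhere in this code): summing two sketches entrywise gives a sketch whose point queries bound the counts of the concatenated streams.

a = CMSketch(1000, 5)
b = CMSketch(1000, 5)
for x in ["u", "u", "v"]:
    a.add(x, 1)
for x in ["v", "v"]:
    b.add(x, 1)
merged = sketch_sum(a, b)
assert merged.query("u") >= 2  # both occurrences came from a
assert merged.query("v") >= 3  # 1 from a, 2 from b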
def test_compute_width_depth():
    delta = 0.1
    epsilon = 0.1
    depth, width = CountMinSketch.compute_depth_width(delta, epsilon)
    assert width == 2
    assert depth == 20
def plot_error(lst, m, d, color=None, label=None):
    sketch = CountMinSketch(m, d)
    actual_map = actual_count(lst)
    for i in lst:
        sketch.add(i)
    errors = []
    unique = set(lst)
    for i in unique:
        actual = actual_map[i]
        error = abs(actual - sketch[i])
        errors.append(error)
    print(len(errors))
    plot.hist(errors, bins=len(errors) // 100, color=color, label=label)
    plot.xlim(0, int(len(lst) * 0.1))
def simulate_rtp4mon(df, bound, pkts_to_start):
    print("# " + str(datetime.datetime.now()) + " - Begin simulation RTP4Mon...")
    results = {"packets": set(), "5t_packets": set()}
    sketch_packets = CountMinSketch(5436, 5)  # table size=5436, hash functions=5
    count_packets = 0
    for row in zip(df["SrcIP"], df["DstIP"], df["SrcPort"], df["DstPort"], df["Proto"]):
        flow_id = row[0]
        five_flow_id = row
        sketch_packets.add(flow_id)
        count_packets += 1
        if count_packets > pkts_to_start:
            hh_threshold_packets = count_packets * bound
            if sketch_packets[flow_id] > hh_threshold_packets:
                results["packets"].add(flow_id)
                results["5t_packets"].add(five_flow_id)
    print("# " + str(datetime.datetime.now()) + " - End simulation RTP4Mon...")
    return results
def update_statistics():
    global N, H
    depth = 10
    width = 40000
    hash_functions = [hash_function(i) for i in range(depth)]
    sketch = CountMinSketch(depth, width, hash_functions, M=N)
    for fp_key in H:
        ef = H[fp_key][0]
        rf = H[fp_key][1]
        df = H[fp_key][2]
        sketch.add(fp_key, rf + df + ef)
    system('clear')
    flows_to_display = []
    for flow in distinctFlows:
        flows_to_display.append((flow, sketch.query(flow)))
    for flow in H.keys():
        flows_to_display.append((flow, sketch.query(flow)))
    top_flows = sorted(flows_to_display, key=lambda x: x[1], reverse=True)[0:20]
    for flow in top_flows:
        print flow
    print "Total flows:" + str(len(distinctFlows) + len(H.keys()))
def _make_sketch(self, kmer_counts_dict: defaultdict) -> CountMinSketch:
    if self.print_runtime:
        print("\n>--- STARTING TO MAKE COUNTMIN SKETCH AT T = {:.2f} ---".format(
            time.time() - self.start_time))
    # Read the dictionary into a compressed data structure
    NUM_ROWS = 10
    kmer_counts = CountMinSketch(NUM_ROWS)
    for i, (kmer, count) in enumerate(kmer_counts_dict.items()):
        if self.print_runtime and i % 50000 == 0:
            print(">Processed {0} kmers by time T={1:.2f}".format(
                i, time.time() - self.start_time))
        kmer_counts.update(kmer, count)
    if self.print_runtime:
        print(">FINISHED MAKING COUNTMIN SKETCH AT T = {:.2f}".format(
            time.time() - self.start_time))
    if self.print_syssizeof:
        print(">SIZE OF COUNTMIN SKETCH: {:,}".format(sys.getsizeof(kmer_counts)))
    return kmer_counts
def test_increment_and_estimate():
    word1 = 'hello'
    word2 = 'world'
    word3 = 'other'
    countmin = CountMinSketch(0.1, 0.1)
    countmin.increment(word1)
    countmin.increment(word2)
    countmin.increment(word2)
    assert countmin.estimate(word3) == 0
    assert countmin.estimate(word1) == 1
    assert countmin.estimate(word2) == 2
    top_10_list = [('hello', 1.0), ('world', 2.0)]
    assert set(countmin.top_10_dict.items()) == set(top_10_list)
def worker(index, path):
    """
    :param index: the index of the dump this worker should work on.
    :return:
    """
    global counter
    print "Process %d start processing" % index
    with open("%s/wiki_0%s" % (path, index), "r") as f:
        batch = Counter()
        batch_limit = 10000
        sketch = CountMinSketch(DEPTH, WIDTH, HASH_FUNCTIONS)
        current = datetime.now().date()
        for line in f:
            # Extract timestamp from header
            if line[:4] == "<doc":
                m = TIMESTEMP_RE.search(line)
                if m:
                    current = datetime.strptime(m.group(1), "%Y-%m-%dT%H:%M:%SZ").date()
                continue
            elif line[:6] == "</doc>":
                continue
            else:
                for pair in map(lambda word: (current, word.lower()), WORD_RE.findall(line)):
                    batch[pair] += 1
                if len(batch) > batch_limit:
                    for key, count in batch.iteritems():
                        sketch.add(key, count)
                    batch.clear()
                counter.value += 1
                if counter.value % 10000 == 0:
                    print "Processed %s lines" % counter.value
        for key, count in batch.iteritems():
            sketch.add(key, count)
        batch.clear()
    print "Process %d finished" % index
    return sketch.get_matrix()
def test_syntax_sugar(self):
    sketch = CountMinSketch(10, 5)
    self.assertEqual(sketch.query("a"), sketch["a"])
    sketch.add("a")
    self.assertEqual(sketch.query("a"), sketch["a"])
class History(object):
    def __init__(self, n, m, d):
        # time counter (update for each new unit)
        self.t = 0
        # n is number of CM sketches
        self.n = n
        # m is size of array for each hash function
        self.m = m
        # d is number of hash functions
        self.d = d
        # present is a count-min sketch containing sub-unit time counts of indexes
        self.present = CMSketch(m, d)
        # ready is a t/f value to determine whether or not to use the present
        # as a score while the aggregate weighted score is being computed
        self.ready = False
        # use a CM-sketch to keep track of the aggregate weighted score
        #   A = sum_{j=1}^{log T} (M^j / 2^j)
        # (we add the present ourselves); keep track of A at every time
        # interval, initialized to zero
        self.aggregate_score = CMSketch(m, d)
        # n count-min sketches: we retain resolutions 1, 2, 4, ..., 2^n;
        # move to the next sketch (update curr_sketch) when
        # time units filled = 2^i (its position in the list)
        self.cm_sketch_list = []
        for i in range(n):
            self.cm_sketch_list.append(CMSketch(m, d))

    def update_present_only(self, datum):
        self.ready = False
        # don't update the full structure -- this is a sub-unit update
        self.present.add(datum, 1)

    # data_block is a block of data, presented as an iterable object;
    # it consists of the data that arrived in a single time unit.
    # Implements algorithm 2 from the paper.
    # This structure maintains n CM-sketches, M0, M1, ..., Mn:
    #   M0 always holds [t-1, t] where t is current time
    #   M1 always holds [t - t mod 2 - 2, t - t mod 2]
    #   ...
    #   Mn always holds [t - t mod 2^n - 2^n, t - t mod 2^n]
    # For t = 8, for example:
    #   M0: [7, 8]; M1: [6, 8]; M2: [4, 8]; M3: [0, 8]; rest: 0
    def aggregate_unit(self, data_block):
        # update time once per unit; we use this to track the current time unit
        self.t += 1
        # convert the data_block into a CM sketch, with frequency 1 for each
        # appearance (data_block is the present). While this data is coming
        # in, we maintain a separate structure (the present) that we can
        # query for frequencies; reset it before aggregating the whole thing.
        accumulator = CMSketch(self.m, self.d)
        self.present = CMSketch(self.m, self.d)
        for data in data_block:
            accumulator.add(data, 1)
            # update present as we update the accumulator
            self.present.add(data, 1)
        self.ready = False

        # we update the whole structure with M_bar.
        # we calculate l = max over all i such that (t mod 2^i) == 0;
        # efficient -- takes log t time at worst
        def find_l(t):
            l = 0
            if t == 0:
                return l
            while t % 2 == 0:
                l += 1
                t = t / 2
            return l

        # go up to the index find_l + 1, or the max index if find_l + 1 exceeds it
        for i in range(min(find_l(self.t) + 1, self.n)):
            # now we want to add the appropriate value: A + (1/2^i)(M_bar - M^j)
            # M_bar - M^j
            difference = sketch_sum(accumulator, sketch_scalar_product(self.cm_sketch_list[i], -1))
            # A = A + (1/2)^i * difference
            self.aggregate_score = sketch_sum(self.aggregate_score,
                                              sketch_scalar_product(difference, pow(0.5, i)))
            # temporary storage
            T = deepcopy(accumulator)
            # aggregate into accumulator for next round
            accumulator = sketch_sum(accumulator, self.cm_sketch_list[i])
            # set the value
            self.cm_sketch_list[i] = T
        # now we're ready to use CM-sketch values
        self.ready = True
        # reset the present now that we're done with one time block
        self.present = CMSketch(self.m, self.d)

    # We keep these values in their own count-min sketch (call it A), updated
    # in sync, so we don't waste log T time summing on every query.
    # This value will provide a key for our heap.
    def query_slow(self, x):
        return self.present.query(x) + sum(pow(0.5, i) * self.cm_sketch_list[i].query(x)
                                           for i in range(self.n))

    # Using a CMSketch to keep track of the score: the 'scores' we calculated
    # are stored in a CM-sketch, so query picks the minimum of these. This is
    # exactly equivalent to taking the sum over the minimums, since we added
    # termwise (matrix addition and scalar multiplication).
    def query(self, x):
        if self.ready:
            return self.aggregate_score.query(x)
        # only if we're not ready
        return self.present.query(x) + self.aggregate_score.query(x)
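A minimal usage sketch for History (hypothetical parameters and data; assumes the CMSketch helpers above are in scope). Each aggregate_unit call closes one time unit, after which query returns the exponentially decayed score with the most recent units weighted highest.

history = History(n=10, m=1000, d=5)
history.aggregate_unit(["#nba", "#nba", "#oscars"])  # one full time unit
history.update_present_only("#nba")                  # a sub-unit arrival
score = history.query("#nba")                        # decayed score estimate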
def test_add_greater_than_one(self):
    sketch = CountMinSketch(10, 5)
    sketch.add("a", 123)
    self.assertEqual(sketch.query("a"), 123)
def test_zero_at_start(self):
    sketch = CountMinSketch(10, 5)
    for thing in (0, 1, -1, tuple, tuple(), "", "yeah", object()):
        self.assertEqual(sketch.query(thing), 0)
def CountMin_Sketch(stream, k, h):
    sketch = CountMinSketch(k, h)
    for e in stream:
        sketch.add(e)
    return sketch
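Example call, assuming the CountMinSketch(width, depth) class with dict-style lookup used by the surrounding tests (k and h are passed straight through as its constructor arguments):

sketch = CountMin_Sketch(["a", "b", "a"], 1000, 5)
assert sketch["a"] >= 2  # Count-Min never underestimates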
def test():
    ''' basic testing method '''
    print('build in memory check')
    cms = CountMinSketch(width=100000, depth=7)
    # add elements
    for i in range(100):
        tmp = 100 * (i + 1)
        cms.add(str(i), tmp)
    print(cms.check(str(0), 'min'))
    print(cms.check(str(0), 'mean'))
    print(cms.check(str(0), 'mean-min'))
    cms.export('./dist/py_test.cms')
    print('import from disk check')
    cmsf = CountMinSketch(filepath='./dist/py_test.cms')
    if cms.width != cmsf.width:
        print('width does not match!')
    if cms.depth != cmsf.depth:
        print('depth does not match!')
    print(cmsf.check(str(0), 'min'))
    print(cmsf.check(str(0), 'mean'))
    print(cmsf.check(str(0), 'mean-min'))
    try:
        print('\n\nTest invalid initialization')
        cms_ex = CountMinSketch()
    except SyntaxError as ex:
        print(ex)
processinfo_file = open(processinfo_filename, "w")
processinfo_file.write('Sketch Width, No of hash functions')
processinfo_file.write(',No of Trade Records, No of Stock Symbols')
processinfo_file.write(',CrossRef Not Present Count, CrossRef Present Count')
processinfo_file.write(',Not Present Count, Present Count,Error Count,Error %')
processinfo_file.write(',CMS Time, Sketch Time, Sketch Save Time, Sketch Query Time, CMS Query Time')
processinfo_file.write(',CMS Start Time\n')

for no_of_record in no_of_records:
    stock_trade_filename = source_data_dir + stock_etf + "_trade_" + time_interval + str(no_of_record) + '.csv'
    crossref_not_present_count = 0
    crossref_present_count = 0
    #print('CrossRef CMS Process Starts Time: ' + str(today.strftime("%X")) + ' ' + str(today.strftime("%f")))
    # CrossRef CMS Process Starts
    crossref_stock_trade_frq_cms = CountMinSketch(100000, 10)
    stock_trade_file = open(stock_trade_filename, "r")
    stock_trade_lines = csv.reader(stock_trade_file, delimiter=',', quoting=csv.QUOTE_ALL, skipinitialspace=True)
    next(stock_trade_file)
    #print('CrossRef CMS Create Time: ' + str(today.strftime("%X")) + ' ' + str(today.strftime("%f")))
    # CrossRef CMS Create
    for stock_trade_line in stock_trade_lines:
        stock_symbol = stock_trade_line[0].strip()
        add1 = crossref_stock_trade_frq_cms.add(stock_symbol)
    stock_trade_file.close()
    #print('CrossRef CMS Membership Check Time: ' + str(today.strftime("%X")) + ' ' + str(today.strftime("%f")))
    # CrossRef CMS Membership Check
    check_stock_symbol_file = open(check_stock_symbol_filename, "r")
plt.plot(count_min_sketch, color='red', label="Count-Min Sketch")
plt.legend()
plt.show()
plt.gcf().clear()

plt.title('Plot with log scaling')
plt.plot(actual_count, linewidth=5, label="Actual Count")
plt.plot(count_min_sketch, color='red', label="Count-Min Sketch")
plt.yscale('log')
plt.legend()
plt.show()

tweets = []
for line in open('data/tweets.json.1', 'r', encoding='latin-1'):
    tweets.append(json.loads(line))
data = json_normalize(tweets)

choices = [(100000, 10), (10000, 10), (1000, 10), (1000, 100), (1000, 5)]
for i in choices:
    sketch = CountMinSketch(i[0], i[1])
    print('\nData for CountMinSketch(', i[0], ',', i[1], ')\n')
    tag_count_df = data_sketching(data, sketch)
    metrics_and_plotting(tag_count_df)

#sketch = CountMinSketch(10000, 10)
#
#tag_count_df = data_sketching(data, sketch)
#metrics_and_plotting(tag_count_df)
def build_countminsketch(ksup, w=1000, h=10):
    """Return a CountMinSketch built by inserting all kmers from the given KmerSupplier."""
    sketch = CountMinSketch(w, h)
    for kmer in ksup.iterkmers():
        sketch.add(kmer)
    return sketch
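A hypothetical driver for the function above; build_countminsketch only requires that ksup expose iterkmers(), so any object with that method works:

class ListKmerSupplier:
    def __init__(self, kmers):
        self._kmers = kmers

    def iterkmers(self):
        # yield kmers in insertion order
        return iter(self._kmers)

sketch = build_countminsketch(ListKmerSupplier(["ACGT", "CGTA", "ACGT"]))
assert sketch["ACGT"] >= 2  # dict-style lookup as in the other snippets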
def test_merge():
    countmin1 = CountMinSketch(0.1, 0.1)
    countmin2 = CountMinSketch(0.1, 0.1)
    word = 'hello'
    countmin1.increment(word)
    countmin2.increment(word)
    countmin1.merge(countmin2)
    assert countmin2.estimate(word) == 1
    assert countmin1.estimate(word) == 2
    top_10_list = [('hello', 2.0)]
    assert set(countmin1.top_10_dict.items()) == set(top_10_list)
def test_bad_sketch(self):
    with self.assertRaises(ValueError):
        CountMinSketch(0, 10, seed=seeds)
    with self.assertRaises(ValueError):
        CountMinSketch(10, 0, seed=seeds)
packetport = "5556" bufferport = "5557" ctrlPlanePort = "5560" ctrlplane_ip = sys.argv[1] context = zmq.Context() buffersock = context.socket(zmq.REQ) buffersock.connect("tcp://localhost:%s" % bufferport) ctrlPlaneSock = context.socket(zmq.PUSH) ctrlPlaneSock.connect("tcp://" + ctrlplane_ip + ":%s" % ctrlPlanePort) depth = 10 width = 40000 hash_functions = [hash_function(i) for i in range(depth)] sketch = CountMinSketch(depth, width, hash_functions) lastUpd = time.clock() distinctFlows = [] distinctFlowsDelta = [] def has_ports(str): splitted = str.split(" ") if "->" in splitted: return True else: return False def get_port_type(str):
def test_bad_init(self):
    with self.assertRaises(ValueError):
        CountMinSketch(0, 5)
    with self.assertRaises(ValueError):
        CountMinSketch(100, 0)
stock_symbol_file.write(',Actual Trade Freq, APDS Trade Freq, Freq Accuracy')
stock_symbol_file.write(',Actual Trade Volume, APDS Trade Volume, Vol Accuracy\n')
#stock_input_filename = proj_dir + "stock_vol_files/" + stock_etf + time_interval + "_R" + str(no_of_record) + "_w" + str(width) + "_d" + str(depth) + "_input.csv"
#stock_input_file = open(stock_input_filename, "w")
#stock_input_file.write('Stock Symbol, Trade Date, Volume\n')
apds_filename = proj_dir + "apds_files/" + stock_etf + time_interval + "_R" + str(no_of_record) + "_w" + str(width) + "_d" + str(depth) + "_freq.apds"

stock_freq_dist = {}
total_freq_accuracy = 0
stock_vol_dist = {}
total_vol_accuracy = 0
stock_vol_apds = CountMinSketch(width, depth)
stock_trade_record_count = 0
vol_sketch_time = 0

# add elements to sketch
for stock_trade_line in stock_trade_lines:
    #print('stock_trade_record_count:', stock_trade_record_count, ' no_of_record:', no_of_record)
    if stock_trade_record_count >= no_of_record:
        break
    stock_trade_record_count = stock_trade_record_count + 1
    stock_symbol = stock_trade_line[0].strip()
    trade_date = stock_trade_line[1].strip()
    stock_vol = int(stock_trade_line[7].strip())
    #stock_input_file.write(stock_symbol + "," + str(trade_date) + "," + str(stock_vol) + "\n")
    vol_sketch_starttime = time.process_time()
    apds_cmsadded = stock_vol_apds.add(stock_symbol, stock_vol)
def node_countminsketch():
    sketch = CountMinSketch(6000, 10)
    return sketch
#-
# Frequency using CSK
#-
from countminsketch import CountMinSketch

# Initialize
# table size=1000, hash functions=10
ds = CountMinSketch(1000, 10)

# Add
ds.add(1)
ds.add(2)
ds.add(1)

# Test
assert ds[1] == 2
assert ds[2] == 1
assert ds[3] == 0
import sys
import csv
from countminsketch import CountMinSketch

# Set up the stream and some variables.
item_set = set()
currentCustomer = ''

# CountMinSketch with 20 hashes to try to prevent hash collisions so that every
# product gets a unique identity. The more hashes used, the more accurate the
# result, so for bigger data sets we may need to increase the second argument
# of CountMinSketch.
# Two CountMinSketch instances: one for item count and one for total revenue.
# Since productId is hashed and cannot be reverted back, we store the products
# in the item_set set data structure (which doesn't contain duplicates).
# When retrieving the counts and revenue we hash the strings stored in
# item_set, since the same string is supposed to output the same hash.
itemCount = CountMinSketch(10, 20)
itemRevenue = CountMinSketch(10, 20)

def salesRead(filename):
    with open(filename, 'r') as fi:
        reader = csv.DictReader(fi)
        for row in reader:
            yield(row)

# Now get the stream of data and process it
input_file = sys.argv[1]
out_file = sys.argv[2]
individual_cart = set()
for hod in salesRead(input_file):