Exemplo n.º 1
0
 def _make_sketch(kmer_counts_dict: defaultdict) -> CountMinSketch:
     """Copy every k-mer count from ``kmer_counts_dict`` into a new CountMinSketch.

     The sketch serves as a compressed replacement for the dictionary, so the
     caller is free to delete ``kmer_counts_dict`` afterwards.

     :param kmer_counts_dict: mapping of k-mer -> count.
     :return: the populated sketch.
     """
     row_count = 8  # the single sizing argument handed to CountMinSketch
     sketch = CountMinSketch(row_count)
     for kmer in kmer_counts_dict:
         sketch.update(kmer, kmer_counts_dict[kmer])
     return sketch
Exemplo n.º 2
0
def runner(input_file_name, config):
    """Feed a generated Zipf stream into a CountMinSketch and report its error.

    ``input_file_name`` is not read here; the stream is produced by
    ``AxProf.zipfGenerator`` from ``config["n"]`` and ``config["skew"]`` with a
    seed derived from the current time.

    :param input_file_name: unused by this implementation.
    :param config: dict providing "m", "d", "n" and "skew".
    :return: dict with the generated "input", the per-item absolute error
        "acc", a "time" value of ``config["m"] * config["d"]`` and "space" 0.
    """
    sketch = CountMinSketch(config["m"], config["d"])

    seed = int(time.time() * 1000) % 2**32
    lst = AxProf.zipfGenerator(config["n"], config["skew"], seed)

    # Wall-clock time of the insert loop is measured but not reported below.
    startTime = time.time()
    for num in lst:
        sketch.add(num)
    endTime = time.time()

    actual_map = actual_count(lst)
    error_map = {
        num: abs(actual_map[num] - sketch[num])
        for num in set(lst)
    }

    report = {}
    report["input"] = lst
    report["acc"] = error_map
    report["time"] = config["m"] * config["d"]
    report["space"] = 0
    return report
Exemplo n.º 3
0
def sketch_scalar_product(M, c):
	"""Return a new CMSketch whose cell (i, j) is written with ``c * M.val_at(i, j)``.

	The result is created with the same ``m`` and ``d`` as ``M``; ``M`` itself is
	only read through ``val_at``.
	"""
	scaled = CMSketch(M.m, M.d)
	for row in range(M.d):
		for col in range(M.m):
			scaled.update(row, col, c * M.val_at(row, col))
	return scaled
Exemplo n.º 4
0
def runner(input_file_name, config):
    """Load integers from a file, sketch them, and return the sketch estimates.

    The sketch dimensions are derived from the requested error bounds:
    ``m = ceil(e / eps)`` and ``d = ceil(ln(1 / delta))``.

    :param input_file_name: text file with one integer per line.
    :param config: dict providing "eps" and "delta".
    :return: dict with "acc" (item -> sketch estimate), "time" (seconds spent
        in the insert loop) and "space" (always 0).
    """
    with open(input_file_name) as f:
        lst = [int(line.strip()) for line in f.readlines()]

    m = math.ceil(math.e / config["eps"])
    d = math.ceil(ln(1 / config["delta"]))
    sketch = CountMinSketch(m, d)

    # Only the insert loop is timed.
    startTime = time.time()
    for num in lst:
        sketch.add(num)
    elapsed = time.time() - startTime

    estimates = {num: sketch[num] for num in lst}

    return {
        "acc": estimates,
        "time": elapsed,
        "space": 0,
    }
Exemplo n.º 5
0
	def __init__(self, n, m, d):
		"""Create the empty structure: ``n`` sketches plus present and aggregate.

		:param n: number of CM sketches kept in ``cm_sketch_list``.
		:param m: size of the array for each hash function.
		:param d: number of hash functions.
		"""
		# sizing parameters, stored so later sketches can be built identically
		self.n = n
		self.m = m
		self.d = d
		# time counter, advanced once per new time unit
		self.t = 0
		# False while the aggregate weighted score is being recomputed; queries
		# then also consult ``present``
		self.ready = False
		# sub-unit counts of the current time unit
		self.present = CMSketch(m, d)
		# aggregate weighted score A = sum{j = 1 to log T} (M^j / 2^j),
		# kept in its own sketch and starting at zero (present is added separately)
		self.aggregate_score = CMSketch(m, d)
		# the n sketches holding resolutions 1, 2, 4, ..., 2^n
		self.cm_sketch_list = [CMSketch(m, d) for _ in range(n)]
Exemplo n.º 6
0
    def __init__(self, delta, epsilon):
        """Size the sketch from the error bounds and allocate its counters.

        ``m = ceil(e / epsilon)`` and ``d = ceil(ln(1 / delta))`` are stored on
        the instance and forwarded to ``CountMinSketch.__init__``.

        :param delta: bound used to derive ``d``.
        :param epsilon: bound used to derive ``m``.
        """
        self.m = int(math.ceil(math.exp(1) / epsilon))
        self.d = int(math.ceil(math.log(1 / delta)))

        # Forward the values just computed; bare ``m`` / ``d`` are not defined
        # in this scope and raised NameError.
        CountMinSketch.__init__(self, self.m, self.d)

        # NOTE(review): nbr_slices / bits_per_slice are not set in this method;
        # presumably the base initializer provides them - confirm.
        self.bitarray = np.zeros((self.nbr_slices, self.bits_per_slice), dtype=np.int32)
        self.make_hashes = generate_hashfunctions(self.bits_per_slice, self.nbr_slices)
 def test_counts_overestimate(self):
     """Every sketch estimate must be at least the exact character count."""
     # Read this source file as sample text; the context manager closes the
     # handle instead of leaving it to the garbage collector.
     with open(__file__) as source:
         text = source.read()
     counter = Counter(text)
     sketch = CountMinSketch(10, 5)
     for x in text:
         sketch.add(x)
     for x in set(text):
         self.assertGreaterEqual(sketch[x], counter[x])
 def test_simple_usage(self):
     """Adding one key N times yields N for that key, 0 for another, len N."""
     N = 1000
     sketch = CountMinSketch(10, 5)
     # ``range`` exists on both Python 2 and 3; ``xrange`` is a NameError on 3.
     for _ in range(N):
         sketch.add("a")
     self.assertEqual(sketch.query("a"), N)
     self.assertEqual(sketch.query("b"), 0)
     self.assertEqual(len(sketch), N)
Exemplo n.º 9
0
    def __init__(self, delta, epsilon):
        """Size the sketch from the error bounds and allocate its counters.

        ``m = ceil(e / epsilon)`` and ``d = ceil(ln(1 / delta))`` are stored on
        the instance and forwarded to ``CountMinSketch.__init__``.

        :param delta: bound used to derive ``d``.
        :param epsilon: bound used to derive ``m``.
        """
        self.m = int(math.ceil(math.exp(1) / epsilon))
        self.d = int(math.ceil(math.log(1 / delta)))

        # Forward the values just computed; bare ``m`` / ``d`` are not defined
        # in this scope and raised NameError.
        CountMinSketch.__init__(self, self.m, self.d)

        # NOTE(review): nbr_slices / bits_per_slice are not set in this method;
        # presumably the base initializer provides them - confirm.
        self.bitarray = np.zeros((self.nbr_slices, self.bits_per_slice),
                                 dtype=np.int32)
        self.make_hashes = generate_hashfunctions(self.bits_per_slice,
                                                  self.nbr_slices)
Exemplo n.º 10
0
	def aggregate_unit(self, data_block):
		"""Fold one time unit of data into the sketches and the aggregate score.

		:param data_block: iterable of the items that arrived in this time unit;
			each item is added with frequency 1.
		"""
		# update time once per unit
		self.t += 1
		# convert the data_block into a CM sketch
		accumulator = CMSketch(self.m, self.d)

		# reset the present when we aggregate the whole thing
		self.present = CMSketch(self.m, self.d)
		# (data_block is the present)
		for data in data_block:
			accumulator.add(data, 1)
			# update present as we update the accumulator
			self.present.add(data, 1)

		self.ready = False
		# we update the whole structure with M_bar
		# we calculate l:
		# l = max over all i such that (t mod 2^i) == 0
		# efficient -- takes log t time to find at worst
		def find_l(t):
			l = 0
			if t == 0:
				return l
			while t % 2 == 0:
				l += 1
				# floor division keeps t an exact integer on Python 2 and 3;
				# "/" yields a float on Python 3 and loses precision for large t
				t //= 2
			return l

		# go up to the index that is find_l + 1, or the max index
		# if find_l + 1 >= to it
		for i in range(min(find_l(self.t) + 1, self.n)):
			# now we want to add the appropriate value: A + 1/2^(i)(M_bar - M^j)
			# M_bar - M^j
			difference = sketch_sum(accumulator, sketch_scalar_product(self.cm_sketch_list[i], -1))
			# A = A + (1/2)^i difference
			self.aggregate_score = sketch_sum(self.aggregate_score,
									sketch_scalar_product(difference, pow(0.5, i)))
			# temporary storage
			T = deepcopy(accumulator)
			# aggregate into accumulator for next round
			accumulator = sketch_sum(accumulator, self.cm_sketch_list[i])
			# set the value
			self.cm_sketch_list[i] = T
		# now we're ready to use CM-sketch values
		self.ready = True
		# reset the present now that we're done with one time block
		self.present = CMSketch(self.m, self.d)
Exemplo n.º 11
0
def simulate_sfcmon(df, num_chunks):
    """Collect heavy-hitter flows per chunk, by packet count and by byte size.

    ``df`` is cut into chunks by ``split(df, num_packets / num_chunks)``. Each
    chunk gets two fresh sketches keyed by ``SrcIP``: one counting packets, one
    summing ``Size``. Once more than ``num_rows_to_start`` rows of the chunk
    have been seen, a flow whose estimate exceeds ``running total * bound`` is
    recorded. ``split``, ``bound`` and ``num_rows_to_start`` come from module
    scope.

    :param df: frame with "SrcIP" and "Size" columns.
    :param num_chunks: divisor used to derive the second argument of ``split``.
    :return: ``{"packets": set_of_flow_ids, "size": set_of_flow_ids}``.
    """
    print("#################### PROCESSING CHUNKS #########################")
    num_packets = df.shape[0]
    print("# Number of packets = {}".format(num_packets))
    chunks = split(df, num_packets / num_chunks)
    results = {"packets": set(), "size": set()}
    for c in chunks:
        print("# Chunk Data: {}; {}".format(c.shape, c.index))
        # Both sketches are built with the arguments (5436, 5).
        sketch_packets = CountMinSketch(5436, 5)
        sketch_size = CountMinSketch(5436, 5)
        count_packets = 0
        count_size = 0
        for flow_id, pkt_size in zip(c["SrcIP"], c["Size"]):
            sketch_packets.add(flow_id)
            sketch_size.add(flow_id, value=pkt_size)
            count_packets += 1
            count_size += pkt_size
            # Warm-up: no verdicts until enough rows of this chunk were seen.
            if count_packets <= num_rows_to_start:
                continue
            if sketch_packets[flow_id] > count_packets * bound:
                results["packets"].add(flow_id)
            if sketch_size[flow_id] > count_size * bound:
                results["size"].add(flow_id)
    return results
Exemplo n.º 12
0
def show_statistics():
    """Loop forever, printing the top flows computed from two sketch snapshots.

    Each iteration builds one sketch from deep copies of the module-level
    ``H`` / ``N``, sleeps one second, builds a second sketch from the live
    ``H`` / ``N``, and passes both to ``get_top_flows``. The screen is cleared
    before the table is printed. Never returns.
    """
    while True:
        # Snapshot the shared state so the first sketch reflects one instant.
        H1 = copy.deepcopy(H)
        distinctFlows1 = copy.deepcopy(distinctFlows)  # not used below
        N1 = copy.deepcopy(N)
        depth = 10
        width = 40000
        hash_functions = [hash_function(i) for i in range(depth)]
        sketch1 = CountMinSketch(depth, width, hash_functions, M=N1)

        for fp_key in H1:
            ef = H1[fp_key][0]
            rf = H1[fp_key][1]
            df = H1[fp_key][2]
            sketch1.add(fp_key, rf + df + ef)

        time.sleep(1)
        # Second sketch, built from the live state one second later.
        sketch = CountMinSketch(depth, width, hash_functions, M=N)

        for fp_key in H:
            ef = H[fp_key][0]
            rf = H[fp_key][1]
            df = H[fp_key][2]
            sketch.add(fp_key, rf + df + ef)

        top_flows = get_top_flows(sketch, sketch1)

        system('clear')
        # Single-argument print(...) behaves the same on Python 2 and 3; the
        # bare print statement is a SyntaxError on Python 3.
        print(" flow                      rate")
        for flow in top_flows:
            print("#" + str(flow[0]) + " :: " + str(flow[1]) + "b/s")
Exemplo n.º 13
0
def sketch_sum(M1, M2):
	"""Return a new CMSketch holding the cell-wise sum of ``M1`` and ``M2``.

	When the sketches differ in ``m`` or in ``d`` a message is printed and
	``None`` is returned instead of a sketch.
	"""
	# Single-argument print(...) behaves the same on Python 2 and 3; the bare
	# print statement is a SyntaxError on Python 3.
	if M1.m != M2.m:
		print("Sketches don't align on hashtable length.\n")
		return
	if M1.d != M2.d:
		print("Sketches don't align on # of hashtables.\n")
		return

	result_sketch = CMSketch(M2.m, M2.d)
	for i in range(M2.d):
		for j in range(M2.m):
			new_val = M1.val_at(i, j) + M2.val_at(i, j)
			result_sketch.update(i, j, new_val)
	return result_sketch
Exemplo n.º 14
0
def test_compute_width_depth():
    """compute_depth_width(0.1, 0.1) must report width 2 and depth 20."""
    bounds = {"delta": 0.1, "epsilon": 0.1}
    dimensions = CountMinSketch.compute_depth_width(bounds["delta"],
                                                    bounds["epsilon"])
    depth, width = dimensions

    assert width == 2
    assert depth == 20
Exemplo n.º 15
0
def plot_error(lst, m, d, color=None, label=None):
    """Plot a histogram of the absolute estimation error of a CountMinSketch.

    Every element of ``lst`` is added to a ``CountMinSketch(m, d)``; for each
    distinct element the absolute difference between ``actual_count`` and the
    sketch estimate is collected and drawn with ``plot.hist``. The number of
    distinct elements is printed.

    :param lst: sequence of items to sketch.
    :param m: first sizing argument of the sketch.
    :param d: second sizing argument of the sketch.
    :param color: forwarded to ``plot.hist``.
    :param label: forwarded to ``plot.hist``.
    """
    sketch = CountMinSketch(m, d)
    actual_map = actual_count(lst)

    for i in lst:
        sketch.add(i)

    errors = [abs(actual_map[i] - sketch[i]) for i in set(lst)]

    print(len(errors))

    # With fewer than 100 distinct items the integer division gives 0, which
    # is not a valid bin count; fall back to a single bin.
    bins = max(1, len(errors) // 100)
    plot.hist(errors, bins=bins, color=color, label=label)
    plot.xlim(0, int(len(lst) * 0.1))
Exemplo n.º 16
0
def simulate_rtp4mon(df, bound, pkts_to_start):
    """Collect heavy-hitter source IPs and their 5-tuples over the whole frame.

    A single sketch keyed by ``SrcIP`` counts packets. After more than
    ``pkts_to_start`` rows, a row whose source estimate exceeds
    ``rows seen * bound`` has its source IP and its full
    (SrcIP, DstIP, SrcPort, DstPort, Proto) tuple recorded.

    :param df: frame with SrcIP, DstIP, SrcPort, DstPort and Proto columns.
    :param bound: fraction of the running packet count used as threshold.
    :param pkts_to_start: number of rows to process before flagging begins.
    :return: ``{"packets": set_of_src_ips, "5t_packets": set_of_5_tuples}``.
    """
    print("# " + str(datetime.datetime.now()) +
          " - Begin simulation RTP4Mon...")
    results = {"packets": set(), "5t_packets": set()}
    sketch_packets = CountMinSketch(5436, 5)  # built with arguments (5436, 5)
    five_tuples = zip(df["SrcIP"], df["DstIP"], df["SrcPort"], df["DstPort"],
                      df["Proto"])
    # count_packets is the 1-based number of rows processed so far.
    for count_packets, five_flow_id in enumerate(five_tuples, 1):
        flow_id = five_flow_id[0]
        sketch_packets.add(flow_id)
        if count_packets <= pkts_to_start:
            continue
        if sketch_packets[flow_id] > count_packets * bound:
            results["packets"].add(flow_id)
            results["5t_packets"].add(five_flow_id)
    print("# " + str(datetime.datetime.now()) + " - End simulation RTP4Mon...")
    return results
Exemplo n.º 17
0
def update_statistics():
    """Rebuild a sketch from ``H`` and print the 20 flows with largest estimate.

    Reads the module-level ``N``, ``H`` and ``distinctFlows``. The screen is
    cleared, then each of the top 20 ``(flow, estimate)`` pairs is printed,
    followed by the total number of flows.
    """
    global N, H

    depth = 10
    width = 40000
    hash_functions = [hash_function(i) for i in range(depth)]
    sketch = CountMinSketch(depth, width, hash_functions, M=N)

    for fp_key in H:
        ef = H[fp_key][0]
        rf = H[fp_key][1]
        df = H[fp_key][2]
        sketch.add(fp_key, rf + df + ef)

    system('clear')
    # Candidates: every distinct flow first, then every key of H.
    flows_to_display = [(flow, sketch.query(flow)) for flow in distinctFlows]
    flows_to_display.extend((flow, sketch.query(flow)) for flow in H.keys())

    top_flows = sorted(flows_to_display, key=lambda x: x[1],
                       reverse=True)[0:20]
    # Single-argument print(...) behaves the same on Python 2 and 3; the bare
    # print statement is a SyntaxError on Python 3.
    for flow in top_flows:
        print(flow)
    print("Total flows:" + str(len(distinctFlows) + len(H.keys())))
Exemplo n.º 18
0
    def _make_sketch(self, kmer_counts_dict: defaultdict) -> CountMinSketch:
        """Copy every k-mer count into a new CountMinSketch, with optional logging.

        When ``self.print_runtime`` is set, start/finish banners and a progress
        line every 50000 k-mers are printed with the time elapsed since
        ``self.start_time``. When ``self.print_syssizeof`` is set, the
        ``sys.getsizeof`` of the sketch is printed.

        :param kmer_counts_dict: mapping of k-mer -> count.
        :return: the populated sketch.
        """
        if self.print_runtime:
            print("\n>--- STARTING TO MAKE COUNTMIN SKETCH AT T = {:.2f} ---".
                  format(time.time() - self.start_time))

        # Compressed stand-in for the dictionary; 10 is its only sizing argument.
        sketch = CountMinSketch(10)
        processed = 0
        for kmer, count in kmer_counts_dict.items():
            if self.print_runtime and processed % 50000 == 0:
                print(">Processed {0} kmers by time T={1:.2f}".format(
                    processed,
                    time.time() - self.start_time))
            sketch.update(kmer, count)
            processed += 1

        if self.print_runtime:
            print(">FINISHED MAKING COUNTMIN SKETCH AT T = {:.2f}".format(
                time.time() - self.start_time))
        if self.print_syssizeof:
            print(">SIZE OF COUNTMIN SKETCH: {:,}".format(
                sys.getsizeof(sketch)))
        return sketch
Exemplo n.º 19
0
def test_increment_and_estimate():
    """Estimates match the increments, and top_10_dict tracks both seen words."""
    countmin = CountMinSketch(0.1, 0.1)
    # 'hello' once, 'world' twice; 'other' is never incremented.
    for word in ('hello', 'world', 'world'):
        countmin.increment(word)

    assert countmin.estimate('other') == 0
    assert countmin.estimate('hello') == 1
    assert countmin.estimate('world') == 2

    expected_top = set([('hello', 1.0), ('world', 2.0)])
    assert set(countmin.top_10_dict.items()) == expected_top
Exemplo n.º 20
0
def worker(index, path):
    """Count (date, word) pairs of one wiki dump file into a CountMinSketch.

    Lines starting with ``<doc`` update the current date from their timestamp
    (when ``TIMESTEMP_RE`` matches) and closing ``</doc>`` lines are skipped;
    neither kind is counted as a processed line. Words of every other line are
    lower-cased, paired with the current date, batched in a ``Counter`` and
    flushed into the sketch whenever the batch exceeds 10000 distinct pairs.

    :param index: the index of the dump this worker should work on.
    :param path: directory that contains the ``wiki_0<index>`` file.
    :return: the result of ``sketch.get_matrix()``.
    """
    global counter
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Process %d start processing" % index)
    with open("%s/wiki_0%s" % (path, index), "r") as f:
        batch = Counter()
        batch_limit = 10000
        sketch = CountMinSketch(DEPTH, WIDTH, HASH_FUNCTIONS)
        current = datetime.now().date()
        for line in f:
            # Extract timestamp from header
            if line.startswith("<doc"):
                m = TIMESTEMP_RE.search(line)
                if m:
                    current = datetime.strptime(m.group(1),
                                                "%Y-%m-%dT%H:%M:%SZ").date()
                continue
            # The old test compared line[:5] (5 chars) with "</doc>" (6 chars),
            # so it never matched and closing tags were counted as text.
            if line.startswith("</doc>"):
                continue

            for word in WORD_RE.findall(line):
                batch[(current, word.lower())] += 1
            if len(batch) > batch_limit:
                for key, count in batch.items():
                    sketch.add(key, count)
                batch.clear()

            counter.value += 1
            if counter.value % 10000 == 0:
                print("Processed %s lines" % counter.value)

        # Flush whatever is left in the final, partially filled batch.
        for key, count in batch.items():
            sketch.add(key, count)
        batch.clear()

    print("Process %d finished" % index)
    return sketch.get_matrix()
Exemplo n.º 21
0
def worker(index, path):
    """Count (date, word) pairs of one wiki dump file into a CountMinSketch.

    Lines starting with ``<doc`` update the current date from their timestamp
    (when ``TIMESTEMP_RE`` matches) and closing ``</doc>`` lines are skipped;
    neither kind is counted as a processed line. Words of every other line are
    lower-cased, paired with the current date, batched in a ``Counter`` and
    flushed into the sketch whenever the batch exceeds 10000 distinct pairs.

    :param index: the index of the dump this worker should work on.
    :param path: directory that contains the ``wiki_0<index>`` file.
    :return: the result of ``sketch.get_matrix()``.
    """
    global counter
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Process %d start processing" % index)
    with open("%s/wiki_0%s" % (path, index), "r") as f:
        batch = Counter()
        batch_limit = 10000
        sketch = CountMinSketch(DEPTH, WIDTH, HASH_FUNCTIONS)
        current = datetime.now().date()
        for line in f:
            # Extract timestamp from header
            if line.startswith("<doc"):
                m = TIMESTEMP_RE.search(line)
                if m:
                    current = datetime.strptime(m.group(1), "%Y-%m-%dT%H:%M:%SZ").date()
                continue
            # The old test compared line[:5] (5 chars) with "</doc>" (6 chars),
            # so it never matched and closing tags were counted as text.
            if line.startswith("</doc>"):
                continue

            for word in WORD_RE.findall(line):
                batch[(current, word.lower())] += 1
            if len(batch) > batch_limit:
                for key, count in batch.items():
                    sketch.add(key, count)
                batch.clear()

            counter.value += 1
            if counter.value % 10000 == 0:
                print("Processed %s lines" % counter.value)

        # Flush whatever is left in the final, partially filled batch.
        for key, count in batch.items():
            sketch.add(key, count)
        batch.clear()

    print("Process %d finished" % index)
    return sketch.get_matrix()
 def test_syntax_sugar(self):
     """``sketch[key]`` must agree with ``sketch.query(key)`` before and after an add."""
     sketch = CountMinSketch(10, 5)
     key = "a"
     # empty sketch
     self.assertEqual(sketch.query(key), sketch[key])
     # after one insertion of the same key
     sketch.add(key)
     self.assertEqual(sketch.query(key), sketch[key])
Exemplo n.º 23
0
class History(object):
	"""Time-aggregated frequency history built from CMSketch instances.

	Keeps ``n`` sketches at increasing resolutions, a ``present`` sketch for
	the time unit in progress, and an ``aggregate_score`` sketch that is kept
	in sync so queries do not have to sum over all resolutions.
	"""

	def __init__(self, n, m, d):
		"""Create the empty structure.

		:param n: number of CM sketches kept in ``cm_sketch_list``.
		:param m: size of the array for each hash function.
		:param d: number of hash functions.
		"""
		# time counter (update for each new unit)
		self.t = 0
		# n is number of CM sketches
		self.n = n
		# m is size of array for each hash function
		self.m = m
		# d is number of hash functions
		self.d = d
		# present is a count-min sketch containing
		# sub-unit time counts of indexes.
		self.present = CMSketch(m, d)
		# ready is a t/f value to determine whether or not
		# to use the present as a score while the aggregate weighted score
		# is being computed
		self.ready = False
		# use a CM-sketch to keep track of aggregate weighted score
		# A = sum{j = 1 to log T} (M^j / 2^j)
		# (we add the present ourselves)
		# keep track of A at every time interval
		# initialized to zero
		self.aggregate_score = CMSketch(m, d)
		# n count-min sketches
		# we retain resolutions 1, 2, 4, ..., 2^n
		# move to next sketch (update curr_sketch) when
		# time unit filled = 2^i (its position in the list)
		self.cm_sketch_list = []
		for i in range(n):
			self.cm_sketch_list.append(CMSketch(m, d))

	def update_present_only(self, datum):
		"""Record one occurrence of ``datum`` in ``present`` without advancing time."""
		self.ready = False
		# don't update the full time
		# this is a sub-unit update
		self.present.add(datum, 1)

	@staticmethod
	def _find_l(t):
		"""Return the largest l such that 2**l divides ``t`` (0 when ``t`` is 0).

		Takes at most log t iterations.
		"""
		l = 0
		if t == 0:
			return l
		while t % 2 == 0:
			l += 1
			# floor division keeps t an exact integer on Python 2 and 3;
			# "/" yields a float on Python 3 and loses precision for large t
			t //= 2
		return l

	# data_block is a block of data, presented as an iterable object
	# the block of data consists of data that arrived in a single time unit
	# implements algorithm 2 from the paper
	# this structures maintains n CM-sketches, M0, M1, ..., Mn
	# M0 always holds [t-1, t] where t is current time
	# M1 always holds [t - tmod2 - 2, t - tmod2]
	# ...
	# Mn always holds [t - tmod(2^n) - 2^n, t - tmod(2^n)]
	# for t = 8, for example:
	# M0: [7, 8]
	# M1: [6, 8]
	# M2: [4, 8]
	# M3: [0, 8]
	# rest: 0
	def aggregate_unit(self, data_block):
		"""Fold one time unit of data into the sketches and the aggregate score.

		:param data_block: iterable of the items that arrived in this time unit;
			each item is added with frequency 1.
		"""
		# update time once per unit
		self.t += 1
		# convert the data_block into a CM sketch
		accumulator = CMSketch(self.m, self.d)

		# reset the present when we aggregate the whole thing
		self.present = CMSketch(self.m, self.d)
		# (data_block is the present)
		for data in data_block:
			accumulator.add(data, 1)
			# update present as we update the accumulator
			self.present.add(data, 1)

		self.ready = False
		# we update the whole structure with M_bar
		# l = max over all i such that (t mod 2^i) == 0
		# go up to the index that is l + 1, or the max index
		# if l + 1 >= to it
		for i in range(min(self._find_l(self.t) + 1, self.n)):
			# now we want to add the appropriate value: A + 1/2^(i)(M_bar - M^j)
			# M_bar - M^j
			difference = sketch_sum(accumulator, sketch_scalar_product(self.cm_sketch_list[i], -1))
			# A = A + (1/2)^i difference
			self.aggregate_score = sketch_sum(self.aggregate_score,
									sketch_scalar_product(difference, pow(0.5, i)))
			# temporary storage
			T = deepcopy(accumulator)
			# aggregate into accumulator for next round
			accumulator = sketch_sum(accumulator, self.cm_sketch_list[i])
			# set the value
			self.cm_sketch_list[i] = T
		# now we're ready to use CM-sketch values
		self.ready = True
		# reset the present now that we're done with one time block
		self.present = CMSketch(self.m, self.d)

	# we want to put these values into its own count-min sketch, (call it A)
	# updated in sync so as to not waste log T time summing
	# for each query.
	# this value will provide a key for our heap
	def query_slow(self, x):
		"""Return present(x) plus the sum over i of 0.5**i * cm_sketch_list[i](x)."""
		return self.present.query(x) + sum(pow(0.5, i) * self.cm_sketch_list[i].query(x) for i in range(self.n))

	# using a CMSketch to keep track of the score
	# note that we stored the 'scores' we calculated in CM-sketch
	# therefore it will pick the minimum of these
	# this is exactly equivalent to doing the sum over the minimums since we added termwise
	# (used matrix addition and scalar multiplication)
	def query(self, x):
		"""Return the aggregate score of ``x``, plus its present count when not ready."""
		if self.ready:
			return self.aggregate_score.query(x)
		else: # only if we're not ready
			return self.present.query(x) + self.aggregate_score.query(x)
 def test_add_greater_than_one(self):
     """A single add with a count of 123 is reported back as 123."""
     amount = 123
     sketch = CountMinSketch(10, 5)
     sketch.add("a", amount)
     self.assertEqual(sketch.query("a"), amount)
 def test_zero_at_start(self):
     """A fresh sketch reports 0 for keys of several different types."""
     probes = (0, 1, -1, tuple, tuple(), "", "yeah", object())
     sketch = CountMinSketch(10, 5)
     for probe in probes:
         self.assertEqual(sketch.query(probe), 0)
Exemplo n.º 26
0
def CountMin_Sketch(stream, k, h):
    """Return a ``CountMinSketch(k, h)`` with every element of ``stream`` added once."""
    result = CountMinSketch(k, h)
    for element in stream:
        result.add(element)
    return result
Exemplo n.º 27
0
def test():
    ''' basic testing method '''
    strategies = ('min', 'mean', 'mean-min')
    probe = str(0)

    print('build in memory check')
    cms = CountMinSketch(width=100000, depth=7)
    # key "i" is added with the count 100 * (i + 1)
    for i in range(100):
        cms.add(str(i), 100 * (i + 1))

    for strategy in strategies:
        print(cms.check(probe, strategy))
    cms.export('./dist/py_test.cms')

    print('import from disk check')
    cmsf = CountMinSketch(filepath='./dist/py_test.cms')
    # the reloaded sketch should report the same dimensions
    if cmsf.width != cms.width:
        print('width does not match!')
    if cmsf.depth != cms.depth:
        print('depth does not match!')

    for strategy in strategies:
        print(cmsf.check(probe, strategy))

    # constructing without arguments; a SyntaxError is caught and printed
    try:
        print('\n\nTest invalid initialization')
        CountMinSketch()
    except SyntaxError as ex:
        print(ex)
Exemplo n.º 28
0
# Create the process-info report and write its CSV header row, one group of
# column names per write() call; only the last call ends the line.
# NOTE(review): processinfo_file is not closed in the portion visible here.
processinfo_file = open(processinfo_filename, "w")
processinfo_file.write('Sketch Width, No of hash functions')
processinfo_file.write(',No of Trade Records, No of Stock Symbols')
processinfo_file.write(',CrossRef Not Present Count, CrossRef Present Count')
processinfo_file.write(',Not Present Count, Present Count,Error Count,Error %')
processinfo_file.write(',CMS Time, Sketch Time, Sketch Save Time, Sketch Query Time, CMS Query Time')
processinfo_file.write(',CMS Start Time\n')

# One pass per configured record count; the trade file name embeds that count.
for no_of_record in no_of_records:
    stock_trade_filename = source_data_dir + stock_etf + "_trade_" + time_interval + str(no_of_record) + '.csv'
    crossref_not_present_count = 0
    crossref_present_count = 0

    #print('CrossRef CMS Process Starts Time: ' + str(today.strftime("%X")) + ' ' + str(today.strftime("%f")))
    #CrossRef CMS Process Starts
    # Cross-reference sketch, built with the arguments (100000, 10).
    crossref_stock_trade_frq_cms = CountMinSketch(100000, 10)
    stock_trade_file = open(stock_trade_filename,"r")
    stock_trade_lines = csv.reader(stock_trade_file, delimiter=',', quoting=csv.QUOTE_ALL, skipinitialspace=True)
    # Skip the first line by advancing the underlying file before the reader
    # consumes any rows.
    next(stock_trade_file)

    #print('CrossRef CMS Create Time: ' + str(today.strftime("%X")) + ' ' + str(today.strftime("%f")))
    #CrossRef CMS Create
    # Add the stripped first column of every remaining row to the sketch.
    for stock_trade_line in stock_trade_lines:
        stock_symbol = stock_trade_line[0].strip()
        add1 = crossref_stock_trade_frq_cms.add(stock_symbol)
    stock_trade_file.close()

    #print('CrossRef CMS Membership Check Time: ' + str(today.strftime("%X")) + ' ' + str(today.strftime("%f")))
    #CrossRef CMS Membership Check
    check_stock_symbol_file = open(check_stock_symbol_filename, "r")
Exemplo n.º 29
0
    plt.plot(count_min_sketch, color='red', label="Count-Min Sketch")
    plt.legend()
    plt.show()
    plt.gcf().clear()

    plt.title('Plot with log scaling')
    plt.plot(actual_count, linewidth=5, label="Actual Count")
    plt.plot(count_min_sketch, color='red', label="Count-Min Sketch")
    plt.yscale('log')
    plt.legend()
    plt.show()


# Parse the tweet dump (one JSON document per line) into a list of dicts.
tweets = []
# The context manager closes the input file once every line has been parsed;
# opening it inline in the ``for`` header left the handle open.
with open('data/tweets.json.1', 'r', encoding='latin-1') as tweet_file:
    for line in tweet_file:
        tweets.append(json.loads(line))
data = json_normalize(tweets)

# Pairs of arguments for CountMinSketch to compare.
choices = [(100000, 10), (10000, 10), (1000, 10), (1000, 100), (1000, 5)]

# Run the sketching and the metrics/plots once per configuration.
for i in choices:
    sketch = CountMinSketch(i[0], i[1])
    print('\nData for CountMinSketch(', i[0], ',', i[1], ')\n')
    tag_count_df = data_sketching(data, sketch)
    metrics_and_plotting(tag_count_df)

#sketch = CountMinSketch(10000, 10)
#
#tag_count_df = data_sketching(data,sketch)
#metrics_and_plotting(tag_count_df)
Exemplo n.º 30
0
def build_countminsketch(ksup, w=1000, h=10):
    """Return a ``CountMinSketch(w, h)`` with every k-mer of ``ksup`` added once.

    ``ksup`` is a KmerSupplier; its ``iterkmers()`` provides the k-mers.
    """
    cms = CountMinSketch(w, h)
    kmers = ksup.iterkmers()
    for one_kmer in kmers:
        cms.add(one_kmer)
    return cms
Exemplo n.º 31
0
def test_merge():
    """Merging adds the other sketch's counts and leaves the other sketch unchanged."""
    word = 'hello'
    countmin1 = CountMinSketch(0.1, 0.1)
    countmin2 = CountMinSketch(0.1, 0.1)
    # one increment in each sketch, then fold the second into the first
    for one_sketch in (countmin1, countmin2):
        one_sketch.increment(word)
    countmin1.merge(countmin2)

    assert countmin2.estimate(word) == 1
    assert countmin1.estimate(word) == 2

    expected_top = set([('hello', 2.0)])
    assert set(countmin1.top_10_dict.items()) == expected_top
 def test_bad_sketch(self):
     """A zero in either positional argument must raise ValueError."""
     for first, second in ((0, 10), (10, 0)):
         with self.assertRaises(ValueError):
             CountMinSketch(first, second, seed=seeds)
Exemplo n.º 33
0
# Port numbers, kept as strings because they are interpolated into URLs below.
packetport = "5556"
bufferport = "5557"
ctrlPlanePort = "5560"
# Control-plane address comes from the first command-line argument.
ctrlplane_ip = sys.argv[1]
context = zmq.Context()

# REQ socket connected to the buffer port on localhost.
buffersock = context.socket(zmq.REQ)
buffersock.connect("tcp://localhost:%s" % bufferport)

# PUSH socket connected to the control plane at ctrlplane_ip:ctrlPlanePort.
ctrlPlaneSock = context.socket(zmq.PUSH)
ctrlPlaneSock.connect("tcp://" + ctrlplane_ip + ":%s" % ctrlPlanePort)

# Module-level sketch: one hash function per depth index.
depth = 10
width = 40000
hash_functions = [hash_function(i) for i in range(depth)]
sketch = CountMinSketch(depth, width, hash_functions)

# NOTE(review): time.clock() was removed in Python 3.8; this line only runs on
# older interpreters.
lastUpd = time.clock()
distinctFlows = []
distinctFlowsDelta = []


def has_ports(str):
    """Return True when "->" is one of the single-space-separated tokens of ``str``.

    The arrow must stand alone: "a -> b" matches, "a->b" does not.
    """
    # NOTE: the parameter shadows the builtin ``str``; the name is kept so the
    # signature stays unchanged.
    tokens = str.split(" ")
    return "->" in tokens


def get_port_type(str):
 def test_bad_init(self):
     """A zero in either constructor argument must raise ValueError."""
     for first, second in ((0, 5), (100, 0)):
         with self.assertRaises(ValueError):
             CountMinSketch(first, second)
Exemplo n.º 35
0
            stock_symbol_file.write(',Actual Trade Freq, APDS Trade Freq, Freq Accuracy')
            stock_symbol_file.write(',Actual Trade Volume, APDS Trade Volume, Vol Accuracy\n')

            #stock_input_filename = proj_dir + "stock_vol_files/" + stock_etf + time_interval + "_R" + str(no_of_record) + "_w" + str(width) + "_d" + str(depth) + "_input.csv"
            #stock_input_file = open(stock_input_filename, "w")
            #stock_input_file.write('Stock Symbol, Trade Date, Volume\n')

            apds_filename = proj_dir + "apds_files/" + stock_etf + time_interval + "_R" + str(no_of_record) + "_w" + str(width) + "_d" + str(depth) + "_freq.apds"

            stock_freq_dist = {}
            total_freq_accuracy = 0

            stock_vol_dist = {}
            total_vol_accuracy = 0

            stock_vol_apds = CountMinSketch(width, depth)
            stock_trade_record_count = 0
            vol_sketch_time = 0

            # add elements to sketch
            for stock_trade_line in stock_trade_lines:
                #print('stock_trade_record_count:',stock_trade_record_count,' no_of_record:',no_of_record)
                if stock_trade_record_count >= no_of_record: break
                stock_trade_record_count = stock_trade_record_count + 1
                stock_symbol = stock_trade_line[0].strip()
                trade_date = stock_trade_line[1].strip()
                stock_vol = int(stock_trade_line[7].strip())
                #stock_input_file.write(stock_symbol + "," + str(trade_date) + "," + str(stock_vol) + "\n")

                vol_sketch_starttime = time.process_time()
                apds_cmsadded = stock_vol_apds.add(stock_symbol,stock_vol)
Exemplo n.º 36
0
def node_countminsketch():
    """Return a new, empty ``CountMinSketch(6000, 10)``."""
    return CountMinSketch(6000, 10)
Exemplo n.º 37
0
#-
# Frequency using CSK
#-
from countminsketch import CountMinSketch
# Build the sketch: table size=1000, hash functions=10
ds = CountMinSketch(1000, 10)

# Insert the sample stream 1, 2, 1
for item in (1, 2, 1):
    ds.add(item)

# Every estimate must equal the exact count; 3 was never inserted
for key, expected in ((1, 2), (2, 1), (3, 0)):
    assert ds[key] == expected
Exemplo n.º 38
0
import sys
import csv
from countminsketch import CountMinSketch


# Set up the stream and some variables.


item_set = set()
currentCustomer=''
# Two CountMinSketch instances are used: one for the item count and one for
# the total revenue. Both are created with the arguments (10, 20).
# NOTE(review): the original note says 20 hashes are used to reduce hash
# collisions and that more hashes give a more accurate result, so the second
# argument may need to grow for bigger data sets - confirm against the
# library's parameter order.
# Because a product id is hashed and cannot be recovered from the sketch, the
# products themselves are stored in item_set (a set, so no duplicates).
# When the counts and revenue are retrieved, the strings stored in item_set
# are looked up again, since the same string always produces the same hash.
itemCount = CountMinSketch(10,20)
itemRevenue = CountMinSketch(10,20)
def salesRead(filename):
    """Lazily yield each data row of the CSV file ``filename`` as a dict.

    Rows are produced by ``csv.DictReader``, so the first line of the file
    supplies the keys. The file stays open until the generator is exhausted
    or closed.
    """
    with open(filename, 'r') as fi:
        for record in csv.DictReader(fi):
            yield record


# Now get the stream of data and process it


input_file = sys.argv[1]
out_file = sys.argv[2]
individual_cart = set()
for hod in salesRead(input_file):