예제 #1
0
    def test_hll_counting(self):
        """Adding a single value must yield a count of exactly one.

        The check is done twice: once through ``HyperLogLogDB`` (keyed by
        ``'test_key'``) and once through ``hll.HyperLogLog`` built on the
        object returned by ``self.init_hll_file``.  Each counter gets its own
        temporary file.
        """
        f = tempfile.NamedTemporaryFile(mode='r+b')
        # Close (and thereby delete) the temp file even if an assertion fails;
        # cleanups run after the test method has returned.
        self.addCleanup(f.close)
        test1 = HyperLogLogDB(fileobj=f, error_rate=self.error_rate)
        test1.add('test_key', 'test_val')

        f2 = tempfile.NamedTemporaryFile(mode='r+b')
        self.addCleanup(f2.close)
        mfile = self.init_hll_file(f2)
        test2 = hll.HyperLogLog(self.error_rate, mfile)
        test2.add('test_val')

        self.assertEqual(test1.count('test_key'), 1)
        self.assertEqual(len(test2), 1)
예제 #2
0
파일: UI.py 프로젝트: DongCiLu/VehicleCnt
    est = (numpy.log(va0) + numpy.log(vb0) - \
            numpy.log(vs1 + va0 + vb0 -1)) / \
            numpy.log(1 - (1 / sz))

    return est


# Script entry point: create two groups of cardinality counters and feed them
# two overlapping integer ranges (0..9999 and 8000..19999, which share the
# 2000 values 8000..9999).
if __name__ == '__main__':
    # Constructor arguments for the counters created below.
    minc_size = 1024
    hllc_exp_error = 0.02
    lpc_size = 4

    mc1 = mincount.MinCount(minc_size)
    mc2 = mincount.MinCount(minc_size)
    hll1 = hyperloglog.HyperLogLog(hllc_exp_error)
    hll2 = hyperloglog.HyperLogLog(hllc_exp_error)
    # just for naive implementation
    hll_total = hyperloglog.HyperLogLog(hllc_exp_error)
    lpc1 = lp_counters.LPCounter(lpc_size)
    lpc2 = lp_counters.LPCounter(lpc_size)

    # First range: every value goes into the "1" counters and into hll_total.
    for i in range(0, 10000):
        mc1.add(i)
        hll1.add(i)
        hll_total.add(i)
        lpc1.increment(i)

    # Second range: values go into the "2" counters.
    # NOTE(review): the snippet ends here; hll_total and lpc2 are not fed in
    # the visible part of this loop -- confirm against the full file.
    for i in range(8000, 20000):
        mc2.add(i)
        hll2.add(i)
예제 #3
0
# Script entry point (Python 2 print statements): for a given input size,
# create one counter of each kind and feed all of them the same set of
# distinct random integers.
if __name__ == '__main__':
    # input_size = 1000000 # 1 million distinct vehicles
    input_size = 5000  # initial number of distinct vehicles
    total_vehicles = 254639386  # total number of possible vehicles
    lpc_size = 128 * 8192  # linear prob counter in KB size
    hllc_exp_error = 0.02  # preset error rate for hyperloglog counter
    minc_size = 1024  # k for min count

    lpc_error_list = []
    hllc_error_list = []

    # NOTE(review): input_size is not modified in the visible part of this
    # loop; presumably it is increased further down -- confirm in the full file.
    while (input_size <= 10000000):
        print "---------------------------------"
        print "Input size: ", input_size
        # Fresh counters for every input size.
        lpc = lp_counters.LPCounter(lpc_size)
        hllc = hyperloglog.HyperLogLog(hllc_exp_error)
        mc = mincount.MinCount(minc_size)
        print 'lpc size: ', lpc.get_size()

        # Draw random values from [0, total_vehicles) until `items` holds
        # input_size distinct ones.
        items = set()
        while len(items) < input_size:
            i = random.randrange(0, total_vehicles)
            # Redraw until the value has not been seen before.
            while i in items:
                i = random.randrange(0, total_vehicles)
            items.add(i)

            # Every new distinct value is fed to all three counters.
            lpc.increment(i)
            hllc.add(i)
            mc.add(i)

        lpc_count = lpc.current_count()
def _accuracy_percent(estimate, exact):
    """Return how close *estimate* is to *exact*, as a percentage.

    Computed as ``100 - |estimate - exact| / exact * 100``.  When *exact* is
    zero the relative error is undefined, so NaN is returned instead of
    raising ``ZeroDivisionError``.
    """
    if exact == 0:
        return float("nan")
    return 100 - (abs(estimate - exact) / exact) * 100


def main():
    """Compare exact distinct counting with two HyperLogLog variants.

    For sizes 10, 100, ..., 10**7 the file ``<size>data1.txt`` is read, one
    entry per line (blank lines are ignored).  Three cardinalities are
    computed and printed together with their timings:

    * "Brute Force": ``countDistinct`` -- the timing also includes reading
      and splitting the file.
    * "Original  HLL": ``hll.HyperLogLog()`` -- only ``card()`` is timed,
      not the ``add`` calls.
    * "Modified HLL": ``HyperLogLog(0.01, hll_hashing, changeBias)`` -- only
      ``card()`` is timed.

    Timings are ``time.time()`` differences multiplied by 1000.
    """
    size = 10  # number of entries encoded in the data file name
    hll_hashing = "sha256"
    changeBias = 0
    for _ in range(0, 7):

        for j in range(1, 2):

            # NOTE(review): `hashing` and `change_bias` are selected per size
            # but never used -- the modified HLL below always receives
            # `hll_hashing` and `changeBias`.  Also, size == 100 matches none
            # of the three conditions.  Confirm whether these values were
            # meant to be passed to HyperLogLog.
            if size < 100:
                change_bias = -1.5
                hashing = "blake2b"
            if 100 < size <= 10000:
                change_bias = 0.5
                hashing = "sha256"
            if size > 10000:
                hashing = "sha512"
                change_bias = 0.1
            file_name = str(size) + "data" + str(j) + ".txt"
            start1 = time.time()
            # Context manager guarantees the file handle is released.
            with open(file_name, "r") as f:
                content = f.read()

            # Drop empty entries (e.g. the one produced by a trailing
            # newline) without failing when there is none.
            num = [entry for entry in content.split('\n') if entry]

            x1 = countDistinct(num)
            end1 = time.time()

            hyLog = hll.HyperLogLog()
            for n in num:
                hyLog.add(n)
            start = time.time()
            x = hyLog.card()
            end = time.time()

            hyLog_mod = HyperLogLog(0.01, hll_hashing, changeBias)
            for n in num:
                hyLog_mod.add(n)
            start_mod = time.time()
            x_mod = hyLog_mod.card()
            end_mod = time.time()

            print(hyLog.p, hyLog.m)
            print("\n")
            print("*************************************************")
            print("File Name - ", file_name)
            print("Number of Entries - ", size)
            print("\n")
            print("Brute Force - ")
            print("Cardinality: ", x1, "\tTimeTaken: ", (end1 - start1) * 1000)
            print("\n")
            print("Original  HLL - ")
            print("Cardinality: ", x, "\tTimeTaken: ", (end - start) * 1000,
                  "\nAccuracy: ", _accuracy_percent(x, x1))
            print("\n")
            print("Modified HLL - ")
            print("Cardinality: ", x_mod, "\tTimeTaken: ",
                  (end_mod - start_mod) * 1000, "\nAccuracy: ",
                  _accuracy_percent(x_mod, x1))
            print("*************************************************")
            print("\n")
        size *= 10
예제 #5
0
         log(2 * size_a * exp_error))) * lpc_load_factor
 hllc_exp_error = exp_error
 minc_size = int(ceil(96 / (exp_error ** 2))) * \
         minc_load_factor
 # repeat the experiment several times with same setting
 lpc_a_error = 0
 hllc_a_error = 0
 mc_a_error = 0
 lpc_error = 0
 hllc_error = 0
 mc_error = 0
 for exp_i in range(repeat_cnt):
     # initiate counters
     lpc_a = lp_counters.LPCounter(lpc_size)
     lpc_b = lp_counters.LPCounter(lpc_size)
     hllc_a = hyperloglog.HyperLogLog(hllc_exp_error)
     hllc_b = hyperloglog.HyperLogLog(hllc_exp_error)
     # simplify solution for hll
     hllc_u = hyperloglog.HyperLogLog(hllc_exp_error)
     mc_a = mincount.MinCount(minc_size)
     mc_b = mincount.MinCount(minc_size)
     # create items
     common = size_a * js
     items_a = set()
     items_b = set()
     while len(items_a) < size_a:
         while True:
             i = random.randrange(0, total_vehicles)
             if i not in items_a:
                 items_a.add(i)
                 lpc_a.increment(i)