class BalanedData:
    """Accumulate rows for pre-registered users plus a capped number of others.

    Every id in ``clickedUsers`` is registered in ``collectedDataUsersFilter``
    at construction time.  ``addUserRow`` then keeps a row when its user id is
    already registered, and otherwise registers the id (and keeps the row)
    only while fewer new ids have been admitted than ``len(clickedUsers)``.

    NOTE(review): membership is whatever ``BloomFilter.contains`` reports;
    presumably this is a probabilistic filter, so false positives would let
    extra rows through -- confirm against the BloomFilter implementation.
    """

    def __init__(self, filterSize, hashCount, clickedUsers):
        """Create the filter and register every id found in ``clickedUsers``.

        ``filterSize`` and ``hashCount`` are forwarded positionally to
        ``BloomFilter``; ``clickedUsers`` must support ``len()`` and iteration.
        """
        # Rows kept so far, in insertion order.
        self.allData = []
        # Upper bound on how many previously unseen ids may be admitted.
        self.clickedCounter = len(clickedUsers)
        # Number of previously unseen ids admitted so far.
        self.noClickedCounter = 0
        self.collectedDataUsersFilter = BloomFilter(filterSize, hashCount)
        self.__addUsers(clickedUsers)

    def __addUsers(self, clickedUsersIds):
        """Register each id of ``clickedUsersIds`` in the filter."""
        for oneId in clickedUsersIds:
            self.__addUser(oneId)

    def __addUser(self, userId):
        """Register a single id in the filter."""
        self.collectedDataUsersFilter.add(userId)

    def addUserRow(self, userId, row):
        """Keep ``row`` if ``userId`` is registered or can still be admitted.

        An unregistered id is admitted only while ``noClickedCounter`` is
        below ``clickedCounter``; once admitted, all of its later rows are
        kept as well.  Rows of rejected ids are silently dropped.
        """
        alreadyKnown = self.collectedDataUsersFilter.contains(userId)
        if not alreadyKnown:
            if self.noClickedCounter >= self.clickedCounter:
                # Quota of newly admitted ids is used up: drop the row.
                return
            self.__addUser(userId)
            self.noClickedCounter += 1
        self.allData.append(row)
def sampleData(file1, file2, column, column2='fc20',
               filterSize=13419082, hashCount=23):
    """Count how many user ids of ``file2`` are reported as present in ``file1``.

    The ids read from ``file1`` are stringified and inserted into a
    ``BloomFilter``; every id read from ``file2`` is then stringified and
    tested against that filter.

    Args:
        file1: Source passed to ``userIds`` to build the filter.
        file2: Source passed to ``userIds`` to probe the filter.
        column: Column passed to ``userIds`` for ``file1``.
        column2: Column passed to ``userIds`` for ``file2``.  Defaults to
            ``'fc20'``, the value that used to be hard-coded.
            NOTE(review): confirm whether ``file2`` was meant to use
            ``column`` as well.
        filterSize: First positional argument of ``BloomFilter``
            (default: the previously hard-coded 13419082).
        hashCount: Second positional argument of ``BloomFilter``
            (default: the previously hard-coded 23).

    Returns:
        A ``(same, diff)`` tuple: the number of ``file2`` ids the filter
        reports as contained, and the number it reports as absent.
        Duplicated ids in ``file2`` are counted once per occurrence.
    """
    # Named seenUsers rather than `filter` so the builtin is not shadowed.
    seenUsers = BloomFilter(filterSize, hashCount)
    for user in userIds(file1, column):
        seenUsers.add(str(user))

    same = 0
    diff = 0
    for user in userIds(file2, column2):
        if seenUsers.contains(str(user)):
            same += 1
        else:
            diff += 1
    return same, diff
def _time_adds(add, input_size):
    """Return the average seconds per ``add(str(i))`` for i in [0, input_size).

    ``input_size`` must be positive.
    """
    start = time.perf_counter()
    for i in range(input_size):
        add(str(i))
    return (time.perf_counter() - start) / input_size


def _time_lookups(container, input_size):
    """Time ``str(i) in container`` for i in [input_size, 2 * input_size).

    None of the probed keys is inserted by ``_time_adds``, so every hit is a
    false positive of the container.

    Returns:
        ``(avg_lookup_time, hit_count)``; ``input_size`` must be positive.
    """
    start = time.perf_counter()
    hit_count = 0
    for i in range(input_size, input_size * 2):
        if str(i) in container:
            hit_count += 1
    elapsed = time.perf_counter() - start
    return elapsed / input_size, hit_count


def _print_avg_times(avg_lookup_time, avg_add_time):
    """Print the average lookup/add times with 20 decimal places."""
    print("Avg lookup time :" + '{:.20f}'.format(avg_lookup_time)
          + " Avg add time:" + '{:.20f}'.format(avg_add_time))


def _report_filter(title, bloom, fp_count, input_size,
                   avg_lookup_time, avg_add_time):
    """Print a blank line followed by the three-line report of one filter.

    ``title`` is emitted verbatim in front of the false-positive line; the
    memory figure is ``memory_usage.get_obj_size(bloom)`` plus
    ``bloom.get_bitarray_size()``.
    """
    print()
    print(title + "\nFalse positive count:" + str(fp_count) + " in "
          + str(input_size) + " try. " + str(fp_count / input_size)
          + " rate of false positive")
    _print_avg_times(avg_lookup_time, avg_add_time)
    print("Memory usage in bytes :" + str(
        memory_usage.get_obj_size(bloom) + bloom.get_bitarray_size()))


def main():
    """Benchmark several Bloom filter variants against a dict and a list.

    For each container, ``input_size`` string keys are inserted and
    ``input_size`` never-inserted keys are looked up; average add/lookup
    times, false-positive counts and memory figures are printed.
    """
    input_size = 10000
    fp_rate = 0.01
    count_size = 4

    # --- standard Bloom filter ---------------------------------------------
    bloom_filter = BloomFilter(input_size, fp_rate)
    avg_add_time = _time_adds(bloom_filter.add, input_size)
    avg_lookup_time, fp_count = _time_lookups(bloom_filter, input_size)
    print("Expected false positive rate for all calculations is :"
          + str(fp_rate))
    _report_filter("For Standard Bloom Filter : ", bloom_filter, fp_count,
                   input_size, avg_lookup_time, avg_add_time)

    # --- shifting Bloom filter ---------------------------------------------
    shifting_bloom_filter = ShiftingBloomFilterM(input_size, fp_rate)
    avg_add_time = _time_adds(shifting_bloom_filter.add, input_size)
    avg_lookup_time, fp_count = _time_lookups(shifting_bloom_filter,
                                              input_size)
    _report_filter("For Shifting Bloom Filter : ", shifting_bloom_filter,
                   fp_count, input_size, avg_lookup_time, avg_add_time)

    # --- counting Bloom filter ---------------------------------------------
    counting_bloom_filter = CountingBloomFilter(
        input_size, fp_rate, count_size=count_size)
    avg_add_time = _time_adds(counting_bloom_filter.add, input_size)
    avg_lookup_time, fp_count = _time_lookups(counting_bloom_filter,
                                              input_size)
    # Sanity check: print every inserted key the filter fails to report.
    for i in range(input_size):
        if str(i) not in counting_bloom_filter:
            print(str(i))
    _report_filter("For counting filter :", counting_bloom_filter, fp_count,
                   input_size, avg_lookup_time, avg_add_time)

    # --- scalable Bloom filter ---------------------------------------------
    scalable_bloom_filter = ScalableBloomFilter(
        fp_prob=fp_rate, growth=ScalableBloomFilter.SMALL_GROWTH)
    avg_add_time = _time_adds(scalable_bloom_filter.add, input_size)
    avg_lookup_time, fp_count = _time_lookups(scalable_bloom_filter,
                                              input_size)
    _report_filter("For scalable filter :", scalable_bloom_filter, fp_count,
                   input_size, avg_lookup_time, avg_add_time)

    # --- counting scalable Bloom filter --------------------------------------
    c_scalable_bloom_filter = ScalableBloomFilter(
        fp_prob=fp_rate, growth=ScalableBloomFilter.SMALL_GROWTH,
        countable=True, count_size=count_size)
    avg_add_time = _time_adds(c_scalable_bloom_filter.add, input_size)
    avg_lookup_time, fp_count = _time_lookups(c_scalable_bloom_filter,
                                              input_size)
    _report_filter("For counting scalable filter :", c_scalable_bloom_filter,
                   fp_count, input_size, avg_lookup_time, avg_add_time)

    # --- counter statistics --------------------------------------------------
    # NOTE(review): the index range comes from the sub-filters of
    # c_scalable_bloom_filter, but the counters are read from the standalone
    # counting_bloom_filter.  Behaviour is kept as found; confirm which
    # filter these statistics are meant to describe.
    size_sum = 0
    filled_bit_count = 0
    max_count = 0
    for sub_filter in c_scalable_bloom_filter.bloom_filters:
        for i in range(len(sub_filter)):
            count = counting_bloom_filter.get_bit_value(i)
            if count > 0:
                size_sum += count
                filled_bit_count += 1
                max_count = max(max_count, count)
    # Guard against a ZeroDivisionError when no counter is set at all.
    avg_size = size_sum / filled_bit_count if filled_bit_count else 0
    print("For counting filter -------- avg count:" + str(avg_size))
    print("For counting filter-------- max count:" + str(max_count))

    # --- dict baseline -------------------------------------------------------
    hasmap = {}
    # Timed inline (not via _time_adds) because the stored value is the
    # integer i, not the key.
    start_time = time.perf_counter()
    for i in range(input_size):
        hasmap[str(i)] = i
    avg_add_time = (time.perf_counter() - start_time) / input_size
    avg_lookup_time, _ = _time_lookups(hasmap, input_size)
    print()
    print("For Hashmap ")
    _print_avg_times(avg_lookup_time, avg_add_time)
    print("Memory usage in bytes :" + str(memory_usage.get_obj_size(hasmap)))

    # --- list baseline -------------------------------------------------------
    py_list = []
    avg_add_time = _time_adds(py_list.append, input_size)
    # Linear-scan membership is intentional: it is the baseline being measured.
    avg_lookup_time, _ = _time_lookups(py_list, input_size)
    print()
    print("For List ")
    _print_avg_times(avg_lookup_time, avg_add_time)
    print("Memory usage in bytes :" + str(memory_usage.get_obj_size(py_list)))

    # --- debug output: sub-filter sizes of the counting scalable filter ------
    temp = sum(memory_usage.get_obj_size(sub_filter)
               for sub_filter in c_scalable_bloom_filter.bloom_filters)
    print(
        "aaa" + str(memory_usage.get_obj_size(
            c_scalable_bloom_filter.bloom_filters)))
    print("xxx" + str(temp))