Exemplo n.º 1
0
class BalanedData:
    """Accumulates rows while keeping the two user classes balanced.

    Every user seeded at construction time (the "clicked" users) is
    pre-registered in a Bloom filter; their rows are always accepted.
    Rows from unseen users are admitted only until the number of admitted
    non-clicked users reaches the number of clicked users.
    """

    def __init__(self, filterSize, hashCount, clickedUsers):
        # Rows accepted so far.
        self.allData = []
        # Size of the clicked class; caps how many non-clicked users we admit.
        self.clickedCounter = len(clickedUsers)
        self.noClickedCounter = 0
        # Probabilistic membership set of every admitted user id.
        self.collectedDataUsersFilter = BloomFilter(filterSize, hashCount)
        self.__addUsers(clickedUsers)

    def __addUsers(self, clickedUsersIds):
        # Seed the filter with all clicked users.
        for uid in clickedUsersIds:
            self.__addUser(uid)

    def __addUser(self, userId):
        self.collectedDataUsersFilter.add(userId)

    def addUserRow(self, userId, row):
        """Accept `row` if `userId` was already admitted; otherwise admit
        the user as a non-clicked sample while the classes are unbalanced."""
        if self.collectedDataUsersFilter.contains(userId):
            self.allData.append(row)
            return
        if self.clickedCounter > self.noClickedCounter:
            self.__addUser(userId)
            self.noClickedCounter += 1
            self.allData.append(row)
Exemplo n.º 2
0
 def __init__(self, redisName, filterName):
     """Wire up the crawler's shared state: browser-like request headers,
     the Redis-backed work queue, the duplicate filter, a DB commit lock,
     and the database connection/cursor (`logger` and `conn` come from
     module scope)."""
     user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
     self.headers = {'User-Agent': user_agent}
     self.lock_commit = Lock()
     self.logger = logger
     self.start_queue = RdsQueue(redisName)
     self.filter = BloomFilter(filterName)
     self.conn = conn
     self.cursor = self.conn.cursor()
Exemplo n.º 3
0
def run(seed):
    bf = BloomFilter(0.00001,1000000)   #初始化布隆过滤器
    queue = Queue.Queue(maxsize = 0)        #初始化URL队列
    urlCount = 0                        #初始化已得到URL变量
    urlList = []                        #初始化下载列表
    queue.put(seed)
    
    while(queue.empty() == False):
        currentURL = queue.get()
        urlList.append(currentURL)
        print 'currentURL',to_bytestring(currentURL)
        
        try:    #timeout处理
            html = urllib.urlopen(currentURL)
        except:
            continue

        bs_obj = BeautifulSoup(html,'html.parser')
        a_list = bs_obj.findAll('a') + bs_obj.findAll('img')

        for aa in a_list:
            if aa.attrs.get('href'):
                hrefStr = aa.attrs.get('href')
            else:
                hrefStr = aa.attrs.get('src')

            if hrefStr:
                hrefStr = is_relativeURL(hrefStr,currentURL)
                if hrefStr == -1:     #判断相对/绝对路径
                    continue
                if is_needURL(hrefStr) == True:         #判断是否需要抓取
                    if bf.is_element_exist(hrefStr) == False:   #布隆过滤
                        bf.insert_element(hrefStr)
                        print to_bytestring(hrefStr)
                        if is_resourceFile(hrefStr) == False:  #判断是否是资源文件
                            queue.put(hrefStr)
                    
                        urlList.append(hrefStr)
                        try:
                            downloadHtml(hrefStr)
                        except:
                                pass
                    urlCount = urlCount + 1
        print '所有--当前',urlCount,len(urlList)
Exemplo n.º 4
0
def sampleData(file1, file2, column, column2='fc20'):
    """Estimate the user-id overlap between two files.

    Loads the ids of `file1` (column `column`) into a Bloom filter, then
    counts how many ids of `file2` (column `column2`, default 'fc20' for
    backward compatibility) appear to be present.

    Returns:
        (same, diff): counts of probably-shared and definitely-new ids.
        `same` may slightly overcount due to Bloom-filter false positives.
    """
    # Named `bloom` rather than `filter` to avoid shadowing the builtin.
    bloom = BloomFilter(13419082, 23)

    for user in userIds(file1, column):
        bloom.add(str(user))

    same = 0
    diff = 0
    for user in userIds(file2, column2):
        if bloom.contains(str(user)):
            same += 1
        else:
            diff += 1

    return same, diff
Exemplo n.º 5
0
 def __init__(self, transactions, items, numReduce):
     """Index `transactions` by item, pre-hashing every transaction index
     with the Bloom filter's hash functions so add_transaction can reuse
     the hashes instead of recomputing them per item."""
     self.__numReduce = numReduce
     self.__num_transaction = len(transactions)
     self.__items = items
     self.__transaction_index_map = {}
     # Borrow the hash-function list from a throwaway BloomFilter sized for
     # num_transaction / numReduce entries at a 5% false-positive rate.
     self.__listHashFunc = BloomFilter(
         int(self.__num_transaction / self.__numReduce), 0.05).listHashFunc
     # Hash each transaction index (as a string) through every function once.
     transactionsHashed = [
         [hashFunc(str(idx)) for hashFunc in self.__listHashFunc]
         for idx in range(self.__num_transaction)
     ]
     for hashed, transaction in zip(transactionsHashed, transactions):
         self.add_transaction(transaction, hashed)
Exemplo n.º 6
0
class SpiderMan:
    """Multi-threaded crawler skeleton.

    Workers pop ids from a Redis-backed queue, de-duplicate them with a
    Bloom filter, fetch/parse detail pages and batch-insert the parsed
    rows into the database.  `postParam` and `parse_detail` are stubs
    intended to be overridden by concrete spiders.
    """

    insert_sql = "insert into artist(artistId, artistName) values(%s,%s)"

    def __init__(self, redisName, filterName):
        self.headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
        }
        self.start_queue = RdsQueue(redisName)  # Redis-backed work queue
        self.filter = BloomFilter(filterName)   # dedup of crawled ids
        self.lock_commit = Lock()               # serializes DB commits
        self.logger = logger
        self.conn = conn                        # module-level DB connection
        self.cursor = self.conn.cursor()

    # Send the POST request
    def detailRequest(self, url, data, encode="utf-8"):
        """POST `data` to `url`; return the Response on HTTP 200, else None.

        NOTE(review): self.headers is never attached to the request —
        confirm whether the User-Agent header should be passed here.
        """
        try:
            resp = requests.post(url, data=data)
            resp.encoding = encode
            if resp.status_code == 200:
                return resp
        except Exception as e:
            print(f"爬取失败!{e}")

    # Generate `length` random characters
    def generate_random_strs(self, length):
        """Return `length` random alphanumeric characters as UTF-8 bytes."""
        string = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
        i = 0
        random_strs = ""
        while i < length:
            # Uniform index into the alphabet: floor(random() * len).
            e = math.floor(random.random() * len(string))
            random_strs = random_strs + string[e]
            i = i + 1
        return bytes(random_strs, encoding="utf8")

    # AES encryption (CBC mode, Base64-encoded output)
    def AESencrypt(self, msg, key):
        """AES-CBC encrypt `msg` with `key`; return Base64 text."""
        # Pad to a multiple of 16 bytes using chr(pad_len) as the pad
        # character (PKCS#7-style padding).
        padding = 16 - len(msg) % 16
        msg = msg + padding * chr(padding)
        # Fixed 16-byte initialization vector.
        iv = '0102030405060708'
        aes = AES.new(key.encode("utf-8"),
                      IV=iv.encode("utf-8"),
                      mode=AES.MODE_CBC)
        # Encrypt, Base64-encode, then decode the bytes to a utf-8 string.
        encryptedbytes = aes.encrypt(msg.encode("utf-8"))
        encodestrs = base64.b64encode(encryptedbytes)
        enctext = encodestrs.decode('utf-8')
        return enctext

    # RSA encryption of the random string
    def RSAencrypt(self, randomstrs, key, f):
        """Textbook RSA: hex(reversed(randomstrs)) ** key mod f.

        `key` (public exponent) and `f` (modulus) are hex strings; the
        result is a 256-character zero-padded hex string.
        """
        # The random string is reversed before encryption.
        string = randomstrs[::-1]
        # 3-arg pow does modular exponentiation directly; the original
        # `** ... %` computed the full power first, which is vastly slower
        # for large exponents while producing the identical result.
        seckey = pow(int(codecs.encode(string, encoding='hex'), 16),
                     int(key, 16), int(f, 16))
        return format(seckey, 'x').zfill(256)

    # Build the POST parameters for one detail id
    def postParam(self, detailId):
        """Stub: subclasses return the parameter payload for `detailId`.

        NOTE(review): this base implementation returns the `dict` type
        itself, not an instance — it only works once overridden.
        """
        return dict

    # Produce the encrypted request parameters
    def get_params(self, detailId):
        """Return (params, encSecKey) for `detailId`: the payload after
        two AES passes, plus the pre-computed RSA-encrypted secret key."""
        d = self.postParam(detailId)
        # Fixed first-pass AES key.
        g = '0CoJUm6Qyw8W8jud'
        # RSA modulus (hex).
        f = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
        # RSA public exponent (hex).
        e = '010001'

        # 16-char random string, fixed here so encSecKey can be fixed too.
        # i = self.generate_random_strs(16)
        i = 'kEhRsbVFNaFQEOaG'
        # encSecKey = RSAencrypt(i, e, f), pre-computed for the fixed `i`.
        encSecKey = 'a200e63459c28899f38dd1866058664c3fc10567b9d72ef91378dedf3971075e45732976768705520ee58a55d0e2a3b72ff8fe351e16651af42d001e77cf1c823006a8974cb88c986d1525cfe71935db2ec7a1b3677dfc670dbcdc4e58820fc31ade511a79a8e910a28d542fd44b7f67958468bd41d73d2ade5268565ac9f5de'

        # Two AES passes: first with the site key, then with the random key.
        enctext = self.AESencrypt(d, g)
        encText = self.AESencrypt(enctext, i)
        return encText, encSecKey

    # Insert rows into the database
    def insert_data(self, sql, detailList):
        """Batch-insert `detailList` using `sql`, serialized by the commit
        lock so concurrent workers cannot interleave commits."""
        try:
            # `with` guarantees the lock is released even if executemany or
            # commit raises; the original acquire/release pair leaked the
            # lock on failure, deadlocking every other worker thread.
            with self.lock_commit:
                self.cursor.executemany(sql, detailList)
                self.conn.commit()
            self.logger.info(f'insert {detailList[-1]} success')
        except Exception as e:
            self.logger.debug(e)

    # Parse one detail page
    def parse_detail(self, detailId):
        """Stub: subclasses return the parsed row(s) for `detailId`
        (consumed by insert_data)."""
        return

    # Generic crawl loop
    def getdetailInfo(self):
        """Worker loop: pop ids, skip already-crawled ones, accumulate one
        parsed row per id and flush the batch to the DB every 10 rows or
        when the queue drains; exits once the queue is empty."""
        p = 0             # rows accumulated since the last flush
        detailList = []
        while True:
            if self.start_queue.queueLen():
                detailId = self.start_queue.pop()
                if self.filter.isContains(detailId):
                    self.logger.debug(f"{detailId} has been crawled")
                    continue
                self.filter.insert(detailId)
                try:
                    detailList.append(self.parse_detail(detailId))
                    p += 1
                    if p == 10 or self.start_queue.queueLen() == 0:
                        if detailList != []:
                            self.insert_data(self.insert_sql, detailList)
                            detailList = []
                        else:
                            self.logger.debug('no data to sql')
                        p = 0
                except Exception as e:
                    self.logger.debug(f"{detailId} {e}")
            elif detailList != []:
                # Queue drained with unflushed rows: flush and stop.
                self.insert_data(self.insert_sql, detailList)
                detailList = []
                break
            else:
                # Queue drained and nothing pending: stop.  The original
                # had no exit here and busy-spun forever, so worker threads
                # (and main's join) never finished.
                break

    # Crawl loop variant 1
    def getdetailsInfo(self):
        """Like getdetailInfo, but parse_detail is expected to return a
        complete row list which replaces (not extends) the batch."""
        p = 0
        detailList = []
        while True:
            try:
                if self.start_queue.queueLen():
                    detailId = self.start_queue.pop()
                    if self.filter.isContains(detailId):
                        self.logger.debug(f"{detailId} has been crawled")
                        continue
                    self.filter.insert(detailId)

                    detailList = self.parse_detail(detailId)
                    p += 1
                    if p == 10 or self.start_queue.queueLen() == 0:
                        if detailList != []:
                            self.insert_data(self.insert_sql, detailList)
                            detailList = []
                        else:
                            self.logger.debug('no data to sql')
                        p = 0

                elif detailList != []:
                    self.insert_data(self.insert_sql, detailList)
                    detailList = []
                    break
                else:
                    # Queue drained and nothing pending: stop instead of
                    # busy-spinning forever (same fix as getdetailInfo).
                    break
            except Exception as e:
                self.logger.debug(e)

    # Run `function` on `num` threads and wait for all of them
    def multi_task(self, function, num):
        """Start `num` threads running `function` and join them all."""
        tasks = []
        for _ in range(num):
            thread = Thread(target=function)
            thread.start()
            tasks.append(thread)
        for th in tasks:
            th.join()

    def main(self, num):
        """Entry point: crawl with `num` worker threads."""
        self.multi_task(self.getdetailInfo, num)
Exemplo n.º 7
0
 def add_transaction(self, transaction, hashed):
     """Register a pre-hashed transaction under every item it contains,
     creating that item's Bloom filter on first sight."""
     index_map = self.__transaction_index_map
     for item in transaction:
         if item not in index_map:
             # New item: give it a filter sized for num_transaction /
             # numReduce entries at a 5% false-positive rate.
             index_map[item] = BloomFilter(
                 int(self.__num_transaction / self.__numReduce), 0.05)
         index_map[item].addHashed(hashed)
Exemplo n.º 8
0
def main():
    """Benchmark several Bloom-filter variants against a dict and a list.

    For each structure: time `input_size` insertions, then `input_size`
    lookups of keys that were never inserted (so every membership hit is
    a false positive), and report the measured false-positive rate, the
    average add/lookup times and an approximate memory footprint.
    """
    input_size = 10000
    fp_rate = 0.01

    # Bits per counter for the counting filter variants.
    count_size = 4

    # ---- Standard Bloom filter ----
    bloom_filter = BloomFilter(input_size, fp_rate)
    start_time = time.time()
    for i in range(0, input_size):
        bloom_filter.add(str(i))
    end_time = time.time()
    avg_add_time = (end_time - start_time) / input_size

    start_time = time.time()
    fp_count = 0
    # Keys input_size..2*input_size-1 were never added: any hit is a
    # false positive.
    for i in range(input_size, input_size * 2):
        if str(i) in bloom_filter:
            fp_count += 1
    end_time = time.time()
    avg_lookup_time = (end_time - start_time) / input_size

    print("Expected false positive rate for all calculations is :" +
          str(fp_rate))
    print()
    print("For Standard Bloom Filter : \nFalse positive count:" +
          str(fp_count) + "  in " + str(input_size) + " try. " +
          str((fp_count / input_size)) + " rate of false positive")
    print("Avg lookup time :" + str('{:.20f}'.format(avg_lookup_time)) +
          "  Avg add time:" + str('{:.20f}'.format(avg_add_time)))
    print("Memory usage in bytes :" + str(
        memory_usage.get_obj_size(bloom_filter) +
        bloom_filter.get_bitarray_size()))

    # ---- Shifting Bloom filter ----
    shifting_bloom_filter = ShiftingBloomFilterM(input_size, fp_rate)
    start_time = time.time()
    for i in range(0, input_size):
        shifting_bloom_filter.add(str(i))
    end_time = time.time()
    avg_add_time = (end_time - start_time) / input_size

    start_time = time.time()
    fp_count = 0
    for i in range(input_size, input_size * 2):
        if str(i) in shifting_bloom_filter:
            fp_count += 1
    end_time = time.time()
    avg_lookup_time = (end_time - start_time) / input_size

    print()
    print("For Shifting Bloom Filter : \nFalse positive count:" +
          str(fp_count) + "  in " + str(input_size) + " try. " +
          str((fp_count / input_size)) + " rate of false positive")
    print("Avg lookup time :" + str('{:.20f}'.format(avg_lookup_time)) +
          "  Avg add time:" + str('{:.20f}'.format(avg_add_time)))
    print("Memory usage in bytes :" + str(
        memory_usage.get_obj_size(shifting_bloom_filter) +
        shifting_bloom_filter.get_bitarray_size()))

    # ---- Counting Bloom filter ----
    counting_bloom_filter = CountingBloomFilter(input_size,
                                                fp_rate,
                                                count_size=count_size)

    start_time = time.time()
    for i in range(0, input_size):
        counting_bloom_filter.add(str(i))
    end_time = time.time()
    avg_add_time = (end_time - start_time) / input_size

    start_time = time.time()
    fp_count = 0
    for i in range(input_size, input_size * 2):
        if str(i) in counting_bloom_filter:
            fp_count += 1

    end_time = time.time()
    avg_lookup_time = (end_time - start_time) / input_size
    # Sanity check: a Bloom filter must report every inserted key as
    # present; print any missing key (expected output: none).
    for i in range(0, input_size):
        if not str(i) in counting_bloom_filter:
            print(str(i))
    print()
    print("For counting filter :\nFalse positive count:" + str(fp_count) +
          "  in " + str(input_size) + " try. " + str((fp_count / input_size)) +
          " rate of false positive")
    print("Avg lookup time :" + str('{:.20f}'.format(avg_lookup_time)) +
          "  Avg add time:" + str('{:.20f}'.format(avg_add_time)))
    print("Memory usage in bytes :" + str(
        memory_usage.get_obj_size(counting_bloom_filter) +
        counting_bloom_filter.get_bitarray_size()))

    # ---- Scalable Bloom filter ----
    scalable_bloom_filter = ScalableBloomFilter(
        fp_prob=fp_rate, growth=ScalableBloomFilter.SMALL_GROWTH)

    start_time = time.time()
    for i in range(0, input_size):
        scalable_bloom_filter.add(str(i))

    end_time = time.time()
    avg_add_time = (end_time - start_time) / input_size

    start_time = time.time()
    fp_count = 0
    for i in range(input_size, input_size * 2):
        if str(i) in scalable_bloom_filter:
            fp_count += 1

    end_time = time.time()
    avg_lookup_time = (end_time - start_time) / input_size
    print()
    print("For scalable filter :\nFalse positive count:" + str(fp_count) +
          "  in " + str(input_size) + " try. " + str((fp_count / input_size)) +
          " rate of false positive")
    print("Avg lookup time :" + str('{:.20f}'.format(avg_lookup_time)) +
          "  Avg add time:" + str('{:.20f}'.format(avg_add_time)))

    print("Memory usage in bytes :" + str(
        memory_usage.get_obj_size(scalable_bloom_filter) +
        scalable_bloom_filter.get_bitarray_size()))

    # ---- Counting scalable Bloom filter ----
    c_scalable_bloom_filter = ScalableBloomFilter(
        fp_prob=fp_rate,
        growth=ScalableBloomFilter.SMALL_GROWTH,
        countable=True,
        count_size=count_size)

    start_time = time.time()
    for i in range(0, input_size):
        c_scalable_bloom_filter.add(str(i))
    end_time = time.time()
    avg_add_time = (end_time - start_time) / input_size

    start_time = time.time()
    fp_count = 0
    for i in range(input_size, input_size * 2):
        if str(i) in c_scalable_bloom_filter:
            fp_count += 1

    end_time = time.time()
    avg_lookup_time = (end_time - start_time) / input_size
    print()
    print("For counting scalable filter :\nFalse positive count:" +
          str(fp_count) + "  in " + str(input_size) + " try. " +
          str((fp_count / input_size)) + " rate of false positive")
    print("Avg lookup time :" + str('{:.20f}'.format(avg_lookup_time)) +
          "  Avg add time:" + str('{:.20f}'.format(avg_add_time)))

    print("Memory usage in bytes :" + str(
        memory_usage.get_obj_size(c_scalable_bloom_filter) +
        c_scalable_bloom_filter.get_bitarray_size()))

    # Counter statistics: average and maximum value of the non-zero
    # counters.
    # NOTE(review): the outer loop walks c_scalable_bloom_filter's
    # sub-filters but every read goes to counting_bloom_filter — this
    # looks like a copy-paste slip; confirm which filter these stats are
    # meant to describe.
    size_sum = 0
    filled_bit_count = 0
    max_count = 0
    for a in c_scalable_bloom_filter.bloom_filters:
        for i in range(0, len(a)):
            if counting_bloom_filter.get_bit_value(i) > 0:
                size_sum += counting_bloom_filter.get_bit_value(i)
                filled_bit_count += 1
                if max_count < counting_bloom_filter.get_bit_value(i):
                    max_count = counting_bloom_filter.get_bit_value(i)

    avg_size = size_sum / filled_bit_count
    print("For counting filter -------- avg count:" + str(avg_size))
    print("For counting filter-------- max count:" + str(max_count))

    # ---- Baseline: plain dict ----
    hasmap = {}
    start_time = time.time()
    for i in range(0, input_size):
        hasmap[str(i)] = i
    end_time = time.time()
    avg_add_time = (end_time - start_time) / input_size

    start_time = time.time()
    for i in range(input_size, input_size * 2):
        if str(i) in hasmap:
            pass
    end_time = time.time()
    avg_lookup_time = (end_time - start_time) / input_size
    print()
    print("For Hashmap ")
    print("Avg lookup time :" + str('{:.20f}'.format(avg_lookup_time)) +
          "  Avg add time:" + str('{:.20f}'.format(avg_add_time)))
    print("Memory usage in bytes :" + str(memory_usage.get_obj_size(hasmap)))

    # ---- Baseline: plain list (O(n) membership) ----
    py_list = []
    start_time = time.time()
    for i in range(0, input_size):
        py_list.append(str(i))
    end_time = time.time()
    avg_add_time = (end_time - start_time) / input_size

    start_time = time.time()
    for i in range(input_size, input_size * 2):
        if str(i) in py_list:
            pass
    end_time = time.time()
    avg_lookup_time = (end_time - start_time) / input_size
    print()
    print("For List ")
    print("Avg lookup time :" + str('{:.20f}'.format(avg_lookup_time)) +
          "  Avg add time:" + str('{:.20f}'.format(avg_add_time)))
    print("Memory usage in bytes :" + str(memory_usage.get_obj_size(py_list)))

    # Debug output: size of the sub-filter container vs the sum of its
    # members' sizes.
    temp = 0
    for i in c_scalable_bloom_filter.bloom_filters:
        temp += memory_usage.get_obj_size(i)
    print(
        "aaa" +
        str(memory_usage.get_obj_size(c_scalable_bloom_filter.bloom_filters)))
    print("xxx" + str(temp))
Exemplo n.º 9
0
 def __init__(self, filterSize, hashCount, clickedUsers):
     """Initialise the balanced collector: remember how many users
     clicked, start with no rows and no non-clicked users, and
     pre-register every clicked user in a Bloom filter of the given
     size and hash count."""
     self.clickedCounter = len(clickedUsers)
     self.noClickedCounter = 0
     self.allData = []
     # Probabilistic membership set of every admitted user id.
     self.collectedDataUsersFilter = BloomFilter(filterSize, hashCount)
     self.__addUsers(clickedUsers)
import socket
import random
import os
from sample_data import USERS
from server_config import NODES
from pickle_hash import serialize_GET, serialize_PUT, serialize_DELETE
from node_ring import NodeRing
from lru_cache import Lru_Node, Lru_Cache
from bloomFilter import BloomFilter

# Maximum UDP datagram size read per recvfrom call.
BUFFER_SIZE = 1024
# Hash codes of requests issued so far (populated elsewhere in this script).
hash_codes = set()
has_cache = False
# Placeholder cache created with argument 0 — presumably its capacity;
# NOTE(review): confirm against Lru_Cache and whoever re-initialises it.
lru_cache_obj = Lru_Cache(0)
lru_cache_initialized = False
# Shared Bloom filter; arguments mirror BloomFilter(capacity, fp_rate)
# usage elsewhere — NOTE(review): capacity 10 looks very small, confirm.
bf = BloomFilter(10, 0.05)


class UDPClient():
    def __init__(self, host, port):
        self.host = host
        self.port = int(port)

    def send(self, request):
        print('Connecting to server at {}:{}:{}'.format(
            self.host, self.port, os.getpid()))
        try:
            s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
            s.sendto(request, (self.host, self.port))
            response, ip = s.recvfrom(BUFFER_SIZE)
            return response