Пример #1
0
 def __init__(self, depth: int, max_url_nums: int, cookies: str, exclude_urls: List[str],
              domain_reg_list: List[str],
              path_dicts: List[str], header: dict = None):
     """Initialise crawl limits, filters and queue state.

     :param depth: maximum crawl depth; a falsy value falls back to 5
     :param max_url_nums: maximum number of collected URLs; a falsy value falls back to 5000
     :param cookies: raw Cookie header value, or a falsy value for none
     :param exclude_urls: URL patterns to skip; ``*`` is expanded to ``\\S*``
     :param domain_reg_list: domains used to build the same-domain regex
     :param path_dicts: stored as-is on the instance
     :param header: extra HTTP headers; copied so later changes do not leak to the caller
     """
     self.domain_reg = ''
     self.domain_reg_list = domain_reg_list
     self.complement = 0
     self.depth = depth if depth else 5
     self.max_url_nums = max_url_nums if max_url_nums else 5000
     self.cookie = cookies
     self.exclude_urls = [url.replace('*', '\\S*') for url in (exclude_urls or [])]
     self.url_dict = dict()
     # Size the filter from the resolved limit: the raw argument may be None or 0.
     self.url_cache = BloomFilter(element_num=self.max_url_nums * 5, error_rate=0.01)
     self.current_depth = 0
     self.current_crawl_queue = list()
     self.next_crawl_queue = list()
     self.max_queue_length = self.max_url_nums + 1000
     # Always a dict, so a Cookie entry can be added even when no header was given.
     self.header = dict(header) if header else {}
     self.path_dicts = path_dicts
     self.filter_exts = [
         'css', 'png', 'gif', 'jpg', 'jpeg', 'swf', 'tiff',
         'pdf', 'ico', 'flv', 'mp4', 'mp3', 'avi', 'mpg', 'gz',
         'mpeg', 'iso', 'dat', 'mov', 'rar', 'exe', 'zip', 'tar',
         'bin', 'bz2', 'xsl', 'doc', 'docx', 'ppt', 'pptx', 'xls',
         'xlsx', 'csv', 'map', "ttf", 'tif', 'woff', 'woff2',
         'cab', 'apk', 'bmp', 'svg', 'exif', 'xml', 'rss', 'webp'
     ]
     self.exclude_urls_reg_str = ''
Пример #2
0
def test_speed():
    """Time ``BloomFilter.add`` over random strings and print per-add and per-hash cost."""
    n = 10000
    p = 0.0001
    b = BloomFilter(n, p)
    print(b)

    # Build up to n distinct random 20-character strings (duplicates collapse in the set).
    string_size = 20
    strings = set()
    for _ in range(n):
        strings.add("".join(chr(random.randint(0, 255)) for _ in range(string_size)))

    starttime = time.time()
    for string in strings:
        b.add(string)
    total_time = float(time.time() - starttime)

    ns = float(len(strings))
    k = float(b.k)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Number of hash functions: %d" % b.k)
    print("Speed per hash: %f seconds" % (total_time / ns / k))
    print("Speed per add: %f seconds" % (total_time / ns))
Пример #3
0
 def __init__(self, cookie: str = None, headers: dict = None, max_num: int = 10000, domain_regs: list = None,
              depth: int = 5):
     """Store the crawl configuration and create the queue / de-duplication state.

     :param cookie: raw Cookie header value, or None
     :param headers: request headers; DEFAULT_HEADERS is used when falsy
     :param max_num: maximum number of URLs to collect
     :param domain_regs: domains used for the same-domain check
     :param depth: maximum crawl depth
     """
     queue_capacity = max_num * 2
     self.cookie = cookie
     self.headers = headers or DEFAULT_HEADERS
     # Two Manager-backed queues: one for the next round, one for the current round.
     self.waiting_queue = Manager().Queue(maxsize=queue_capacity)
     self.current_queue = Manager().Queue(maxsize=queue_capacity)
     self.max_url_num = max_num
     self.crawled_urls = BloomFilter(element_num=max_num * 5, error_rate=0.01)
     self.url_dict = Manager().dict()
     self.domain_reg_list = domain_regs
     self.depth = depth
     self.current_depth = 0
     # File extensions whose URLs are not worth crawling.
     self.filter_exts = (
         'css png gif jpg jpeg swf tiff '
         'pdf ico flv mp4 mp3 avi mpg gz '
         'mpeg iso dat mov rar exe zip tar '
         'bin bz2 xsl doc docx ppt pptx xls '
         'xlsx csv map ttf tif woff woff2 '
         'cab apk bmp svg exif xml rss webp js'
     ).split()
Пример #4
0
def test_error_rate():
    """Fill a BloomFilter with random strings and print the observed false-positive rate."""
    n = 10000
    p = 0.001
    b = BloomFilter(n, p)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Creating BloomFilter for %d elements and false positive probability = %f ..." % (n, p))
    print("Optimal values are m = %d, k = %d" % (b.m, b.k))
    elt = 'apple'

    print("Testing...")
    assert elt not in b

    print("After adding '%s'..." % elt)
    b.add(elt)

    print("Testing...")
    assert elt in b

    string_size = 20

    def _random_strings(count):
        """Return a set built from `count` random strings of `string_size` characters."""
        generated = set()
        for _ in range(count):
            generated.add("".join(chr(random.randint(0, 255)) for _ in range(string_size)))
        return generated

    # Strings that go into the filter, and a second batch used as probes.
    strings = _random_strings(n)
    other_strings = _random_strings(n)

    for s in strings:
        b.add(s)

    # Only probes that were never added can count as false positives.
    other_strings = other_strings - strings
    collisions = 0
    for s in other_strings:
        if s in b:
            collisions += 1

    print("False positive rate was %d / %d = %f" % (
        collisions, len(other_strings),
        float(collisions) / float(len(other_strings))))
Пример #5
0
 def setUp(self):
     """Create a fresh BloomFilter for every test."""
     first_arg, second_arg = 0.001, 10**3
     self.bf = BloomFilter(first_arg, second_arg)
Пример #6
0
class TestBloomFilter(unittest.TestCase):
    """Unit tests for BloomFilter construction, insertion, lookup and persistence."""

    # Path used by the persistence tests (kept exactly as the original literal).
    DUMP_PATH = r'test\bf.bf'

    @staticmethod
    def _sample_items():
        """Return a fresh list of heterogeneous values (a new class object each call)."""
        return [12, -12, 12.0, '12', [12], (12,), {12: ''}, b'12', type('12', (), {})]

    def setUp(self):
        self.bf = BloomFilter(0.001, 10**3)

    def test_init(self):
        self.assertIsInstance(self.bf.bit_array, bitarray.bitarray)
        self.assertGreater(self.bf.error_rate, 0)
        self.assertTrue(all(is_prime(i) for i in self.bf.seeds))
        self.assertEqual(self.bf.bit_array.count(), 0)

    def test_add(self):
        # First insertion of each value must return a falsy result...
        for item in self._sample_items():
            self.assertFalse(self.bf.add(item))
        # ...and inserting an equivalent value again must return a truthy one.
        for item in self._sample_items():
            self.assertTrue(self.bf.add(item))

    def test_exists(self):
        self.bf.add(12)
        self.assertIn(12, self.bf)
        self.assertNotIn('12', self.bf)

    def test_tofile(self):
        self.assertFalse(self.bf.tofile(self.DUMP_PATH))

    def test_fromfile(self):
        # unittest runs test methods in alphabetical order, so this test executes
        # before test_tofile; write the dump here instead of relying on a leftover file.
        self.bf.tofile(self.DUMP_PATH)
        self.assertIsInstance(self.bf.fromfile(self.DUMP_PATH), BloomFilter)
Пример #7
0
class Crawler(object):
    def __init__(self, cookie: str = None, headers: dict = None, max_num: int = 10000, domain_regs: list = None,
                 depth: int = 5):
        """Store the crawl configuration and create the queue / de-duplication state.

        :param cookie: raw Cookie header value, or None
        :param headers: request headers; DEFAULT_HEADERS is used when falsy
        :param max_num: maximum number of URLs to collect
        :param domain_regs: domains used for the same-domain check
        :param depth: maximum crawl depth
        """
        queue_capacity = max_num * 2
        self.cookie = cookie
        self.headers = headers or DEFAULT_HEADERS
        # Two Manager-backed queues: one for the next round, one for the current round.
        self.waiting_queue = Manager().Queue(maxsize=queue_capacity)
        self.current_queue = Manager().Queue(maxsize=queue_capacity)
        self.max_url_num = max_num
        self.crawled_urls = BloomFilter(element_num=max_num * 5, error_rate=0.01)
        self.url_dict = Manager().dict()
        self.domain_reg_list = domain_regs
        self.depth = depth
        self.current_depth = 0
        # File extensions whose URLs are not worth crawling.
        self.filter_exts = (
            'css png gif jpg jpeg swf tiff '
            'pdf ico flv mp4 mp3 avi mpg gz '
            'mpeg iso dat mov rar exe zip tar '
            'bin bz2 xsl doc docx ppt pptx xls '
            'xlsx csv map ttf tif woff woff2 '
            'cab apk bmp svg exif xml rss webp js'
        ).split()

    def run(self, urls):
        self.consist_headers()
        # 默认只爬取当前根域下的url
        self.domain_reg_list = self.parse_domain(urls) if not self.domain_reg_list else self.domain_reg_list
        self._init_reg()
        for url in urls:
            self.call_crawl_handler(url)
        print('all task done')
        print(self.url_dict)

    def call_crawl_handler(self, url):
        if 'http' not in url:
            init_url = 'http://' + url
        else:
            init_url = url
        self.current_queue.put_nowait(init_url)
        # 初始化url为了避免重复爬取,在初始化时就放入布隆过滤器
        init_url_without_protocol = url.split('//')[-1]
        self.crawled_urls.add(init_url_without_protocol)
        while self.current_depth < self.depth:
            if len(self.url_dict) >= self.max_url_num:
                print('达到预设爬去上限, 爬虫结束')
                break
            print('now crawl depth is :{}'.format(self.current_depth))
            tmp_results = []
            # 利用进程池去完成爬虫
            pool = Pool(os.cpu_count() * 2)
            while not self.current_queue.empty():
                print('当前在待爬队列中还有:{}个url'.format(self.current_queue.qsize()))
                url = self.current_queue.get_nowait()
                if not url.endswith('js'):
                    result = pool.apply_async(func=self.crawl_handler, args=(url,))
                    tmp_results.append(result)
                    # self.crawl_handler(url)
            pool.close()
            pool.join()
            tmp_reqs = []
            for result in tmp_results:
                for r in result.get():
                    tmp_reqs.append(r)
            self._handle_url(tmp_reqs)
            self.current_queue = self.waiting_queue
            self.waiting_queue = Manager().Queue(maxsize=self.max_url_num * 2)
            self.current_depth += 1
            print('depth:{} crawled done'.format(self.current_depth))

    def consist_headers(self):
        if self.cookie:
            self.headers['Cookie'] = self.cookie

    @staticmethod
    def parse_domain(domain_list):
        """
        需要将输入的url或者域名解析成域名, 用于后续同域判断等操作
        :param domain_list:
        :return:
        """

        def _split_url_protocol_and_path(domain):
            # 去掉协议
            if '://' in domain:
                domain = domain.split('://')[1]
            # 截取路径
            if '.com.cn' in domain:
                return domain.split('.com.cn')[0] + '.com.cn'
            if '.com' in domain:
                return domain.split('.com')[0] + '.com'
            if '.io' in domain:
                return domain.split('.xyz')[0] + '.io'
            # 针对ip:port形式的url,截取/即可
            return domain.split('/')[0]

        return [_split_url_protocol_and_path(domain) for domain in domain_list]

    @staticmethod
    def _parse_post_data(post_data) -> str:
        """
        解析动态请求获取里面的data成一个字符串
        :param post_data:
        :return:
        """
        if not post_data:
            return ''
        if not isinstance(post_data, dict):
            if '=' in post_data:
                param_dict = {}
                if '&' in post_data:
                    params_couples = post_data.split('&')
                    for param in params_couples:
                        if '=' not in param:
                            continue
                        k, v = param.split('=')
                        param_dict[k] = v
                else:
                    k, v = post_data.split('=')
                    param_dict[k] = v
                post_data = param_dict
            else:
                post_data = json.loads(post_data)
        post_data_list = [k for k, _ in post_data.items()]
        post_data_list.sort()
        return ''.join([param + '&' for param in post_data_list])[:-1]

    @staticmethod
    def parse_static_url(url):
        """
        把解析到的静态url, 重新组合成一个字典
        {
            'url': 'xxxxxx',
            'originUrl': 'xxxxxx/a=aa',
            'method': 'GET',
            'queryString': 'a=aa'
        }
        :param url:
        :return:
        """
        try:
            req = dict()
            req['method'] = 'GET'
            req['originUrl'] = url
            if '?' not in url:
                req['url'] = url
                return req
            url_consist = url.split('?')
            req['url'] = url_consist[0]
            params = url_consist[1]
            if '&' not in params:
                params_consist = params.split('=')
                req['queryString'] = params_consist[0] if params_consist[0] else ''
                return req
            multi_params = params.split('&')
            params_list = list(map(lambda y: y.split('=')[0], filter(lambda x: '=' in x, multi_params)))
            # 按首字母把参数排序
            params_list.sort()
            req['queryString'] = ''.join([key + '=&' for key in params_list])[:-2]
            return req
        except Exception:
            msg = traceback.format_exc()
            print(msg)
            return None

    def _init_reg(self):
        """
        根据解析出来的域名拼接一个正则, 用于同域校验
        :return:
        """
        domain_reg = ['^']
        domain_reg.extend(['(http|https):\/\/' + domain.replace('.', '\.') + '.*|' for domain in self.domain_reg_list])
        # domain_reg.extend(map(lambda x: '(http|https):\/\/' + x.replace('.', '\.') + '.*|', self.domain_reg_list))
        tmp_domain_reg = ''.join(domain_reg)
        self.domain_reg = tmp_domain_reg[:-1] + '$'

    def filter_ext(self, url):
        """
        过滤掉特殊后缀的url, 如一些静态资源等等
        如果存在url的后缀是需要排除的,则排除
        :param url:
        :return:
        """
        try:
            f = url.split('/')[-1].strip()
            if '.' in f:
                ext = f.split('.')[-1].strip().lower()
                if ext and ext in self.filter_exts:
                    return True
                else:
                    return False
            return False
        except Exception:
            msg = traceback.format_exc()
            print(msg)
            return False

    def filter_url_by_domain(self, url):
        """
        检验当前的url是否满足条件
        是  的url以及不在[不需要]的url集合里,返回True.不满足要求,返回false
        :param url:
        :return:
        """
        # 校验域名
        if not re.match(self.domain_reg, url, flags=0):
            return False
        # TODO: 后续补齐这部分功能
        # if len(self.exclude_urls) == 0:
        #    return True
        # 校验exclude_urls
        # if re.match(self.exclude_urls_reg_str, url, flags=0):
        #    return False
        return True

    def static_crawler(self, page, results, url) -> List["ElementHandle"]:
        """
        主要用于页面中静态url的解析, 目前涵盖了a标签的href属性和src属性
        """
        links = page.query_selector_all("//a")
        tmp_link = []
        for link in links:
            href = link.get_property("href").json_value()
            src = link.get_property("src").json_value()
            if not href or href == url:
                continue
            if not self.filter_ext(url=href) and self.filter_url_by_domain(url=href):
                req = self.parse_static_url(href)
                if req:
                    print('href:{}'.format(req))
                    results.append(req)

            if not src or src == url:
                continue
            if not self.filter_ext(url=src) and self.filter_url_by_domain(url=src):
                req = self.parse_static_url(src)
                if req:
                    print('src:{}'.format(req))
                    results.append(req)

            # 这里主要是用于有些a标签里的写法是<javascript>标签,用于执行某些js操作
            if 'javascript' in href or 'javascript' in src:
                tmp_link.append(link)
        return tmp_link

    def _check_crawled_url(self, url) -> bool:
        """
        检查是否已爬取,不存在,则返回True
        :param url:
        :return:
        """
        if url in self.crawled_urls:
            return False
        return True

    def _check_url_is_exist_by_md5(self, url_dict):
        """
        利用MD5去检查url是否重复
        :param url_dict:
        :return:
        """
        try:
            exist_md5 = list(url_dict.keys())[0]
            if exist_md5 in self.crawled_urls:
                return False
            return True
        except Exception:
            msg = traceback.format_exc()
            print(msg)
            return True

    @staticmethod
    def calculate_md5(url_har):
        """
        计算md5来去重
        :param url_har:
        :return:
        """
        url = url_har['url']
        # 有些post请求后缀会加timestamp时间戳来防重放
        tmp_list = url.split('//')[-1].split('?')
        url_without_protocol = tmp_list[0] if len(tmp_list) > 1 else url.split('//')[-1]
        method = url_har['method']
        query_string = ''
        post_data = ''
        if 'queryString' in url_har:
            query_string = url_har['queryString']
        if 'postData' in url_har:
            post_data = url_har['postData']
        tmp_str = url_without_protocol + '&' + method + '&' + query_string + post_data
        return hashlib.md5(tmp_str.encode('utf-8')).hexdigest()

    def _handle_url(self, req_list):
        """
        处理爬到的url, 看看是不是需要过滤或者是不是已经爬取过了
        :param req_list:
        :return:
        """
        if not req_list:
            return
        insert_req_list = list()
        for req in req_list:
            url = req['originUrl']
            if url.endswith('/'):
                url = url[:-1]
            url_without_protocol = url.split('//')[-1]
            '''
            解析完成后,返回的结构体包括:url,queryString(if exist),method
            需要对url做判断:
            1、是否存在于最后的url集合里
            2、是否已爬过
            3、url的后缀是否在需要过滤的集合里(最先判断,如果需要过滤则直接忽略)
            '''
            md5 = self.calculate_md5(req)
            tmp_dict = {
                md5: req
            }
            if self._check_url_is_exist_by_md5(tmp_dict):
                if len(self.url_dict.keys()) < self.max_url_num:
                    self.url_dict[md5] = req
                    # TODO:后面可以定制化插入taskId
                    insert_req_list.append({'taskId': 'test12', 'urlDict': json.dumps(req)})
                # 如果没有爬过,则放入下一轮要爬取的队列里
                if self._check_crawled_url(url_without_protocol) and not self.waiting_queue.full():
                    self.waiting_queue.put_nowait(req['originUrl'])
                    self.crawled_urls.add(url_without_protocol)

    def crawl_handler(self, url) -> list:
        """Load `url` in a headless browser and collect the requests it triggers.

        Every routed request that is in scope is turned into a request record; anchors
        found on the loaded page are added via `static_crawler`.

        :param url: page to open
        :return: list of request-record dicts
        """
        result = []

        def intercept(route: Route, request: Request):
            # Block front-end redirects inside sub-frames by forcing a 204.
            # TODO: improve the hook logic for front-end redirects.
            # NOTE(review): confirm against the Playwright docs that assigning to
            # response().status has an effect from inside a route handler.
            if request.is_navigation_request() and request.frame.parent_frame:
                request.response().status = 204
                route.continue_()
                return
            # Try to intercept back-end redirects.
            if request.redirected_to:
                if request.post_data_json:
                    request.response().status = 200
                    self.waiting_queue.put_nowait(request.redirected_to.url)
                route.continue_()
                return
            # Resource types that are never recorded.
            if request.resource_type in ['image', 'media', 'eventsource', 'websocket']:
                route.abort()
                return
            url_origin = request.url
            if url_origin and not self.filter_ext(url=url_origin) and self.filter_url_by_domain(url=url_origin):
                method = request.method
                http_har = None
                if method == 'POST' or method == 'PUT':
                    headers = request.headers
                    post_data_origin = request.post_data_json
                    http_har = {
                        'originPostData': post_data_origin,
                        'postData': self._parse_post_data(post_data_origin),
                        'contentType': headers['content-type'] if 'content-type' in headers else '',
                        'url': url_origin,
                        'originUrl': url_origin,
                        'method': method,
                    }
                elif method == 'GET':
                    http_har = self.parse_static_url(url_origin)
                # Other methods, and URLs that failed to parse, produce no record;
                # appending {} or None would break the consumers of this list.
                if http_har:
                    result.append(http_har)
            route.continue_()

        with sync_playwright() as p:
            browser = p.webkit.launch(headless=True, chromium_sandbox=True, )
            page = browser.new_page()
            page.set_default_navigation_timeout(30000)
            page.set_extra_http_headers(self.headers)
            page.route('**/*', intercept)
            page.goto(url)
            page.wait_for_load_state(state='networkidle', timeout=30000)

            tmp_links = self.static_crawler(page, result, url)
            page.evaluate(FORM_FILL_UPLOAD_JS)
            for link in tmp_links:
                try:
                    link.click()
                except Exception:
                    # One unclickable anchor must not discard what was already collected.
                    print(traceback.format_exc())
            page.close()
            browser.close()
        return result
Пример #8
0
class Crawler(object):
    """Breadth-first crawler that loads pages in a headless browser, one greenlet per URL.

    Collected requests are stored in `url_dict`, keyed by an MD5 fingerprint of
    URL, method and parameter names.
    """

    def __init__(self, depth: int, max_url_nums: int, cookies: str, exclude_urls: List[str],
                 domain_reg_list: List[str],
                 path_dicts: List[str], header: dict = None):
        """Initialise crawl limits, filters and queue state.

        :param depth: maximum crawl depth; a falsy value falls back to 5
        :param max_url_nums: maximum number of collected URLs; a falsy value falls back to 5000
        :param cookies: raw Cookie header value, or a falsy value for none
        :param exclude_urls: URL patterns to skip; ``*`` is expanded to ``\\S*``
        :param domain_reg_list: domains for the same-domain check; derived from the seeds when empty
        :param path_dicts: stored as-is on the instance
        :param header: extra HTTP headers; copied
        """
        self.domain_reg = ''
        self.domain_reg_list = domain_reg_list
        self.complement = 0
        self.depth = depth if depth else 5
        self.max_url_nums = max_url_nums if max_url_nums else 5000
        self.cookie = cookies
        self.exclude_urls = [url.replace('*', '\\S*') for url in (exclude_urls or [])]
        self.url_dict = dict()
        # Size the filter from the resolved limit: the raw argument may be None or 0.
        self.url_cache = BloomFilter(element_num=self.max_url_nums * 5, error_rate=0.01)
        self.current_depth = 0
        self.current_crawl_queue = list()
        self.next_crawl_queue = list()
        self.max_queue_length = self.max_url_nums + 1000
        # Always a dict so a Cookie entry can be merged in later.
        self.header = dict(header) if header else {}
        self.path_dicts = path_dicts
        self.filter_exts = [
            'css', 'png', 'gif', 'jpg', 'jpeg', 'swf', 'tiff',
            'pdf', 'ico', 'flv', 'mp4', 'mp3', 'avi', 'mpg', 'gz',
            'mpeg', 'iso', 'dat', 'mov', 'rar', 'exe', 'zip', 'tar',
            'bin', 'bz2', 'xsl', 'doc', 'docx', 'ppt', 'pptx', 'xls',
            'xlsx', 'csv', 'map', "ttf", 'tif', 'woff', 'woff2',
            'cab', 'apk', 'bmp', 'svg', 'exif', 'xml', 'rss', 'webp'
        ]
        self.exclude_urls_reg_str = ''

    @staticmethod
    def parse_domain(domain_list):
        """Reduce each input URL or domain to its domain part.

        :param domain_list: iterable of URLs or bare domains
        :return: list with one domain string per input
        """
        # Order matters: '.com.cn' must be tried before its prefix '.com'.
        known_suffixes = ('.com.cn', '.com', '.xyz')

        def _split_url_protocol_and_path(domain):
            # Drop the scheme.
            if '://' in domain:
                domain = domain.split('://')[1]
            # Cut everything after the first known suffix.
            for suffix in known_suffixes:
                if suffix in domain:
                    return domain.split(suffix)[0] + suffix
            # For ip:port style inputs, cutting at the first '/' is enough.
            return domain.split('/')[0]

        return [_split_url_protocol_and_path(domain) for domain in domain_list]

    def _init_reg(self):
        """Build the same-domain regex from `self.domain_reg_list` into `self.domain_reg`."""
        # Raw strings throughout: '\/' and '\.' are invalid escapes in ordinary literals.
        if '*.  .com.cn' in self.domain_reg_list or '*.  .com' in self.domain_reg_list:
            self.domain_reg = '^' + r'.*\.  \.com.*$'
            return
        scheme = r'(http|https):\/\/'
        alternatives = [scheme + domain.replace('.', r'\.') + '.*' for domain in self.domain_reg_list]
        if alternatives:
            self.domain_reg = '^' + '|'.join(alternatives) + '$'
        else:
            # No domains configured: with re.match this pattern only matches an empty string.
            self.domain_reg = '$'

    def run(self, domain_list):
        """Crawl every seed in `domain_list`, then print the collected URL dictionary.

        :param domain_list: list of seed URLs or bare domains
        :raises Exception: if `domain_list` is not a list
        """
        if not isinstance(domain_list, list):
            raise Exception('domains must be list')
        if not self.domain_reg_list:
            self.domain_reg_list = self.parse_domain(domain_list)
        self._init_reg()
        self._consist_exclude_urls_regex()
        # Seeds are crawled one after another: crawl_url keeps its depth counter and
        # queues on the instance, so running several seeds at once would mix them up.
        # (The old gevent.spawn(self.crawl_url(domain)) called crawl_url right here
        # as well and only spawned its None result.)
        for domain in domain_list:
            self.crawl_url(domain)
        print('all tasks done')
        print(self.url_dict)

    def crawl_url(self, domain):
        """Crawl breadth-first from one seed until the depth or URL limit is reached.

        :param domain: seed URL; ``http://`` is prepended when it has no http(s) scheme
        """
        print('enter crawler:{}'.format(domain))
        # A substring test would treat "example.com/http-api" as already having a scheme.
        if domain.startswith(('http://', 'https://')):
            init_url = domain
        else:
            init_url = 'http://' + domain

        # Every seed starts with fresh queues and its own depth budget.
        self.current_crawl_queue = [init_url]
        self.next_crawl_queue = []
        self.current_depth = 0
        while self.current_depth < self.depth:
            if len(self.url_dict) >= self.max_url_nums:
                break
            print('now depth is:{}'.format(self.current_depth))
            tasks = [gevent.spawn(self._crawler_handler, url) for url in self.current_crawl_queue if
                     not url.endswith('.js')]
            gevent.joinall(tasks)
            # Promote the next round and start a new, empty one; reusing the same list
            # would re-crawl every earlier URL at each depth.
            self.current_crawl_queue = self.next_crawl_queue
            self.next_crawl_queue = []
            self.current_depth += 1

    def filter_url_by_domain(self, url):
        """Check that `url` is in scope.

        :param url: the URL to check
        :return: True when the URL matches the same-domain regex and no exclude pattern
        """
        if not re.match(self.domain_reg, url, flags=0):
            return False
        # An empty exclude regex would match every URL, so treat it as "nothing excluded".
        if not self.exclude_urls or not self.exclude_urls_reg_str:
            return True
        return not re.match(self.exclude_urls_reg_str, url, flags=0)

    def filter_ext(self, url):
        """Tell whether `url` ends in an extension listed in `self.filter_exts`.

        :param url: the URL to inspect
        :return: True when the URL should be filtered out; False otherwise, including
                 for URLs without an extension and on error
        """
        try:
            last_segment = url.split('/')[-1].strip()
            if '.' not in last_segment:
                return False
            suffix = last_segment.split('.')[-1].strip().lower()
            return bool(suffix) and suffix in self.filter_exts
        except Exception:
            print(traceback.format_exc())
            return False

    def _check_crawled_url(self, url):
        """Return True when `url` has not been recorded in `self.url_cache` yet."""
        return url not in self.url_cache

    def _check_url_is_exist_by_md5(self, url_dict):
        """Tell whether a request fingerprint is new.

        :param url_dict: single-entry mapping ``{md5: request_record}``
        :return: False when that MD5 is already a key of `self.url_dict`, True otherwise
                 (also True if the lookup fails)
        """
        try:
            return next(iter(url_dict)) not in self.url_dict
        except Exception:
            print(traceback.format_exc())
            return True

    def _handle_url(self, req_list):
        """Record newly discovered requests and queue their URLs for the next round.

        :param req_list: request records; entries that are empty or lack
                         'url' / 'originUrl' / 'method' are ignored
        """
        if not req_list:
            return
        required_keys = ('url', 'originUrl', 'method')
        for req in req_list:
            if not req or any(key not in req for key in required_keys):
                continue
            url = req['originUrl']
            if url.endswith('/'):
                url = url[:-1]
            url_without_protocol = url.split('//')[-1]
            md5 = self.calculate_md5(req)
            if not self._check_url_is_exist_by_md5({md5: req}):
                continue
            if len(self.url_dict) < self.max_url_nums:
                self.url_dict[md5] = req
            # Not crawled yet and the queue has room: schedule it for the next round.
            if (self._check_crawled_url(url_without_protocol)
                    and len(self.next_crawl_queue) < self.max_queue_length):
                self.next_crawl_queue.append(req['originUrl'])
                self.url_cache.add(url_without_protocol)

    @staticmethod
    def calculate_md5(url_har):
        """Compute the de-duplication fingerprint of a request record.

        :param url_har: request record with at least 'url' and 'method'
        :return: hexadecimal MD5 digest
        """
        # The raw query is dropped because some requests append a timestamp to it
        # as replay protection.
        url = url_har['url']
        tmp_list = url.split('//')[-1].split("?")
        url_without_protocol = tmp_list[0]
        method = url_har['method']
        query_string = url_har['queryString'] if 'queryString' in url_har else ''
        post_data = url_har['postData'] if 'postData' in url_har else ''
        tmp_str = url_without_protocol + '&' + method + '&' + query_string + post_data
        return hashlib.md5(tmp_str.encode('utf-8')).hexdigest()

    @staticmethod
    def parse_static_url(url):
        """Turn a static URL into a request record.

        The record always has 'method' ('GET'), 'originUrl' and 'url'; when the URL
        has a query part it also has 'queryString', built from the parameter names only.

        :param url: the URL to describe
        :return: the record dict, or None if the URL could not be processed
        """
        try:
            req = {'method': 'GET', 'originUrl': url}
            if '?' not in url:
                req['url'] = url
                return req
            url_consist = url.split('?')
            req['url'] = url_consist[0]
            params = url_consist[1]
            if '&' not in params:
                req['queryString'] = params.split('=')[0]
                return req
            # Parameter names, sorted alphabetically; pairs without '=' are dropped.
            names = sorted(pair.split('=')[0] for pair in params.split('&') if '=' in pair)
            req['queryString'] = '=&'.join(names)
            return req
        except Exception:
            print(traceback.format_exc())
            return None

    @staticmethod
    def _parse_post_data(post_data) -> str:
        """Reduce a request body to its sorted parameter names joined by '&'.

        :param post_data: a dict, a form-encoded string (``a=1&b=2``) or a JSON string
        :return: e.g. ``'a&b'``; ``''`` for empty input or a body that is not a key/value mapping
        :raises ValueError: if a string without '=' is not valid JSON
        """
        if not post_data:
            return ''
        if isinstance(post_data, str):
            if '=' in post_data:
                # maxsplit=1 keeps values that themselves contain '='.
                pairs = (pair.split('=', 1) for pair in post_data.split('&') if '=' in pair)
                post_data = {key: value for key, value in pairs}
            else:
                post_data = json.loads(post_data)
        if not isinstance(post_data, dict):
            return ''
        return '&'.join(sorted(post_data))

    def _consist_exclude_urls_regex(self):
        """Join the exclude patterns into one alternation regex."""
        self.exclude_urls_reg_str = '|'.join(self.exclude_urls) if self.exclude_urls else ''
        # The original format string had no placeholder, so the regex was never shown.
        print('exclude url reg is:{}'.format(self.exclude_urls_reg_str))

    def static_crawler(self, page, results, url) -> List["ElementHandle"]:
        """Collect static URLs from the page's anchors.

        :param page: object offering ``query_selector_all`` (a Playwright page in this file)
        :param results: list that receives the request records (mutated in place)
        :param url: URL of the page itself; links equal to it are ignored
        :return: anchors whose href/src is a ``javascript:`` URL, for the caller to click
        """
        clickable_links = []
        for link in page.query_selector_all("//a"):
            href = link.get_property("href").json_value()
            src = link.get_property("src").json_value()
            for label, candidate in (('href', href), ('src', src)):
                if not candidate or candidate == url:
                    continue
                # filter_ext() is True for URLs to drop, same polarity as in the
                # request interceptor.
                if self.filter_ext(url=candidate) or not self.filter_url_by_domain(url=candidate):
                    continue
                req = self.parse_static_url(candidate)
                if req:
                    print('{}:{}'.format(label, req))
                    results.append(req)
            # This check must not sit behind the href/src guards: a missing src
            # would otherwise skip it for every ordinary anchor.
            if any(isinstance(value, str) and value.strip().lower().startswith('javascript:')
                   for value in (href, src)):
                clickable_links.append(link)
        return clickable_links

    def _crawler_handler(self, url):
        """Load `url` in a headless browser, collect its requests and feed them to `_handle_url`.

        :param url: page to open
        """
        print('start crawling url:{}'.format(url))
        results = []

        def log_and_continue_request(route: Route, request: Request):
            # Resource types that are never recorded.
            if request.resource_type in ['image', 'media', 'eventsource', 'websocket']:
                route.abort()
                return
            url_origin = request.url
            headers = request.headers
            method = request.method
            post_data_json = request.post_data_json
            print(url_origin, headers, method, post_data_json)
            if not url_origin or not self.filter_url_by_domain(url_origin) or self.filter_ext(url_origin):
                route.continue_()
                return
            http_har = None
            if method == 'POST' or method == 'PUT':
                http_har = {
                    'originPostData': post_data_json,
                    'postData': self._parse_post_data(post_data_json),
                    'contentType': headers['content-type'] if 'content-type' in headers else '',
                    'url': url_origin,
                    'originUrl': url_origin,
                    'method': method,
                }
            elif method == 'GET':
                http_har = self.parse_static_url(url_origin)
            # Other methods, and URLs that failed to parse, produce no record.
            if http_har:
                results.append(http_har)
            route.continue_()

        with sync_playwright() as p:
            browser = p.webkit.launch(headless=True, chromium_sandbox=True, )
            page = browser.new_page()
            page.set_default_navigation_timeout(30000)
            # Per-call copy so the cookie is merged without mutating shared state.
            request_headers = dict(self.header or {})
            if self.cookie:
                request_headers['Cookie'] = self.cookie
            page.set_extra_http_headers(headers=request_headers)
            page.route('**/*', log_and_continue_request)
            page.goto(url)
            page.wait_for_load_state(state="networkidle", timeout=30000)

            tmp_link = self.static_crawler(page, results, url)
            for link in tmp_link:
                try:
                    link.click()
                    self.static_crawler(page, results, url)
                except Exception:
                    # One unclickable anchor must not discard what was already collected.
                    print(traceback.format_exc())

            browser.close()

        self._handle_url(results)