def from_crawler(cls, crawler):
    """Scrapy factory hook: build an instance and wire spider lifecycle signals.

    Connects spider_opened/spider_closed so the browser driver can be
    created and torn down with the spider.
    """
    PrintFormatUtil.print_line("重新定义crawler-spider")
    instance = cls()
    # Bind the open/close listeners on the crawler's signal bus.
    crawler.signals.connect(instance.spider_opened, signal=signals.spider_opened)
    crawler.signals.connect(instance.spider_closed, signal=signals.spider_closed)
    return instance
def process_response(self, request, response, spider):
    """Downloader-middleware hook: log, then pass the response through.

    Must return a Response or Request object, or raise IgnoreRequest;
    this implementation always returns the response unchanged.
    """
    PrintFormatUtil.print_line("spider {} : 开始处理 Response".format(spider.name))
    return response
def start_requests(self):
    """Yield one screenshot-taking SeleniumRequest per configured service page."""
    for name, url in self.link_list.items():
        PrintFormatUtil.print_line("检查{}的页面, url {}".format(
            name, url))
        yield SeleniumRequest(
            url=url,
            callback=self.parse,
            screen_shot=True,
            wait_time=20,
            r_dict={'title': name},
        )
def start_requests(self):
    """Yield one screenshot-taking SeleniumRequest per product in self.link_list."""
    for product in self.link_list:
        PrintFormatUtil.print_line("检查({}){}的页面, url {}".format(
            product.get_id(), product.get_name(), product.get_url()))
        # Carried through request.meta so parse() can name the output folder.
        meta_args = {'title': product.get_name(), 'id': product.get_id()}
        yield SeleniumRequest(url=product.get_url(),
                              callback=self.parse,
                              screen_shot=True,
                              wait_time=20,
                              r_dict=meta_args)
def spider_opened(self, spider):
    """Create the browser driver matching the spider's crawl_type.

    'selenium'  -> Chrome webdriver configured from CONST options.
    'puppeeter' -> pyppeteer browser launched synchronously, after raising
                   the noisy pyppeteer/websockets loggers to WARNING.
    """
    PrintFormatUtil.print_line("spider {} : 开始处理".format(spider.name))
    PrintFormatUtil.print_line("spider {} , 运行模式 {}".format(spider.name, spider.crawl_type.value))
    if spider.crawl_type.value == 'selenium':
        chrome_options = Options()
        # Plain loop: the original list-comprehension was used only for
        # its side effects, which is an anti-pattern.
        for option in CONST.CHROME_DRIVER_OPTIONS:
            chrome_options.add_argument(option)
        self.driver = webdriver.Chrome(chrome_options=chrome_options,
                                       executable_path=CONST.CHROME_DRIVER_BIN_PATH)
    if spider.crawl_type.value == 'puppeeter':
        # Silence chatty debug output; the original configured the
        # 'pyppeteer' logger twice — once is enough.
        logging.getLogger('pyppeteer').setLevel(logging.WARNING)
        logging.getLogger('websockets.protocol').setLevel(logging.WARNING)
        # BUG FIX: pyppeteer's launch() expects the lowercase 'headless'
        # key; the capitalized 'Headless' key was silently ignored.
        self.driver = sync(launch({'headless': True,
                                   'args': ['--no-sandbox', '--disable-gpu'],
                                   'dumpio': True}))
def output_diff(self):
    """Write a highlighted diff image when the two screenshots differ.

    Changed pixels are copied from image B onto the diff output through a
    binary mask; a size-mismatch ValueError from paste() is reported
    (best-effort) instead of propagating.
    """
    try:
        delta = ImageChops.difference(self.i_a, self.i_b)
        # No bounding box means the images are pixel-identical.
        if not delta.getbbox():
            return
        PrintFormatUtil.print_line("存在差异, 生成差异图片 {}".format(
            self.i_diff))
        # LUT mapping every non-zero grey level to 255: any change at all
        # becomes fully opaque in the paste mask.
        lut = [0] + [255] * 255
        mask = delta.convert('L').point(lut)
        highlighted = mask.convert('RGB')
        highlighted.paste(self.i_b, mask=mask)
        highlighted.save(self.i_diff)
    except ValueError as e:
        text = (
            "表示图片大小和box对应的宽度不一致,参考API说明:Pastes another image into this image."
            "The box argument is either a 2-tuple giving the upper left corner, a 4-tuple defining the left, upper, "
            "right, and lower pixel coordinate, or None (same as (0, 0)). If a 4-tuple is given, the size of the pasted "
            "image must match the size of the region.使用2纬的box避免上述问题")
        PrintFormatUtil.print_line("【{0}】{1}".format(e, text))
def parse(self, response):
    """Walk the Aliyun product JSON and yield a ProductItem per unseen sub-service."""
    data = json.loads(response.body.decode('utf-8'))
    assert 'product' in data and len(
        data['product']) > 0, "URL{} 数据不符合要求 ".format(response.url)
    for category_1, payload in data['product'].items():
        # Only top-level categories that carry a 'name' list are of interest.
        if 'name' not in payload:
            continue
        PrintFormatUtil.print_line("处理一级类目{}".format(category_1))
        for service in payload['name']:
            item = ProductItem()
            item['category1'] = service['category1']
            item['category2'] = service.get('category2', '')
            item['title'] = service['title']
            item['description'] = service['description']
            item['link'] = service['textLink']
            # Skip services already known from a previous crawl.
            key = "%s:%s" % (service['category1'], service['title'])
            if key not in self.product_list:
                PrintFormatUtil.print_line("获取子服务{}".format(item))
                yield item
def spider_closed(self, spider):
    """Shutdown the driver when spider is closed."""
    PrintFormatUtil.print_line("spider {} : 结束处理".format(spider.name))
    mode = spider.crawl_type.value
    if mode == 'selenium' and self.driver is not None:
        PrintFormatUtil.print_line("spider {} : selenium driver 销毁".format(spider.name))
        self.driver.close()
        self.driver.quit()
    if mode == 'puppeeter' and self.driver is not None:
        PrintFormatUtil.print_line("spider {} : puppeeter driver 销毁".format(spider.name))
        # pyppeteer's close() is a coroutine; run it to completion.
        sync(self.driver.close())
def process_item(self, item, spider):
    """Persist product items for the Aliyun spider and count successful inserts."""
    PrintFormatUtil.print_line(self.__module__)
    if spider.name == CONST.ALI_YUN_PRODUCT_SPIDER_NAME:
        saved = spider_db.save_product_item_to_db(item)
        # save_product_item_to_db() signals success with a return of 1.
        if saved == 1:
            PrintFormatUtil.print_success("spider %s success ..." % str(item).strip())
            self.counter[spider.name].add()
        else:
            PrintFormatUtil.print_error("spider %s fail ..." % str(item).strip())
    return item
def parse(self, response):
    """Screenshot a service page, shrink it, and diff it against the last capture.

    Workflow: write the full-size screenshot, save a half-size copy, then read
    the per-service 'latest' marker file (format: "<filename> <md5>"). If the
    previous image's md5 still matches, compare old vs new with both a
    PIL-based and an OpenCV-based SSIM and emit diff images when similarity
    is below 1. Finally rewrite 'latest' to point at this run's small image.
    """
    service_pic_path = os.path.join(CONST.PIC_PATH, response.meta['r_dict']['title'])
    os.makedirs(service_pic_path, exist_ok=True)
    current_time = str(int(time.time()))
    service_pic_name = os.path.join(service_pic_path, current_time + ".png")
    service_pic_small_name = os.path.join(service_pic_path, current_time + "_s.png")
    service_pic_diff_name = os.path.join(service_pic_path, current_time + "_diff.png")
    service_pic_oc_diff_name = os.path.join(service_pic_path, current_time + "_oc_diff.png")
    PrintFormatUtil.print_line("pic save path {}".format(service_pic_name))
    with open(service_pic_name, 'wb') as image_file:
        image_file.write(response.meta['screen_shot'])
    # Halve both dimensions to cut storage and speed up the SSIM comparison.
    image = Image.open(service_pic_name)
    w, h = image.size
    PrintFormatUtil.print_line("原有图片大小 width {} height {}".format(w, h))
    # BUG FIX: Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the
    # same filter (ANTIALIAS had been an alias for it for years).
    d_img = image.resize((int(w / 2), int(h / 2)), Image.LANCZOS)
    w, h = d_img.size
    PrintFormatUtil.print_line("处理后的图片大小 width {} height {}".format(w, h))
    d_img.save(service_pic_small_name, quality=95)
    del response, image  # release the large objects before the compare phase
    # Read the 'latest' marker left by the previous run, if any.
    latest_path = os.path.join(service_pic_path, 'latest')
    if os.path.exists(latest_path) and os.path.isfile(latest_path):
        with open(latest_path, 'r') as f:
            old_file_info = f.read().split(" ")
        # Robustness: a malformed marker used to raise IndexError; now the
        # comparison is skipped and the marker is simply regenerated below.
        if len(old_file_info) >= 2:
            old_file_info_name = old_file_info[0]
            old_file_info_md5 = old_file_info[1]
            old_service_pic_name = os.path.join(service_pic_path, old_file_info_name)
            PrintFormatUtil.print_line(
                "old pic path {}".format(old_service_pic_name))
            if old_file_info_md5 == FileUtil.get_md5(old_service_pic_name):
                PrintFormatUtil.print_line("比对图片 {} | {}".format(
                    service_pic_small_name, old_file_info_name))
                # Compare with both PIL- and OpenCV-based SSIM implementations.
                iss = ImageSSIM(service_pic_small_name, old_service_pic_name,
                                service_pic_diff_name)
                o_iss = OpenCVSSIM(service_pic_small_name, old_service_pic_name,
                                   service_pic_oc_diff_name)
                pil_s_code = iss.compare_images()
                oc_s_code = o_iss.compare_images()
                PrintFormatUtil.print_line("PIL库两者的相似度: {}".format(pil_s_code))
                PrintFormatUtil.print_line(
                    "OPEN_CV库两者的相似度: {}".format(oc_s_code))
                # Threshold is tunable in (0, 1]; 1 means "identical only".
                if pil_s_code < 1 and oc_s_code < 1:
                    iss.output_diff()
                    o_iss.output_diff()
            else:
                PrintFormatUtil.print_line(
                    "old pic md5 error. new {} old {}".format(
                        FileUtil.get_md5(old_service_pic_name),
                        old_file_info_md5))
    # Rewrite 'latest' so the next run compares against this capture.
    with open(latest_path, "w") as file:
        file.write(
            os.path.basename(service_pic_small_name) + " " +
            FileUtil.get_md5(service_pic_small_name))
def close_spider(self, spider):
    """Report the total number of inserted records for this spider at shutdown."""
    PrintFormatUtil.print_line(self.__class__.__name__)
    inserted = self.counter[spider.name].total()
    PrintFormatUtil.print_line("total %s record inserted" % inserted)
    PrintFormatUtil.print_title(" spider %s finished " % spider.name)
def open_spider(self, spider):
    """Register a fresh per-spider counter when the spider starts."""
    PrintFormatUtil.print_title(" spider %s started " % spider.name)
    PrintFormatUtil.print_line(self.__class__.__name__)
    counter = SpiderCounter()
    counter.create_counter()
    self.counter[spider.name] = counter
def process_request(self, request, spider):
    """Downloader-middleware hook: render the request in a real browser.

    For 'selenium' + SeleniumRequest: navigates Chrome to the URL, copies
    cookies, optionally waits and screenshots, then returns an HtmlResponse
    built from the rendered page source (bypassing Scrapy's downloader).
    For 'puppeeter' + PuppeeterRequest: same idea via a pyppeteer page with
    a mobile Safari user agent. Any other request returns None so normal
    downloading continues.
    """
    PrintFormatUtil.print_line("spider {} : 开始处理 Request".format(spider.name))
    if spider.crawl_type.value == 'selenium' and isinstance(request, SeleniumRequest):
        self.driver.set_window_size(800, 600)
        self.driver.get(request.url)
        # Copy the request's cookies into the browser session.
        # NOTE(review): cookies are added AFTER driver.get(), so they only
        # affect subsequent navigations, not this page load — confirm intended.
        for cookie_name, cookie_value in request.cookies.items():
            self.driver.add_cookie(
                {
                    'name': cookie_name,
                    'value': cookie_value
                }
            )
        # Optional explicit wait condition supplied on the request.
        if request.wait_until:
            WebDriverWait(self.driver, request.wait_time).until(request.wait_until)
        if request.screen_shot:
            # Get the actual page dimensions using javascript so the
            # screenshot covers the full scrollable page, not the viewport.
            width = self.driver.execute_script(
                "return Math.max(document.body.scrollWidth, document.body.offsetWidth, "
                "document.documentElement.clientWidth, document.documentElement.scrollWidth, "
                "document.documentElement.offsetWidth);")
            height = self.driver.execute_script(
                "return Math.max(document.body.scrollHeight, document.body.offsetHeight, "
                "document.documentElement.clientHeight, document.documentElement.scrollHeight, "
                "document.documentElement.offsetHeight);")
            # Resize the window to the full page before capturing.
            PrintFormatUtil.print_line("reset size {}:{}".format(width, height))
            self.driver.set_window_size(width, height)
            time.sleep(1)  # give the page a moment to reflow after resize
            request.meta['screen_shot'] = self.driver.get_screenshot_as_png()
        # Optional user-supplied JS to run after capture.
        if request.script:
            self.driver.execute_script(request.script)
        request.meta['r_dict'] = request.r_dict
        body = str.encode(self.driver.page_source)
        # Expose the driver via the "meta" attribute for downstream use.
        request.meta.update({'driver': self.driver})
        return HtmlResponse(
            self.driver.current_url,
            body=body,
            encoding='utf-8',
            request=request
        )
    if spider.crawl_type.value == 'puppeeter' and isinstance(request, PuppeeterRequest):
        page = sync(self.driver.newPage())
        sync(page.setJavaScriptEnabled(enabled=True))
        # Mobile Safari UA — presumably to receive the mobile page layout.
        sync(page.setUserAgent(
            'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'))
        sync(page.goto(request.url))
        request.meta['r_dict'] = request.r_dict
        body = str.encode(sync(page.content()))
        # Expose the driver and page via the "meta" attribute.
        request.meta.update({'driver': self.driver})
        request.meta.update({'page': page})
        return HtmlResponse(
            request.url,
            body=body,
            encoding='utf-8',
            request=request
        )
    # Called for each request that goes through the downloader middleware.
    # Must either:
    # - return None: continue processing this request
    # - or return a Response object
    # - or return a Request object
    # - or raise IgnoreRequest: process_exception() methods of
    #   installed downloader middleware will be called
    return None
def start_requests(self):
    """Kick off the crawl with a single request for the Aliyun product list."""
    PrintFormatUtil.print_line("阿里云已有的服务一共{}个".format(
        len(self.product_list)))
    yield Request(CONST.ALI_YUN_PRODUCT_URL,
                  callback=self.parse,
                  headers=CONST.ALI_YUN_HEADER)