def get_url(self):
    retry_counter = 0
    while True:
        try:
            page_url = self.url_file.popitem()[0]
            logging.info(page_url)
            return page_url
        except Exception as ex:
            # The url store may be temporarily empty while other workers are
            # still producing, so retry a few times before giving up.
            if retry_counter < 3:
                time.sleep(1)
                retry_counter += 1
            else:
                raise TypeError("no page data available")
def monitor(self):
    while True:
        # Periodically report the number of urls waiting to be crawled
        # together with the other crawl counters.
        time.sleep(self.sleep_time)
        monitorModel = self.crawler.monitorModel.statistical()
        logging.info(
            "statistical: success[{0}] failure[{1}] scheduler[{2}] duplicate[{3}] filter[{4}]"
            .format(monitorModel.success_count, monitorModel.fail_count,
                    monitorModel.crawler_count, monitorModel.duplicate_count,
                    monitorModel.filter_count))
        logging.info("filter url {0}".format("\n".join(monitorModel.filter_urls)))
def process(self, page):
    res_html = self.handler(page)
    img_urls = res_html.xpath("//img/@src")
    imgs = []
    for img_url in img_urls:
        # Skip theme/skin images; they are not page content.
        if str(img_url).find("/skin/") > -1:
            continue
        # Turn root-relative paths into absolute urls.
        if str(img_url).startswith("/"):
            img_url = page.base_url + img_url[1:]
        imgs.append(img_url)
    page.put_fields("img", imgs)
    logging.info(imgs)
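# Hedged sketch (standalone, not part of the processor class above): the loop
# above assumes page.base_url ends with "/" when it prepends it to a
# root-relative src. The standard-library urljoin performs the same
# normalization without that assumption; the base url and src values below
# are made up for illustration.
try:
    from urlparse import urljoin          # Python 2
except ImportError:
    from urllib.parse import urljoin      # Python 3

if __name__ == "__main__":
    base_url = "http://example.com/"       # hypothetical page base url
    for src in ["/skin/logo.png", "/upload/a.jpg", "http://cdn.example.com/b.png"]:
        if "/skin/" in src:                # same skin filter as process()
            continue
        print(urljoin(base_url, src))      # absolute url in both cases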
def get_url(self):
    retry_counter = 0
    try:
        self.scheduler_lock.acquire()
        while True:
            if len(self.urls) == 0:
                # Wait briefly for new urls before declaring the queue empty.
                if retry_counter < 3:
                    time.sleep(1)
                    retry_counter += 1
                else:
                    raise TypeError("no page data available")
            else:
                page_url = self.urls.pop()
                logging.info(page_url)
                return page_url
    except Exception as ex:
        logging.error(ex)
    finally:
        self.scheduler_lock.release()
    return None
def storage(self, json_data):
    if not json_data:
        return
    # Check whether the current file is full inside the lock, so that
    # concurrent writers do not produce empty files.
    try:
        self.storage_lock.acquire()
        if self.current_counter >= self.file_counter:
            self.json_file.close()
            self.json_file = open(
                os.path.join(self.json_path,
                             str(int(time.time() * 1000)) + ".json"), "w")
            self.current_counter = 0
        article = json.dumps(json_data, ensure_ascii=False)
        logging.info(article)
        self.json_file.write(article + "\n")
        self.current_counter += 1
    except Exception as ex:
        traceback.print_exc()
        logging.error(json_data)
    finally:
        self.storage_lock.release()
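# Hedged sketch (standalone, not the storage class above): the same rotation
# idea reduced to a plain function - close the current file once it holds
# `file_counter` records and start a new one named by millisecond timestamp.
# The output directory and record limit below are made up for illustration.
import json
import os
import time


def rotate_if_full(json_file, json_path, current_counter, file_counter):
    # Returns an open file handle and the (possibly reset) record counter.
    if json_file is None or current_counter >= file_counter:
        if json_file is not None:
            json_file.close()
        json_file = open(
            os.path.join(json_path, str(int(time.time() * 1000)) + ".json"), "w")
        current_counter = 0
    return json_file, current_counter


if __name__ == "__main__":
    out_dir = "/tmp/json_out"              # hypothetical output directory
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    handle, counter = None, 0
    handle, counter = rotate_if_full(handle, out_dir, counter, file_counter=1000)
    handle.write(json.dumps({"title": "example"}, ensure_ascii=False) + "\n")
    counter += 1
    handle.close()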
def storage(self, field_dict):
    if not field_dict:
        return
    for field_name in field_dict:
        field_values = field_dict.get(field_name)
        if not isinstance(field_values, list):
            field_values = [field_values]
        for field_value in field_values:
            # Use the last segment of the url path as the local file name.
            try:
                logging.info(field_value)
                # Download the resource.
                content = urllib2.urlopen(field_value).read()
                # Write the downloaded bytes to a file.
                with open(
                        os.path.join(self.out_path,
                                     os.path.split(field_value)[1]),
                        "wb") as media_file:
                    media_file.write(content)
            except Exception as ex:
                logging.error(ex)
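# Hedged sketch (standalone helper, not part of the storage class above): the
# same download-and-save step, written so it runs on Python 2 (urllib2) or
# Python 3 (urllib.request). The url and directory in the usage note are
# made-up examples.
import os

try:
    from urllib2 import urlopen           # Python 2
except ImportError:
    from urllib.request import urlopen    # Python 3


def save_media(field_value, out_path):
    # File name is taken from the last path segment of the url,
    # as the storage() method above does with os.path.split.
    content = urlopen(field_value).read()
    target = os.path.join(out_path, os.path.split(field_value)[1])
    with open(target, "wb") as media_file:
        media_file.write(content)
    return target

# Usage (hypothetical url and directory):
#   save_media("http://example.com/images/logo.png", "/tmp")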
def sitecustomize():
    for sys_path in sys.path:
        if sys_path.endswith("site-packages"):
            logging.info(sys_path)
            sitecustomize_path = os.path.join(sys_path, "sitecustomize.py")
            if not os.path.exists(sitecustomize_path):
                sitecustomize_file = open(sitecustomize_path, "w")
                sitecustomize_file.write("#!/usr/bin/env python\n")
                sitecustomize_file.write("# -*- coding: utf-8 -*-\n")
                sitecustomize_file.write("# @Time : 2018/8/9 23:55\n")
                sitecustomize_file.write("# @Author : ganliang\n")
                sitecustomize_file.write("# @File : sitecustomize.py\n")
                sitecustomize_file.write("# @Desc : fix garbled-text (encoding) issues\n")
                sitecustomize_file.write("import sys\n")
                sitecustomize_file.write("reload(sys)\n")
                sitecustomize_file.write("sys.setdefaultencoding('utf8')\n")
                sitecustomize_file.close()
                logging.info("write {0} sitecustomize.py success".format(
                    sitecustomize_path))
                break
            else:
                logging.info("sitecustomize.py exists")
def test_get_file_properties(self):
    json_data = {}
    fields = self.avroStorage.get_file_properties(json_data)
    logging.info(fields)
def __exit__(self, exc_type, exc_val, exc_tb):
    self.csv_writer.close()
    logging.info("__exit__")
def storage(self, field_dict):
    if field_dict:
        if isinstance(field_dict, dict):
            logging.info(json.dumps(field_dict, ensure_ascii=False))
        else:
            logging.info(field_dict)
def process(url):
    simpleDownloader = SimpleDownloader()
    content = simpleDownloader.download(url)
    res_html = etree.HTML(content)

    vulbar_content_elements = res_html.xpath("//div[@class='vulbar']")
    if not vulbar_content_elements:
        return
    vulbar_content_element = vulbar_content_elements[0]

    # Title of the vulnerability entry.
    title = getElement(vulbar_content_element, "//div[@align='center']/b/text()")
    logging.info("title:" + title)

    # Affected products.
    refect_product = getElement(vulbar_content_element, "//blockquote/text()")
    logging.info("refect_product:" + refect_product)

    # CVE link, vendor link and advisory link.
    cve_link, vender_link, advisory_link = vulbar_content_element.xpath("a/@href")
    cve_id = vulbar_content_element.xpath("a/text()")[0]
    logging.info("cve_link:" + cve_link)
    logging.info("cve_id:" + cve_id)
    logging.info("vender_link:" + vender_link)
    logging.info("advisory_link:" + advisory_link)

    # The remaining fields are sliced out of the serialized HTML by searching
    # for the Chinese label text; the fixed offsets skip over the page's
    # surrounding markup.
    main_content = etree.tostring(vulbar_content_element,
                                  encoding="utf-8").decode("utf-8")
    main_content = str(main_content)

    # Release date ("发布日期").
    pubdate_index = main_content.find("发布日期")
    pubdate_before_index = main_content.find(">", pubdate_index) + 1
    pubdate_after_index = main_content.find("<", pubdate_before_index)
    pubdate = main_content[pubdate_before_index:pubdate_after_index]
    logging.info("pubdate:" + pubdate)

    # Update date ("更新日期").
    update_index = main_content.find("更新日期")
    update_before_index = main_content.find(">", update_index) + 1
    update_after_index = main_content.find("<", update_before_index)
    update_date = main_content[update_before_index:update_after_index]
    logging.info("update_date:" + update_date)

    # Description ("描述"), in two consecutive paragraphs.
    desc_index = main_content.find("描述:")
    desc_before_index = main_content.find("<br/>", desc_index) + 12
    desc_after_index = main_content.find("<br/> ", desc_before_index)
    desc = main_content[desc_before_index:desc_after_index]
    logging.info("desc:" + desc)

    desc_before_index = main_content.find("<br/> ", desc_after_index) + 23
    desc_after_index = main_content.find("<br/>", desc_before_index)
    desc2 = main_content[desc_before_index:desc_after_index]
    logging.info("desc:" + desc2)

    # Vendor advisory ("厂商补丁").
    advisory_begin_index = main_content.rfind("厂商补丁:<br/>") + len(
        "厂商补丁:<br/>") + 6
    advisory_end_index = main_content.rfind(" ")
    advisory = main_content[advisory_begin_index:advisory_end_index]
    logging.info("advisory:" + advisory)
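# Hedged sketch (standalone, made-up fragment; assumes the file's utf-8
# coding declaration, as the Chinese literals above already do): the
# find()/slice pattern that process() uses for the date fields. The label
# sits inside a tag and the value follows as plain text, so the value lies
# between the ">" that closes the label's tag and the next "<".
fragment = u"<b>发布日期：</b>2018-08-09<br/>"

label_index = fragment.find(u"发布日期")
value_start = fragment.find(u">", label_index) + 1   # just past "</b>"
value_end = fragment.find(u"<", value_start)         # start of "<br/>"
print(fragment[value_start:value_end])               # -> 2018-08-09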