예제 #1
0
 def get_url(self):
     """Pop one pending page URL from ``self.url_file``.

     Retries up to 3 times, sleeping 1 second between attempts, when the
     container is empty; after that raises ``TypeError`` (original error
     type/message preserved for existing callers).

     :return: the popped URL string.
     :raises TypeError: when no URL appears within the retry budget.
     """
     retry_counter = 0
     while True:
         try:
             # url_file is dict-like; popitem() raises KeyError once it
             # is empty — that is the only retryable condition here, so
             # catch KeyError instead of the original blanket Exception.
             page_url = self.url_file.popitem()[0]
             logging.info(page_url)
             return page_url
         except KeyError:
             if retry_counter >= 3:
                 raise TypeError("页面无数据")
             time.sleep(1)
             retry_counter += 1
예제 #2
0
 def monitor(self):
     """Periodically write crawl statistics to the log (never returns).

     Every ``self.sleep_time`` seconds, pulls a fresh snapshot from the
     crawler's monitor model and logs the success/failure/scheduler/
     duplicate/filter counters plus the filtered URLs.
     """
     while True:
         # Number of urls still waiting to be crawled is part of the snapshot.
         time.sleep(self.sleep_time)
         snapshot = self.crawler.monitorModel.statistical()
         summary = (
             "statistical: success[{0}] failure [{1}] schedular[{2}] duplicate [{3}] filter [ {4} ]"
             .format(snapshot.success_count, snapshot.fail_count,
                     snapshot.crawler_count, snapshot.duplicate_count,
                     snapshot.filter_count))
         logging.info(summary)
         filtered = "\n".join(snapshot.filter_urls)
         logging.info("filter url {0}".format(filtered))
예제 #3
0
    def process(self, page):
        """Collect image URLs from *page* and store them under the "img" field.

        Site-chrome images (paths containing ``/skin/``) are skipped;
        relative URLs (leading ``/``) are resolved against
        ``page.base_url`` before being collected.
        """
        res_html = self.handler(page)

        img_urls = res_html.xpath("//img/@src")

        imgs = []
        for img_url in img_urls:
            img_url = str(img_url)
            # Skip theme/skin assets.
            if "/skin/" in img_url:
                continue
            if img_url.startswith("/"):
                # NOTE(review): assumes base_url ends with "/" — confirm.
                img_url = page.base_url + img_url[1:]
            # BUGFIX: the append was previously nested under the
            # startswith("/") branch, silently dropping every absolute
            # (non-relative) image URL.
            imgs.append(img_url)
        page.put_fields("img", imgs)
        logging.info(imgs)
예제 #4
0
 def get_url(self):
     """Pop one pending URL from ``self.urls`` under the scheduler lock.

     Waits up to 3 extra seconds (1-second steps) for a URL to appear.
     Matches the original contract: any failure — including the
     "no data" timeout — is logged and ``None`` is returned instead of
     propagating.

     :return: a URL string, or None when nothing could be obtained.
     """
     retry_counter = 0
     try:
         # The context manager guarantees the lock is released exactly
         # once; the original acquired inside try and released in
         # finally, which would release an unheld lock if acquire raised.
         with self.scheduler_lock:
             while True:
                 if self.urls:
                     page_url = self.urls.pop()
                     logging.info(page_url)
                     return page_url
                 if retry_counter >= 3:
                     raise TypeError("页面无数据")
                 time.sleep(1)
                 retry_counter += 1
     except Exception as ex:
         logging.error(ex)
     return None
예제 #5
0
    def storage(self, json_data):
        """Append *json_data* as one JSON line to the current output file.

        Rotates to a fresh timestamp-named ``.json`` file once
        ``self.file_counter`` records have been written to the current
        one.  Rotation and writing both happen under
        ``self.storage_lock`` so concurrent writers never interleave
        lines or race the rotation (and never produce an empty file).
        Failures are best-effort: logged with the offending payload,
        never propagated.
        """
        if not json_data:
            return

        # "with" releases the lock exactly once; the original acquired
        # inside try and released in finally, which would release an
        # unheld lock if acquire itself raised.
        with self.storage_lock:
            try:
                if self.current_counter >= self.file_counter:
                    self.json_file.close()
                    new_name = str(int(time.time() * 1000)) + ".json"
                    self.json_file = open(os.path.join(self.json_path, new_name), "w")
                    self.current_counter = 0

                article = json.dumps(json_data, ensure_ascii=False)
                logging.info(article)
                self.json_file.write(article + "\n")
                self.current_counter += 1
            except Exception:
                # Keep the crawler alive but record what failed to store.
                traceback.print_exc()
                logging.error(json_data)
예제 #6
0
    def storage(self, field_dict):
        """Download every URL-valued field and save it under ``self.out_path``.

        Each value (scalar or list) in *field_dict* is treated as a media
        URL; the file name is the last path component of the URL.
        Failures are logged and skipped so one broken link does not abort
        the batch.
        """
        if not field_dict:
            return

        for field_name, field_values in field_dict.items():
            # Normalise a scalar value to a one-element list.
            if not isinstance(field_values, list):
                field_values = [field_values]
            for field_value in field_values:
                try:
                    logging.info(field_value)
                    # Fetch the media bytes.
                    content = urllib2.urlopen(field_value).read()
                    # Target file name = last path component of the URL.
                    target = os.path.join(self.out_path,
                                          os.path.split(field_value)[1])
                    with open(target, "wb") as media_file:
                        media_file.write(content)
                except Exception as ex:
                    logging.error(ex)
예제 #7
0
파일: Install.py 프로젝트: mumupy/pcrawler
def sitecustomize():
    """Install a ``sitecustomize.py`` that forces the utf8 default encoding.

    Scans ``sys.path`` for site-packages directories; for the first one
    that does not already contain a ``sitecustomize.py``, writes a file
    that calls ``reload(sys)`` / ``sys.setdefaultencoding('utf8')`` and
    stops.  Directories that already have the file are logged and
    skipped.  Python 2 only: the generated code does not exist on
    Python 3.
    """
    generated_lines = [
        "#!/usr/bin/env python\n",
        "# -*- coding: utf-8 -*-\n",
        "# @Time    : 2018/8/9 23:55\n",
        "# @Author  : ganliang\n",
        "# @File    : sitecustomize.py\n",
        "# @Desc    : 解决乱码问题\n",
        "import sys\n",
        "reload(sys)\n",
        "sys.setdefaultencoding('utf8')\n",
    ]
    for sys_path in sys.path:
        if not sys_path.endswith("site-packages"):
            continue
        logging.info(sys_path)
        sitecustomize_path = os.path.join(sys_path, "sitecustomize.py")
        if os.path.exists(sitecustomize_path):
            logging.info("sitecustomize.py exists")
            continue
        # "with" guarantees the handle is closed even if a write fails
        # (the original leaked the open handle on error).
        with open(sitecustomize_path, "w") as sitecustomize_file:
            sitecustomize_file.writelines(generated_lines)
        logging.info("write {0} sitecustomize.py success".format(
            sitecustomize_path))
        break
예제 #8
0
 def test_get_file_properties(self):
     """Smoke test: get_file_properties on an empty payload must not raise."""
     empty_payload = {}
     fields = self.avroStorage.get_file_properties(empty_payload)
     logging.info(fields)
예제 #9
0
 def __exit__(self, exc_type, exc_val, exc_tb):
     """Context-manager exit hook: close the CSV writer, then log.

     Returns None (falsy), so any in-flight exception propagates to the
     caller instead of being suppressed.
     """
     self.csv_writer.close()
     logging.info("__exit__")
예제 #10
0
 def storage(self, field_dict):
     """Log the harvested fields; dict payloads are serialised to JSON first.

     Empty/None payloads are ignored.  Non-dict payloads are logged
     verbatim.
     """
     if not field_dict:
         return
     if isinstance(field_dict, dict):
         payload = json.dumps(field_dict, ensure_ascii=False)
     else:
         payload = field_dict
     logging.info(payload)
예제 #11
0
def process(url):
    """Scrape a single vulnerability page and log the extracted fields.

    Fields pulled out: title, affected product, CVE/vendor/advisory
    links, publish date, update date, two description paragraphs and the
    vendor-advisory block.  Dates and descriptions are sliced out of the
    serialised HTML by string position, so the magic offsets below are
    coupled to the exact markup of the source site.
    """
    simpleDownloader = SimpleDownloader()
    content = simpleDownloader.download(url)
    res_html = etree.HTML(content)

    # All data lives inside the first <div class='vulbar'>; pages
    # without one are silently skipped.
    vulbar_content_elements = res_html.xpath("//div[@class='vulbar']")
    if not vulbar_content_elements:
        return
    vulbar_content_element = vulbar_content_elements[0]
    title = getElement(vulbar_content_element,
                       "//div[@align='center']/b/text()")
    logging.info("title:" + title)

    refect_product = getElement(vulbar_content_element, "//blockquote/text()")
    logging.info("refect_product:" + refect_product)

    # Assumes the div carries exactly three direct <a> children in this
    # order — TODO confirm; extra/missing links raise ValueError here.
    cve_link, vender_link, advisory_link = vulbar_content_element.xpath(
        "a/@href")
    cve_id = vulbar_content_element.xpath("a/text()")[0]
    logging.info("cve_link:" + cve_link)
    logging.info("cve_id:" + cve_id)
    logging.info("vender_link:" + vender_link)
    logging.info("advisory_link:" + advisory_link)

    # From here on the element is re-serialised and parsed positionally:
    # locate a label, then slice the text between the next ">" and "<".
    main_content = etree.tostring(vulbar_content_element,
                                  encoding="utf-8").decode("utf-8")
    main_content = str(main_content)

    # "发布日期" = publish-date label.
    pubdate_index = main_content.find("发布日期")
    pubdate_before_index = main_content.find(">", pubdate_index) + 1
    pubdate_after_index = main_content.find("<", pubdate_before_index)
    pubdate = main_content[pubdate_before_index:pubdate_after_index]
    logging.info("pubdate:" + pubdate)

    # "更新日期" = update-date label.
    update_index = main_content.find("更新日期")
    update_before_index = main_content.find(">", update_index) + 1
    update_after_index = main_content.find("<", update_before_index)
    update_date = main_content[update_before_index:update_after_index]
    logging.info("update_date:" + update_date)

    # "描述:" = description label.  The +12 offset presumably skips the
    # "<br/>" tag plus trailing markup up to the text — TODO confirm
    # against the live page; a layout change silently shifts this slice.
    desc_index = main_content.find("描述:")
    desc_before_index = main_content.find("<br/>", desc_index) + 12
    desc_after_index = main_content.find("<br/>&#13;", desc_before_index)
    desc = main_content[desc_before_index:desc_after_index]
    logging.info("desc:" + desc)

    # Second description paragraph; the +23 offset is likewise
    # markup-dependent — TODO confirm.
    desc_before_index = main_content.find("<br/>&#13;", desc_after_index) + 23
    desc_after_index = main_content.find("<br/>", desc_before_index)
    desc2 = main_content[desc_before_index:desc_after_index]
    logging.info("desc:" + desc2)

    # "厂商补丁:" = vendor-patch label.  rfind() grabs the LAST
    # occurrence; the +6 offset skips markup after the label — TODO
    # confirm.
    advisory_begin_index = main_content.rfind("厂商补丁:<br/>") + len(
        "厂商补丁:<br/>") + 6
    advisory_end_index = main_content.rfind("&#13;")
    advisory = main_content[advisory_begin_index:advisory_end_index]
    logging.info("advisory:" + advisory)