Пример #1
0
 def get_format_html(cls, html, final_url):
     """Return *html* normalized against *final_url*.

     The protocol and domain are extracted from final_url and used to
     rewrite protocol-relative and domain-relative links in the page.
     """
     protocol = UrlUtil.get_protocol(final_url)
     domain = UrlUtil.get_domain(final_url)
     normalized = HtmlUtil.parse_protocol(html, protocol)
     return HtmlUtil.parse_domain(normalized, protocol, domain)
Пример #2
0
    def download(self, main_item, after_scroll_time=1):
        """Download one web page with the Selenium driver.

        Navigates to ``main_item.request_url``, scrolls to the bottom of
        the page to trigger lazily loaded content, then fills main_item
        with the final URL, page source, load time, fetch timestamp and
        the sender/server IP addresses.

        :param main_item: an ``items.MainItem`` describing the request.
        :param after_scroll_time: seconds to wait after scrolling so the
            page can finish loading.
        :return: the filled MainItem, or None when main_item is not a
            MainItem instance.
        """
        if not isinstance(main_item, MainItem):
            logging.error("Received param must items.MainItem, but get " +
                          str(type(main_item)))
            return None
        start_time = time.time()
        try:
            self.driver.get(main_item.request_url)  # request the page
            # TODO: store screenshot: screenshot_base64 = self.driver.get_screenshot_as_base64()
        except TimeoutException as e:
            # Page load timed out: stop loading and continue with what we have.
            logging.info("Get url:" + main_item.request_url + ", msg: " +
                         e.msg)
            self.driver.execute_script("window.stop()")
        except WebDriverException as e:
            # Driver is in a bad state; recreate a fresh headless Firefox.
            logging.error("When download page, error class: %s, message: %s." %
                          (e.__class__, e.msg))
            options = Options()
            options.add_argument('-headless')
            self.driver = Firefox(firefox_options=options)
            logging.info("Webdriver reinit")
        finally:
            # load_time is always bound here, even on the error paths above.
            load_time = time.time() - start_time
            logging.info("Get url:" + main_item.request_url + " spend " +
                         str(load_time) + "s.")
        server_ip = socket.gethostbyname(
            UrlUtil.get_domain(main_item.request_url))
        js_scroll = """
                    function go_down() {
                        var h = document.documentElement.scrollHeight || document.body.scrollHeight;
                        window.scroll(h, h);
                    }
                    go_down()
                """  # JS that scrolls to the bottom of the page
        try:
            self.driver.execute_script(js_scroll)  # perform the scroll
            time.sleep(after_scroll_time)  # wait for the page to load after scrolling
        except WebDriverException as e:
            logging.error(
                "When scroll page, error class: %s, error message: %s" %
                (e.__class__, e.msg))
        current_url = None
        page_source = None
        try:
            current_url = self.driver.current_url
            page_source = self.driver.page_source
        except UnexpectedAlertPresentException as e:
            # An alert dialog blocks reading the page; accept it and retry.
            logging.info("点击弹出框")
            try:
                self.driver.switch_to.alert.accept()
            except NoAlertPresentException:
                # Raised when the alert has already been dismissed; the
                # accept() above may consume it first. Safe to ignore.
                pass
            try:
                current_url = self.driver.current_url
                page_source = self.driver.page_source
            except WebDriverException as e:
                logging.error(e.msg)
        except WebDriverException as e:
            logging.error(e.msg)
        finally:
            # Substitute placeholder text so downstream storage never
            # receives None for these fields.
            if not current_url:
                current_url = "Something error occurred, please check the error log."
            if not page_source:
                page_source = "Something error occurred, please check the error log."

        # Fill in the gathered information.
        main_item.final_url = current_url
        # download_item.screen_shot = screenshot_base64
        main_item.load_time = load_time
        main_item.html = page_source
        time_array = time.localtime(int(time.time()))
        main_item.get_time = time.strftime("%Y-%m-%d %H:%M:%S", time_array)
        # Sender IP is read from the host config file on every call.
        with open("/etc/internet-snapshot.conf", "r") as f:
            ext_conf = json.load(f)
            main_item.send_ip = ext_conf["ip"]
        main_item.server_ip = server_ip

        return main_item
Пример #3
0
    def parse(self):
        """Parse the downloaded page and persist the results.

        Stores the snapshot record and raw HTML, extracts links from the
        normalized page, pushes new inner links into the redis crawl
        queue, and records suspicious external domains for manual review.
        Commits and closes the DB connection as a side effect.

        :return: None (also returns None early when downloader_item is
            not a MainItem).
        """
        if not isinstance(self.downloader_item, MainItem):
            logging.error("The param type is: " +
                          str(type(self.downloader_item)) +
                          ", but it should be MainItem.")
            return None
        html = self.downloader_item.html

        # Persist the snapshot record for downloader_item.
        with self.connection.cursor() as cursor:
            sql = 'INSERT INTO snapshot (request_url, final_url, load_time, refer, get_time,' \
                  ' task_id, send_ip, server_ip, deepth) VALUES (%s, %s, %s, %s, %s, %s, %s, ' \
                  '%s, %s);'
            result = cursor.execute(sql, self.downloader_item.save_tuple())
            if result != 1:
                # Fix: save_tuple() returns a tuple; the old str + tuple
                # concatenation raised TypeError whenever an insert failed.
                logging.error("snapshot插入记录%s失败!",
                              self.downloader_item.save_tuple())

        # Fetch the auto-increment id of the row just inserted.
        with self.connection.cursor() as cursor:
            sql = 'SELECT last_insert_id() as ss_id;'
            cursor.execute(sql)
            result = cursor.fetchone()
            ss_id = result["ss_id"]

        # Store the raw page content keyed by the snapshot id.
        ss_html = SsHtmlItem(ss_id=ss_id, html=html)
        with self.connection.cursor() as cursor:
            sql = 'INSERT INTO ss_html (ss_id, html) VALUES (%s, %s);'
            result = cursor.execute(sql, ss_html.save_tuple())
            if result != 1:
                logging.error("ss_html插入记录%s失败!", ss_html.save_tuple())

        # Normalize in-page links (resolve protocol- and domain-relative
        # URLs against the final URL of the download).
        final_protocol = UrlUtil.get_protocol(self.downloader_item.final_url)
        final_domain = UrlUtil.get_domain(self.downloader_item.final_url)
        format_html = HtmlUtil.parse_protocol(html, final_protocol)
        format_html = HtmlUtil.parse_domain(format_html, final_protocol,
                                            final_domain)

        tree = etree.HTML(format_html)
        hrefs = tree.xpath("//@href")  # every href attribute in the page
        iframes = tree.xpath("//iframe/@src")  # iframe source links
        jss = tree.xpath("//script/@src")  # external script links
        hrefs.extend(iframes)
        hrefs.extend(jss)
        hrefs = href_clean(hrefs) if hrefs else list()
        inner_chains = set()  # inner links, handed back to the engine to crawl
        unknown_domains = set()  # suspicious external top domains for review
        request_top_domain = UrlUtil.get_top_domain(
            self.downloader_item.request_url)
        for href in hrefs:
            this_top_domain = UrlUtil.get_top_domain(href)
            if request_top_domain == this_top_domain and UrlUtil.get_url_suffix(
                    href) != "js":
                inner_chains.add(href)
            elif this_top_domain not in self.safe_chains and not UrlUtil.is_gov_or_edu(
                    href):
                # Top domain is not whitelisted and is not a gov/edu site.
                unknown_domains.add(this_top_domain)

        # Push wrapped inner links into redis for the engine to iterate.
        logging.info("Length of inner_chains is " + str(len(inner_chains)))
        dup_set_name = "engine:dup_set:" + str(self.downloader_item.task_id)
        queue_name = "engine:queue:" + str(self.downloader_item.task_id)
        for inner_chain in inner_chains:
            if isinstance(self.redis_conn.ttl(dup_set_name), int):
                sadd_re = self.redis_conn.sadd(dup_set_name, inner_chain)
                if sadd_re == 1:  # 1 means the member is new: dedup for free
                    new_main_item = MainItem(
                        inner_chain,
                        refer=self.downloader_item.final_url,
                        task_id=self.downloader_item.task_id,
                        deepth=self.downloader_item.deepth + 1)
                    self.redis_conn.lpush(
                        queue_name,
                        json.dumps(new_main_item, default=main_item_to_json))

        # Load the malicious-domain blacklist once. The old code re-ran
        # this SELECT inside the loop below, once per suspicious domain.
        malicious_domains = set()
        if unknown_domains:
            with self.connection.cursor() as cursor:
                sql = "SELECT mydomain FROM malicious_domains;"
                cursor.execute(sql)
                malicious_records = cursor.fetchall()
            malicious_domains = {
                malicious_record["mydomain"]
                for malicious_record in malicious_records
            }

        # Record the suspicious external domains.
        for unknown_domain in unknown_domains:
            if unknown_domain in malicious_domains:
                # Known-bad domain: mark checked=1, result=1 with a timestamp.
                suspicious_item = SuspiciousItem(
                    ss_id, unknown_domain, 1, 1,
                    time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.localtime(time.time())))
            else:
                # Unknown domain: leave unchecked for manual review.
                suspicious_item = SuspiciousItem(ss_id, unknown_domain, 0, -1,
                                                 None)
            with self.connection.cursor() as cursor:
                sql = 'INSERT INTO suspicious_records (ss_id, unknown_domain, checked, result, ' \
                      'check_time) VALUES (%s, %s, %s, %s, %s)'
                result = cursor.execute(sql, suspicious_item.save_tuple())
                if result != 1:
                    logging.error("suspicious_records插入记录%s失败!",
                                  suspicious_item.save_tuple())

        self.connection.commit()
        self.connection.close()
        logging.info(self.downloader_item.request_url + " parse over.")