Example #1
import logging
import re


def href_clean(hrefs):
    """Clean the href attributes extracted from <a> tags, dropping entries that are not web pages."""
    result = list()
    not_web_page = ("ico", "svg", "css", "xml", "png", "jpg", "jpeg",
                    "gif")  # these extensions are not web pages, drop them
    if isinstance(hrefs, list):
        for href in hrefs:
            # keep the entry only if it is a URL and points to a web page
            if re.match(r'[a-zA-Z]+://[^\s]*', href) \
                    and UrlUtil.get_url_suffix(href) not in not_web_page:
                # convert the lxml.etree._ElementUnicodeResult to str and strip surrounding whitespace
                href = str(href).strip()
                result.append(href)
    else:
        logging.error("Param type error, it should be list.")
    return result
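
A minimal usage sketch for href_clean, assuming a hypothetical stand-in for the project's UrlUtil.get_url_suffix helper (the real helper is not shown in these examples); the sample list mixes a page link, an asset link, and a relative link to show what gets filtered out.

from urllib.parse import urlparse


class UrlUtil:
    # hypothetical stand-in for the project's UrlUtil helper (assumption)
    @staticmethod
    def get_url_suffix(url):
        # return the extension of the URL path, e.g. "png" for ".../logo.png"
        path = urlparse(url).path
        return path.rsplit(".", 1)[-1].lower() if "." in path else ""


hrefs = [
    "https://example.com/index.html",  # kept: an absolute URL to a web page
    "https://example.com/logo.png",    # dropped: image extension, not a web page
    "/about.html",                     # dropped: no scheme, so the URL regex does not match
]
print(href_clean(hrefs))  # ['https://example.com/index.html']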
Example #2
    def parse(self):
        if not isinstance(self.downloader_item, MainItem):
            logging.error("The param type is: " +
                          str(type(self.downloader_item)) +
                          ", but it should be MainItem.")
            return None
        html = self.downloader_item.html

        # persist the downloader_item to the database
        with self.connection.cursor() as cursor:
            sql = 'INSERT INTO snapshot (request_url, final_url, load_time, refer, get_time,' \
                  ' task_id, send_ip, server_ip, deepth) VALUES (%s, %s, %s, %s, %s, %s, %s, ' \
                  '%s, %s);'
            result = cursor.execute(sql, self.downloader_item.save_tuple())
            if result != 1:
                logging.error("snapshot插入记录" +
                              self.downloader_item.save_tuple() + "失败!")

        # fetch the auto-increment id of the row just inserted
        with self.connection.cursor() as cursor:
            sql = 'SELECT last_insert_id() as ss_id;'
            cursor.execute(sql)
            result = cursor.fetchone()
            ss_id = result["ss_id"]

        # persist the page content to the database
        ss_html = SsHtmlItem(ss_id=ss_id, html=html)
        with self.connection.cursor() as cursor:
            sql = 'INSERT INTO ss_html (ss_id, html) VALUES (%s, %s);'
            result = cursor.execute(sql, ss_html.save_tuple())
            if result != 1:
                logging.error("ss_html插入记录" + ss_html.save_tuple() + "失败!")

        # normalize the links inside the page
        final_protocol = UrlUtil.get_protocol(self.downloader_item.final_url)
        final_domain = UrlUtil.get_domain(self.downloader_item.final_url)
        format_html = HtmlUtil.parse_protocol(html, final_protocol)
        format_html = HtmlUtil.parse_domain(format_html, final_protocol,
                                            final_domain)

        tree = etree.HTML(format_html)
        hrefs = tree.xpath("//@href")  # collect the links found in <a> tags
        iframes = tree.xpath("//iframe/@src")  # collect the src links of all iframes
        jss = tree.xpath("//script/@src")  # collect all the js links
        hrefs.extend(iframes)
        hrefs.extend(jss)
        if hrefs:
            hrefs = href_clean(hrefs)
        else:
            hrefs = list()
        inner_chains = set()  # inner links, returned to the engine for iteration
        unknown_domains = set()  # suspicious external top-level domains, stored for manual review
        request_top_domain = UrlUtil.get_top_domain(
            self.downloader_item.request_url)
        for href in hrefs:
            this_top_domain = UrlUtil.get_top_domain(href)
            if request_top_domain == this_top_domain and UrlUtil.get_url_suffix(
                    href) != "js":
                inner_chains.add(href)
            elif this_top_domain not in self.safe_chains and not UrlUtil.is_gov_or_edu(
                    href):
                # the top-level domain is not in the whitelist and the site is not a government or education institution
                unknown_domains.add(this_top_domain)

        # wrap the inner links that need iterating and push them into redis
        logging.info("Length of inner_chains is " + str(len(inner_chains)))
        dup_set_name = "engine:dup_set:" + str(self.downloader_item.task_id)
        queue_name = "engine:queue:" + str(self.downloader_item.task_id)
        for inner_chain in inner_chains:
            if isinstance(self.redis_conn.ttl(dup_set_name), int):
                sadd_re = self.redis_conn.sadd(dup_set_name, inner_chain)
                if sadd_re == 1:  # 1 means the insert above succeeded with no duplicate, saving an extra dedup lookup
                    new_main_item = MainItem(
                        inner_chain,
                        refer=self.downloader_item.final_url,
                        task_id=self.downloader_item.task_id,
                        deepth=self.downloader_item.deepth + 1)
                    self.redis_conn.lpush(
                        queue_name,
                        json.dumps(new_main_item, default=main_item_to_json))
        # persist the suspicious external domains to the database
        for unknown_domain in unknown_domains:
            with self.connection.cursor() as cursor:
                sql = "SELECT mydomain FROM malicious_domains;"
                cursor.execute(sql)
                malicious_records = cursor.fetchall()
            malicious_domains = set([
                malicious_record["mydomain"]
                for malicious_record in malicious_records
            ])
            if unknown_domain in malicious_domains:
                suspicious_item = SuspiciousItem(
                    ss_id, unknown_domain, 1, 1,
                    time.strftime('%Y-%m-%d %H:%M:%S',
                                  time.localtime(time.time())))
            else:
                suspicious_item = SuspiciousItem(ss_id, unknown_domain, 0, -1,
                                                 None)
            with self.connection.cursor() as cursor:
                sql = 'INSERT INTO suspicious_records (ss_id, unknown_domain, checked, result, ' \
                      'check_time) VALUES (%s, %s, %s, %s, %s)'
                result = cursor.execute(sql, suspicious_item.save_tuple())
                if result != 1:
                    logging.error("suspicious_records插入记录" +
                                  suspicious_item.save_tuple() + "失败!")

        self.connection.commit()
        self.connection.close()
        logging.info(self.downloader_item.request_url + " parse over.")
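
The redis part of parse() relies on sadd acting as both a duplicate check and an insert. Below is a minimal sketch of that dedup-set plus work-queue pattern, assuming a locally reachable redis instance and illustrative key names and payload (the real code serializes a MainItem with main_item_to_json).

import json

import redis

# assumption: a local redis instance; the key names mirror the ones built in parse()
redis_conn = redis.Redis(host="localhost", port=6379, decode_responses=True)
task_id = 1  # placeholder task id
dup_set_name = "engine:dup_set:" + str(task_id)
queue_name = "engine:queue:" + str(task_id)

for url in ["https://example.com/a.html", "https://example.com/a.html"]:
    # sadd returns 1 only when the member was not already in the set,
    # so the membership test and the insert happen in one round trip
    if redis_conn.sadd(dup_set_name, url) == 1:
        payload = {"request_url": url, "deepth": 1}  # stand-in for a MainItem
        redis_conn.lpush(queue_name, json.dumps(payload))

The original method additionally guards each push with a ttl() call on the dedup set; under older redis-py versions ttl() returns None for a missing key, so checking that the result is an int appears to serve as a liveness check on the set before enqueuing.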