    def get_format_html(cls, html, final_url):
        """Return the normalized HTML of a page.

        Protocol-relative and root-relative links are rewritten against the
        protocol and domain of the page's final URL.
        """
        final_protocol = UrlUtil.get_protocol(final_url)
        final_domain = UrlUtil.get_domain(final_url)
        format_html = HtmlUtil.parse_protocol(html, final_protocol)
        format_html = HtmlUtil.parse_domain(format_html, final_protocol, final_domain)
        return format_html
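    # Illustrative sketch only (the exact output format is an assumption, not
    # taken from UrlUtil/HtmlUtil): get_format_html rewrites protocol-relative
    # and root-relative links against the final URL, roughly like
    #
    #   html = '<script src="//cdn.example.com/a.js"></script><a href="/about">x</a>'
    #   cls.get_format_html(html, "https://www.example.com/index.html")
    #   # -> '<script src="https://cdn.example.com/a.js"></script>'
    #   #    '<a href="https://www.example.com/about">x</a>'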
    def parse(self):
        if not isinstance(self.downloader_item, MainItem):
            logging.error("The param type is: " + str(type(self.downloader_item)) +
                          ", but it should be MainItem.")
            return None
        html = self.downloader_item.html

        # Persist the downloader_item as a snapshot record.
        with self.connection.cursor() as cursor:
            sql = 'INSERT INTO snapshot (request_url, final_url, load_time, refer, get_time,' \
                  ' task_id, send_ip, server_ip, deepth) VALUES (%s, %s, %s, %s, %s, %s, %s, ' \
                  '%s, %s);'
            result = cursor.execute(sql, self.downloader_item.save_tuple())
            if result != 1:
                logging.error("Failed to insert record " + str(self.downloader_item.save_tuple()) +
                              " into snapshot!")

        # Fetch the auto-increment id of the row just inserted.
        with self.connection.cursor() as cursor:
            sql = 'SELECT last_insert_id() as ss_id;'
            cursor.execute(sql)
            result = cursor.fetchone()
            ss_id = result["ss_id"]

        # Persist the page content.
        ss_html = SsHtmlItem(ss_id=ss_id, html=html)
        with self.connection.cursor() as cursor:
            sql = 'INSERT INTO ss_html (ss_id, html) VALUES (%s, %s);'
            result = cursor.execute(sql, ss_html.save_tuple())
            if result != 1:
                logging.error("Failed to insert record " + str(ss_html.save_tuple()) +
                              " into ss_html!")

        # Normalize the links inside the page.
        final_protocol = UrlUtil.get_protocol(self.downloader_item.final_url)
        final_domain = UrlUtil.get_domain(self.downloader_item.final_url)
        format_html = HtmlUtil.parse_protocol(html, final_protocol)
        format_html = HtmlUtil.parse_domain(format_html, final_protocol, final_domain)

        tree = etree.HTML(format_html)
        hrefs = tree.xpath("//@href")          # every href attribute (links in <a> tags etc.)
        iframes = tree.xpath("//iframe/@src")  # src of every iframe
        jss = tree.xpath("//script/@src")      # src of every external script
        hrefs.extend(iframes)
        hrefs.extend(jss)
        if hrefs:
            hrefs = href_clean(hrefs)
        else:
            hrefs = list()

        inner_chains = set()     # internal links, handed back to the engine for further crawling
        unknown_domains = set()  # suspicious external top domains, stored for manual review
        request_top_domain = UrlUtil.get_top_domain(self.downloader_item.request_url)
        for href in hrefs:
            this_top_domain = UrlUtil.get_top_domain(href)
            if request_top_domain == this_top_domain and UrlUtil.get_url_suffix(href) != "js":
                inner_chains.add(href)
            elif this_top_domain not in self.safe_chains and not UrlUtil.is_gov_or_edu(href):
                # The top domain is neither whitelisted nor a government/education site.
                unknown_domains.add(this_top_domain)

        # Wrap the internal links that still need crawling and push them to redis.
        logging.info("Length of inner_chains is " + str(len(inner_chains)))
        dup_set_name = "engine:dup_set:" + str(self.downloader_item.task_id)
        queue_name = "engine:queue:" + str(self.downloader_item.task_id)
        for inner_chain in inner_chains:
            if isinstance(self.redis_conn.ttl(dup_set_name), int):
                sadd_re = self.redis_conn.sadd(dup_set_name, inner_chain)
                if sadd_re == 1:
                    # 1 means the link was not yet in the dedup set, so the add
                    # itself doubles as the duplicate check.
                    new_main_item = MainItem(
                        inner_chain,
                        refer=self.downloader_item.final_url,
                        task_id=self.downloader_item.task_id,
                        deepth=self.downloader_item.deepth + 1)
                    self.redis_conn.lpush(
                        queue_name,
                        json.dumps(new_main_item, default=main_item_to_json))

        # Persist the suspicious external domains.
        for unknown_domain in unknown_domains:
            with self.connection.cursor() as cursor:
                sql = "SELECT mydomain FROM malicious_domains;"
                cursor.execute(sql)
                malicious_records = cursor.fetchall()
                malicious_domains = set([
                    malicious_record["mydomain"]
                    for malicious_record in malicious_records
                ])
            if unknown_domain in malicious_domains:
                suspicious_item = SuspiciousItem(
                    ss_id, unknown_domain, 1, 1,
                    time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
            else:
                suspicious_item = SuspiciousItem(ss_id, unknown_domain, 0, -1, None)
            with self.connection.cursor() as cursor:
                sql = 'INSERT INTO suspicious_records (ss_id, unknown_domain, checked, result, ' \
                      'check_time) VALUES (%s, %s, %s, %s, %s)'
                result = cursor.execute(sql, suspicious_item.save_tuple())
                if result != 1:
                    logging.error("Failed to insert record " + str(suspicious_item.save_tuple()) +
                                  " into suspicious_records!")

        self.connection.commit()
        self.connection.close()
        logging.info(self.downloader_item.request_url + " parse over.")