def get_unsafe_out_chains(cls, format_html, top_domain):
    """Get the unsafe outbound links in a page."""
    out_chains = HtmlUtil.get_out_chains(format_html, top_domain)
    unsafe_out_chains = set()
    # Filter out the safe outbound links
    safe_chains = set()
    connection = pymysql.connect(**projectconfig.mysql_config)
    # Fetch the public safe outbound top domains
    with connection.cursor() as cursor:
        sql = 'SELECT mydomain FROM public_safe_out_chains;'
        cursor.execute(sql)
        pubsocs = cursor.fetchall()
        for pubsoc in pubsocs:
            safe_chains.add(pubsoc["mydomain"])
    # Fetch the private safe outbound top domains owned by this site
    with connection.cursor() as cursor:
        sql = 'SELECT mydomain FROM private_safe_out_chains WHERE owner=%s;'
        cursor.execute(sql, (top_domain,))
        prisocs = cursor.fetchall()
        for prisoc in prisocs:
            safe_chains.add(prisoc["mydomain"])
    connection.close()
    for out_chain in out_chains:
        # Unsafe: the top domain is not whitelisted and not a government or educational site
        if UrlUtil.get_top_domain(out_chain) not in safe_chains and not UrlUtil.is_gov_or_edu(out_chain):
            unsafe_out_chains.add(out_chain)
    return unsafe_out_chains

def __init__(self, start_url, exist_time=600, deepth=2, max_num=200):
    """
    :param start_url: the starting url
    :param exist_time: maximum duration of the task
    :param deepth: page collection depth
    :param max_num: maximum number of pages to collect
    """
    self.main_item = MainItem(start_url)
    self.main_item.task_id = str(uuid.uuid1())  # identifier of one collection task
    self.main_item.refer = ""
    self.main_item.deepth = 1  # the starting depth is, of course, 1
    self.start_url = start_url
    self.top_domain = UrlUtil.get_top_domain(start_url)
    self.exist_time = int(exist_time)
    self.deepth = int(deepth)
    self.max_num = int(max_num)

def get_out_chains(cls, format_html, top_domain):
    """Get the outbound links in a page."""
    tree = etree.HTML(format_html)
    hrefs = tree.xpath("//@href")  # all href attributes, i.e. the links in <a> and similar tags
    iframes = tree.xpath("//iframe/@src")  # the source links of all iframes
    jss = tree.xpath("//script/@src")  # all external script links
    hrefs.extend(iframes)
    hrefs.extend(jss)
    if hrefs:
        hrefs = href_clean(hrefs)
    else:
        hrefs = list()
    out_chains = set()
    for href in hrefs:
        if UrlUtil.get_top_domain(href) != top_domain:
            out_chains.add(href)
    return out_chains

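# The demo function below is not part of the original project; it is a minimal,
# hedged sketch of the three xpath queries used by get_out_chains, run on a
# made-up HTML snippet (all URLs are placeholders).
def _demo_out_chain_xpath():
    from lxml import etree
    sample = ('<html><body><a href="http://news.example.com/a">n</a>'
              '<iframe src="http://ads.example.net/f.html"></iframe>'
              '<script src="http://cdn.example.org/s.js"></script></body></html>')
    t = etree.HTML(sample)
    print(t.xpath("//@href"))        # -> ['http://news.example.com/a']
    print(t.xpath("//iframe/@src"))  # -> ['http://ads.example.net/f.html']
    print(t.xpath("//script/@src"))  # -> ['http://cdn.example.org/s.js']
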
def diff_out_chains(cls, htmls, urls):
    """
    Differing unsafe outbound links across several pages.
    htmls: list of html documents
    urls: list of urls corresponding to the html list
    return: the outbound links unique to each html; the last item is the
            difference of all the outbound-link sets
    """
    out_chainss = list()
    for html, url in zip(htmls, urls):
        if html is None:
            logging.error("None object has no out chains!")
            return []
        format_html = HtmlUtil.get_format_html(html, url)
        out_chains = HtmlUtil.get_unsafe_out_chains(format_html, UrlUtil.get_top_domain(url))
        out_chainss.append(out_chains)
    # Union minus intersection gives the links that are not shared by every page
    diff = get_union(out_chainss) - get_intersection(out_chainss)
    result = list()
    for out_chains in out_chainss:
        result.append(list(out_chains & diff))
    result.append(list(diff))
    return result

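# The helper below is not part of the original project; it is a small, hedged
# illustration (with made-up URLs) of the set arithmetic used by diff_out_chains:
# union minus intersection yields the links that appear on some pages but not on
# all of them, and intersecting each page's set with that difference gives the
# links not shared by every page.
def _demo_diff_logic():
    a = {"http://foo.example/x", "http://bar.example/y"}
    b = {"http://foo.example/x", "http://baz.example/z"}
    diff = (a | b) - (a & b)  # links present on some pages but not all
    print(sorted(a & diff))   # -> ['http://bar.example/y']
    print(sorted(b & diff))   # -> ['http://baz.example/z']
    print(sorted(diff))       # corresponds to the last item returned by diff_out_chains
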
def __init__(self, downloader_item):
    self.downloader_item = downloader_item
    self.connection = pymysql.connect(**projectconfig.mysql_config)  # establish the database connection
    self.redis_conn = redis.Redis.from_url(projectconfig.REDIS_URL)
    self.safe_chains = set()
    # Fetch the public safe outbound top domains
    with self.connection.cursor() as cursor:
        sql = 'SELECT mydomain FROM public_safe_out_chains;'
        cursor.execute(sql)
        pubsocs = cursor.fetchall()
        for pubsoc in pubsocs:
            self.safe_chains.add(pubsoc["mydomain"])
    # Fetch the private safe outbound top domains owned by the requested site
    with self.connection.cursor() as cursor:
        sql = 'SELECT mydomain FROM private_safe_out_chains WHERE owner=%s;'
        request_top_domain = UrlUtil.get_top_domain(downloader_item.request_url)
        cursor.execute(sql, (request_top_domain,))
        prisocs = cursor.fetchall()
        for prisoc in prisocs:
            self.safe_chains.add(prisoc["mydomain"])

def get_union(sets):
    """Return the union of multiple sets."""
    tmp = sets.copy()
    result = tmp.pop()
    for s in tmp:
        result = s | result
    return result


def get_intersection(sets):
    """Return the intersection of multiple sets."""
    tmp = sets.copy()
    result = tmp.pop()
    for s in tmp:
        result = s & result
    return result


if __name__ == "__main__":
    # html1, url1 = get_html_from_mysql(464)
    # html2, url2 = get_html_from_mysql(465)
    # html3, url3 = get_html_from_mysql(466)
    # import json
    # print(json.dumps(HtmlUtil.diff_out_chains(htmls=[html1, html2, html3], urls=[url1, url2, url3]), indent=4))
    html, url = get_html_from_mysql(504)
    format_html = HtmlUtil.get_format_html(html, url)
    my_out_chains = HtmlUtil.get_out_chains(format_html, UrlUtil.get_top_domain(url))
    with open("check504_ex.txt", "w", encoding="utf-8") as f:
        for my_out_chain in my_out_chains:
            f.write(my_out_chain + " ==> " + UrlUtil.get_top_domain(my_out_chain) + "\n")

def parse(self):
    if not isinstance(self.downloader_item, MainItem):
        logging.error("The param type is: " + str(type(self.downloader_item)) + ", but it should be MainItem.")
        return None
    html = self.downloader_item.html
    # Save the downloader_item to the database
    with self.connection.cursor() as cursor:
        sql = 'INSERT INTO snapshot (request_url, final_url, load_time, refer, get_time,' \
              ' task_id, send_ip, server_ip, deepth) VALUES (%s, %s, %s, %s, %s, %s, %s, ' \
              '%s, %s);'
        result = cursor.execute(sql, self.downloader_item.save_tuple())
        if result != 1:
            logging.error("Failed to insert record " + str(self.downloader_item.save_tuple()) + " into snapshot!")
    # Get the id of the record just inserted
    with self.connection.cursor() as cursor:
        sql = 'SELECT last_insert_id() as ss_id;'
        cursor.execute(sql)
        result = cursor.fetchone()
        ss_id = result["ss_id"]
    # Save the page content to the database
    ss_html = SsHtmlItem(ss_id=ss_id, html=html)
    with self.connection.cursor() as cursor:
        sql = 'INSERT INTO ss_html (ss_id, html) VALUES (%s, %s);'
        result = cursor.execute(sql, ss_html.save_tuple())
        if result != 1:
            logging.error("Failed to insert record " + str(ss_html.save_tuple()) + " into ss_html!")
    # Normalize the links inside the page
    final_protocol = UrlUtil.get_protocol(self.downloader_item.final_url)
    final_domain = UrlUtil.get_domain(self.downloader_item.final_url)
    format_html = HtmlUtil.parse_protocol(html, final_protocol)
    format_html = HtmlUtil.parse_domain(format_html, final_protocol, final_domain)
    tree = etree.HTML(format_html)
    hrefs = tree.xpath("//@href")  # all href attributes, i.e. the links in <a> and similar tags
    iframes = tree.xpath("//iframe/@src")  # the source links of all iframes
    jss = tree.xpath("//script/@src")  # all external script links
    hrefs.extend(iframes)
    hrefs.extend(jss)
    if hrefs:
        hrefs = href_clean(hrefs)
    else:
        hrefs = list()
    inner_chains = set()  # internal links, returned to the engine for iteration
    unknown_domains = set()  # suspicious outbound top domains, stored for manual review
    request_top_domain = UrlUtil.get_top_domain(self.downloader_item.request_url)
    for href in hrefs:
        this_top_domain = UrlUtil.get_top_domain(href)
        if request_top_domain == this_top_domain and UrlUtil.get_url_suffix(href) != "js":
            inner_chains.add(href)
        elif this_top_domain not in self.safe_chains and not UrlUtil.is_gov_or_edu(href):
            # The top domain is not whitelisted and not a government or educational site
            unknown_domains.add(this_top_domain)
    # Wrap the internal links that need iteration and push them into redis
    logging.info("Length of inner_chains is " + str(len(inner_chains)))
    dup_set_name = "engine:dup_set:" + str(self.downloader_item.task_id)
    queue_name = "engine:queue:" + str(self.downloader_item.task_id)
    for inner_chain in inner_chains:
        if isinstance(self.redis_conn.ttl(dup_set_name), int):
            sadd_re = self.redis_conn.sadd(dup_set_name, inner_chain)
            if sadd_re == 1:
                # 1 means the insert succeeded and there is no duplicate, saving a separate membership check
                new_main_item = MainItem(
                    inner_chain,
                    refer=self.downloader_item.final_url,
                    task_id=self.downloader_item.task_id,
                    deepth=self.downloader_item.deepth + 1)
                self.redis_conn.lpush(
                    queue_name,
                    json.dumps(new_main_item, default=main_item_to_json))
    # Save the suspicious outbound domains to the database
    with self.connection.cursor() as cursor:
        sql = "SELECT mydomain FROM malicious_domains;"
        cursor.execute(sql)
        malicious_records = cursor.fetchall()
        malicious_domains = set([
            malicious_record["mydomain"]
            for malicious_record in malicious_records
        ])
    for unknown_domain in unknown_domains:
        if unknown_domain in malicious_domains:
            suspicious_item = SuspiciousItem(
                ss_id, unknown_domain, 1, 1,
                time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
        else:
            suspicious_item = SuspiciousItem(ss_id, unknown_domain, 0, -1, None)
        with self.connection.cursor() as cursor:
            sql = 'INSERT INTO suspicious_records (ss_id, unknown_domain, checked, result, ' \
                  'check_time) VALUES (%s, %s, %s, %s, %s)'
            result = cursor.execute(sql, suspicious_item.save_tuple())
            if result != 1:
                logging.error("Failed to insert record " + str(suspicious_item.save_tuple()) + " into suspicious_records!")
    self.connection.commit()
    self.connection.close()
    logging.info(self.downloader_item.request_url + " parse over.")

def parse_by_task_id(cls, task_id):
    connection = pymysql.connect(**projectconfig.mysql_config)  # establish the database connection
    # Read the blacklist
    sql = "SELECT mydomain FROM malicious_domains;"
    with connection.cursor() as cursor:
        cursor.execute(sql)
        malicious_records = cursor.fetchall()
        malicious_domains = set([
            malicious_record["mydomain"]
            for malicious_record in malicious_records
        ])
    sql = "SELECT id,request_url FROM snapshot WHERE task_id=%s;"
    with connection.cursor() as cursor:
        cursor.execute(sql, (task_id,))
        items = cursor.fetchall()
    urls = dict()  # keyed by url: group the download-result ids of the same url from different regions
    for item in items:
        id = item["id"]
        request_url = item["request_url"]
        if request_url not in urls:
            urls[request_url] = [id]
        else:
            urls[request_url].append(id)
    for url in urls.keys():
        htmls = list()
        for html_id in urls.get(url):
            html, final_url = get_html_from_mysql(html_id=html_id)
            format_html = HtmlUtil.get_format_html(html=html, final_url=final_url)
            htmls.append(format_html)
        diff_out_chains = HtmlUtil.diff_out_chains_from_same_url(htmls=htmls, url=url)
        for i in range(0, len(urls.get(url))):
            sql = "INSERT INTO private_out_chain_records (ss_id, out_chain, checked, result, check_time) " \
                  "VALUES (%s, %s, %s, %s, %s)"
            for diff_out_chain in diff_out_chains[i]:
                with connection.cursor() as cursor:
                    if UrlUtil.get_top_domain(diff_out_chain) in malicious_domains:
                        private_out_chain_record_item = PrivateOutChainRecordItem(
                            urls.get(url)[i], diff_out_chain, 1, 1, None)
                    else:
                        private_out_chain_record_item = PrivateOutChainRecordItem(
                            urls.get(url)[i], diff_out_chain, 0, -1, None)
                    result = cursor.execute(
                        sql, private_out_chain_record_item.save_tuple())
                    if result != 1:
                        logging.error(
                            "Failed to insert record " +
                            str(private_out_chain_record_item.save_tuple()) +
                            " into private_out_chain_records!")
        logging.info("url: " + url + " compare over.")
    with connection.cursor() as cursor:
        sql = "UPDATE download_tasks SET compared=1 WHERE task_id=%s;"
        result = cursor.execute(sql, (task_id,))
        if result != 1:
            logging.error("Update table download_tasks failed!")
    connection.commit()
    connection.close()