def __dl_in_element_style_img(self, soup, url):
    """Download images referenced by inline ``style`` attributes and rewrite their URLs.

    Handles markup like ``<xx style='background: url(xxxxx.jpg)'>``.

    :param soup: BeautifulSoup document, mutated in place.
    :param url: absolute URL of the page the soup was parsed from, used to
        resolve relative resource links.
    :return: None
    """
    # NOTE(review): matching is case-sensitive, so 'URL(...)' styles are
    # missed — TODO confirm whether uppercase should be supported.
    inner_style_node = soup.find_all(style=re.compile("url(.*?)"))
    for style in inner_style_node:
        # Process every url(...) occurrence, not only the first match
        # (fixes the original TODO; also avoids IndexError when the
        # attribute regex matched but findall comes back empty).
        for raw_token in re.findall(r'url\(.*?\)', style.get("style")):
            resource_url = self.__get_style_url_link(raw_token)
            if is_inline_resource(resource_url):  # skip inline base64 images
                continue
            abs_link = get_abs_url(url, resource_url)
            if self.is_ref_model:
                # Reference mode: rewrite to the absolute URL, no download.
                style['style'] = style['style'].replace(resource_url, abs_link)
            elif is_same_web_site_link(url, abs_link) or self.is_grab_outer_link:
                file_name = get_file_name_from_url(abs_link, self.file_name_dup_checker, 'png')
                file_save_path = f"{self.__get_img_full_path()}/{file_name}"
                replace_url = f"{self.img_dir}/{file_name}"
                style['style'] = style['style'].replace(resource_url, replace_url)
                self.__url_enqueue(abs_link, file_save_path, self.FILE_TYPE_BIN)
            else:
                # Outer link we are not grabbing: normalize to absolute.
                style['style'] = style['style'].replace(resource_url, abs_link)
def __get_same_site_link(self, soup, url):
    """Collect every same-site page URL from the soup's ``<a>`` tags.

    Each href is also rewritten in place to its absolute, formatted form.

    :param soup: BeautifulSoup document, mutated in place.
    :param url: absolute URL of the current page, used to resolve relative hrefs.
    :return: list of newly discovered absolute page URLs (deduplicated,
        insertion order preserved); empty unless ``is_full_site`` is set.
    """
    new_url = []
    if not self.is_full_site:
        return new_url
    # find_all() always returns a (possibly empty) ResultSet, never None,
    # so the original "a_list is not None" guard was dead code.
    for a in soup.find_all("a"):
        try:
            raw_link = a.get("href")
            if raw_link is None or raw_link.startswith("#") or not is_page_url(raw_link):
                continue
            # Newly produced url: absolute, then normalized.
            abs_link = format_url(get_abs_url(url, raw_link))
            a['href'] = abs_link
            if is_page_url(abs_link) and not is_img_ext(abs_link) and abs_link not in new_url:
                new_url.append(abs_link)
        except Exception as e:
            # Best-effort: one malformed tag must not abort the whole scan.
            self.logger.info("%s: %s", a, e)
            self.logger.exception(e)
            continue
    return new_url
def __dl_img(self, soup, url):
    """Download ``<img>`` resources and rewrite their ``src`` attributes.

    :param soup: BeautifulSoup document, mutated in place.
    :param url: absolute URL of the page, used to resolve relative srcs.
    :return: None
    """
    for img in soup.find_all("img"):
        raw_link = img.get("src")
        # Skip missing src and base64-inlined images (<img src='data:image...'/>).
        if raw_link is None or is_inline_resource(raw_link):
            continue
        abs_link = get_abs_url(url, raw_link)
        if self.is_ref_model:
            img['src'] = abs_link
        elif is_same_web_site_link(url, abs_link) or self.is_grab_outer_link:
            file_name = get_file_name_from_url(abs_link, self.file_name_dup_checker, "png")
            file_save_path = f"{self.__get_img_full_path()}/{file_name}"
            img['src'] = f"{self.img_dir}/{file_name}"
            self.__url_enqueue(abs_link, file_save_path, self.FILE_TYPE_BIN)
        else:
            # Outer link we are not grabbing: normalize to absolute.
            img['src'] = abs_link
        # Drop cross-origin locking / integrity checks so the saved page loads.
        # BUGFIX: the original looked up 'crossorigin ' (trailing space),
        # which never matches the real attribute name, so the attribute
        # was never removed; now consistent with __dl_link / __dl_js.
        if img.get("crossorigin") is not None:
            del img['crossorigin']
        if img.get("integrity") is not None:
            del img['integrity']
async def __dl_link(self, soup, url):
    """Fetch resources referenced by ``<link>`` tags and rewrite their hrefs.

    CSS files are downloaded, their inner ``url(...)`` references rewritten,
    and saved locally; image links are queued as binary downloads.

    :param soup: BeautifulSoup document, mutated in place.
    :param url: absolute URL of the page, used to resolve relative hrefs.
    :return: None
    """
    link_tags = soup.find_all("link")
    if link_tags is None:
        return
    for link in link_tags:
        raw_link = link.get("href")
        if raw_link is None:
            continue
        abs_link = get_abs_url(url, raw_link)
        if self.is_ref_model:
            link['href'] = abs_link
        elif is_same_web_site_link(url, abs_link) or self.is_grab_outer_link:
            # Same-site resource (or the outer-link switch is on): grab it.
            file_name = get_file_name_from_url(abs_link, self.file_name_dup_checker, 'css')
            if is_img_ext(file_name):
                file_save_path = f"{self.__get_img_full_path()}/{file_name}"
                replace_url = f"{self.img_dir}/{file_name}"
                self.__url_enqueue(abs_link, file_save_path, self.FILE_TYPE_BIN)
            else:
                file_save_path = f"{self.__get_css_full_path()}/{file_name}"
                replace_url = f"{self.css_dir}/{file_name}"
                if not self.__is_dup(abs_link, file_save_path):
                    resp_text, _ = await self.__async_get_request_text(
                        abs_link, force_as_text=True)
                    if resp_text is not None:
                        # Rewrite the css's own resource urls before persisting.
                        content = await self.__replace_and_grab_css_url(
                            abs_link, resp_text)
                        self.__set_dup_url(abs_link, file_save_path)
                        await self.__async_save_text_file(
                            content, file_save_path)  # persist the css file
                    else:
                        self.__log_error_resource(abs_link, file_save_path)
            link['href'] = replace_url
        else:
            # Not grabbing this one: just fix the href to an absolute link.
            link['href'] = abs_link
        # Disable cross-origin locking and integrity verification.
        if link.get("crossorigin") is not None:
            del link['crossorigin']
        if link.get('integrity') is not None:
            del link['integrity']
async def __process_in_html_css_resource(self, soup, url):
    """Rewrite ``url(...)`` references inside inline ``<style>`` blocks.

    Referenced resources are queued for download and the css text is
    rewritten to point at the local copies (or absolute URLs).

    :param soup: BeautifulSoup document, mutated in place.
    :param url: absolute URL of the page, used to resolve relative links.
    :return: None
    """
    style_tags = soup.find_all("style")
    if style_tags is None:
        return
    for style in style_tags:
        css_content = style.text
        if css_content is None:
            continue
        # re.findall always returns a list (possibly empty), so the original
        # "is not None" guard was dead code; raw string avoids the invalid
        # escape sequence warning for '\('.
        for raw_u in re.findall(r'url\(.*?\)', css_content):
            u = self.__get_style_url_link(raw_u)  # url('xxx') / url(xxx) -> xxx
            if is_inline_resource(u):  # skip inline base64 images
                continue
            abs_link = get_abs_url(url, u)
            if self.is_ref_model:
                css_content = css_content.replace(raw_u, f'url({abs_link})')
            elif is_same_web_site_link(url, abs_link) or self.is_grab_outer_link:
                file_name = get_file_name_from_url(abs_link, self.file_name_dup_checker)
                if is_img_ext(file_name):
                    file_save_path = f"{self.__get_img_full_path()}/{file_name}"
                    replace_url = f"{self.img_dir}/{file_name}"
                else:
                    file_save_path = f"{self.__get_css_full_path()}/{file_name}"
                    replace_url = f"{self.css_dir}/{file_name}"
                css_content = css_content.replace(raw_u, f'url({replace_url})')
                self.__url_enqueue(abs_link, file_save_path, self.FILE_TYPE_BIN)
            else:
                # Normalize, notably the protocol-relative //host/... form.
                css_content = css_content.replace(raw_u, f'url({abs_link})')
        style.string = css_content
def __dl_js(self, soup, url):
    """Download ``<script src=...>`` files and rewrite their src attributes.

    :param soup: BeautifulSoup document, mutated in place.
    :param url: absolute URL of the page, used to resolve relative srcs.
    :return: None
    """
    for tag in soup.find_all("script"):
        raw_link = tag.get("src")
        if raw_link is None:
            continue  # inline script, nothing to fetch
        abs_link = get_abs_url(url, raw_link)
        if self.is_ref_model:
            tag['src'] = abs_link
        elif is_same_web_site_link(url, abs_link) or self.is_grab_outer_link:
            # External js is left alone unless the outer-link switch is on.
            file_name = get_file_name_from_url(abs_link, self.file_name_dup_checker, "js")
            file_save_path = f"{self.__get_js_full_path()}/{file_name}"
            tag['src'] = f"{self.js_dir}/{file_name}"
            self.__url_enqueue(abs_link, file_save_path, self.FILE_TYPE_TEXT)
            # Disable cross-origin locking and integrity verification.
            if tag.get("crossorigin") is not None:
                del tag['crossorigin']
            if tag.get('integrity') is not None:
                del tag['integrity']
        else:
            tag['src'] = abs_link
async def __replace_and_grab_css_url(self, url, text):
    """Rewrite ``url(...)`` references inside a css file and grab them.

    Also resolves nested css imports, e.g.::

        @import url(font-awesome.min.css);
        @import "https://fonts.googleapis.com/css?family=Montserrat:700|...";

    :param url: absolute URL the css *text* was fetched from.
    :param text: css source text.
    :return: css text with resource URLs rewritten to local paths.
    """
    # NOTE(review): matching is case-sensitive; 'URL(...)' is missed —
    # TODO confirm whether uppercase should be supported.
    for token in re.findall(r"url\(.*?\)", text):
        relative_u = self.__get_style_url_link(token)
        if is_inline_resource(relative_u):  # skip inline base64 images
            continue
        abs_link = get_abs_url(url, relative_u)
        if relative_u.endswith("css"):
            # @import url(xx.css) is handled in the import pass below;
            # skipping here prevents double processing.
            self.logger.warning("skip css file, grab in the next step: %s", abs_link)
            continue
        if self.is_grab_outer_link:
            # With outer links enabled, every resource inside the css is
            # grabbed unconditionally, same-site or not.
            file_name = get_file_name_from_url(abs_link, self.file_name_dup_checker, 'css')
            if is_img_ext(file_name):
                file_save_path = f"{self.__get_img_full_path()}/{file_name}"
                # Paths are relative to the css file; images sit one level up.
                replace_url = f"../{self.img_dir}/{file_name}"
            else:
                file_save_path = f"{self.__get_css_full_path()}/{file_name}"
                # Sibling of the css file, so the bare file name suffices.
                replace_url = f"{file_name}"
            self.__url_enqueue(abs_link, file_save_path, self.FILE_TYPE_BIN)
            text = text.replace(relative_u, replace_url)
    # @import "xxx" form (raw strings fix the invalid '\s' escapes).
    imported_css = re.findall(r'@import\s+["\']+(.*?)["\']', text)
    # @import url(xxx) form, e.g. ['@import url(font-awesome.min.css)'].
    # re.findall never returns None, so no guard is needed.
    for stmt in re.findall(r'@import\s+url\(.*?\)', text):
        imported_css.append(self.__get_style_url_link(stmt.split()[1]))
    for u in imported_css:
        # The "http" prefix already covers "https"; the original tuple
        # ("http", "https") was redundant.
        abs_link = u if u.startswith("http") else get_abs_url(url, u)
        file_name = get_file_name_from_url(abs_link, self.file_name_dup_checker, 'css')
        file_save_path = f"{self.__get_css_full_path()}/{file_name}"
        if self.__is_dup(abs_link, file_save_path):
            continue
        resp_text, _ = await self.__async_get_request_text(abs_link)
        if resp_text is not None:
            # Recursively resolve the imported css's own resources.
            text_content = await self.__replace_and_grab_css_url(abs_link, resp_text)
            self.__set_dup_url(abs_link, file_save_path)
            # Point the @import at the locally saved file name before saving.
            text_content = text_content.replace(u, file_name)
            await self.__async_save_text_file(text_content, file_save_path)  # persist the css file
        else:
            # BUGFIX: log the failing resource (abs_link), not the parent
            # css url — consistent with __dl_link's error logging.
            self.__log_error_resource(abs_link, file_save_path)
    return text