async def cache_owllook_novels_content(url, netloc):
    """Fetch a chapter page and extract its content, title and next-chapter data.

    The page at ``url`` is fetched through ``target_fetch`` and parsed with
    BeautifulSoup (``html5lib``).  The chapter body is located with the
    ``content_selector`` of ``RULES[netloc]``, tried in the order
    ``id`` -> ``class`` -> ``tag``.

    :param url: address of the chapter page; also passed to
        ``extract_pre_next_chapter`` as ``chapter_url``.
    :param netloc: key into ``RULES`` selecting the site's parsing rule.
    :return: ``None`` when nothing was fetched or no element matched the
        selector; otherwise a dict with the keys ``'content'`` (``str()`` of
        the matched element list), ``'next_chapter'`` and ``'title'``.
    """
    # The session is only needed for the fetch itself, so release it before
    # the parsing work starts.
    async with aiohttp.ClientSession() as client:
        html = await target_fetch(client=client, url=url)
    if not html:
        return None

    soup = BeautifulSoup(html, 'html5lib')
    selector = RULES[netloc].content_selector
    if selector.get('id'):
        content = soup.find_all(id=selector['id'])
    elif selector.get('class'):
        content = soup.find_all(class_=selector['class'])
    else:
        content = soup.find_all(selector.get('tag'))
    if not content:
        return None

    # Extract the real chapter title out of the page <title>.
    title_reg = r'(第?\s*[一二两三四五六七八九十○零百千万亿0-91234567890]{1,6}\s*[章回卷节折篇幕集]\s*.*?)[_,-]'
    # A page may have no <title>, or a <title> whose .string is None; fall
    # back to an empty string so the regex search cannot raise.
    page_title = (soup.title.string if soup.title else None) or ''
    matches = re.findall(title_reg, page_title, re.I)
    title = matches[0] if matches else page_title

    next_chapter = extract_pre_next_chapter(chapter_url=url, html=str(soup))
    return {
        'content': str(content),
        'next_chapter': next_chapter,
        'title': title,
    }
async def cache_owllook_novels_content(url, chapter_url, netloc):
    """Fetch a chapter page and extract its content, title and next-chapter data.

    The page is requested with a random user agent through ``target_fetch``;
    when that yields nothing, ``get_html_by_requests`` is tried with the same
    headers.  The HTML is parsed with BeautifulSoup (``html5lib``) and the
    chapter body is located with the ``content_selector`` of
    ``RULES[netloc]``, tried in the order ``id`` -> ``class`` -> ``tag``.

    The title is taken from, in order: the chapter pattern matched in the
    page ``<title>``, the text of the first ``<h1>``, the raw page title.

    :param url: address of the chapter page.
    :param chapter_url: forwarded unchanged to ``extract_pre_next_chapter``.
    :param netloc: key into ``RULES`` selecting the site's parsing rule.
    :return: ``None`` when nothing was fetched or no element matched the
        selector; otherwise a dict with the keys ``'content'`` (the matched
        elements' markup concatenated), ``'next_chapter'`` and ``'title'``.
    """
    headers = {'user-agent': await get_random_user_agent()}
    html = await target_fetch(headers=headers, url=url)
    if not html:
        # Fallback fetch; called without ``await``.
        # NOTE(review): if this performs blocking I/O it will stall the
        # event loop while it runs - confirm.
        html = get_html_by_requests(url=url, headers=headers)
    if not html:
        return None

    soup = BeautifulSoup(html, 'html5lib')
    selector = RULES[netloc].content_selector
    if selector.get('id'):
        content = soup.find_all(id=selector['id'])
    elif selector.get('class'):
        content = soup.find_all(class_=selector['class'])
    else:
        content = soup.find_all(selector.get('tag'))
    if not content:
        return None

    # Extract the real chapter title out of the page <title>.
    title_reg = r'(第?\s*[一二两三四五六七八九十○零百千万亿0-91234567890]{1,6}\s*[章回卷节折篇幕集]\s*.*?)[_,-]'
    # A page may have no <title>, or a <title> whose .string is None; fall
    # back to an empty string so the regex search cannot raise.
    page_title = (soup.title.string if soup.title else None) or ''
    matches = re.findall(title_reg, page_title, re.I)
    if matches:
        title = matches[0]
    else:
        # Not every page has an <h1>; guard the lookup instead of indexing
        # an empty result list.
        headings = soup.select('h1')
        title = headings[0].get_text() if headings else ''
    if not title:
        title = page_title

    next_chapter = extract_pre_next_chapter(
        url=url, chapter_url=chapter_url, html=str(soup))
    return {
        'content': ''.join(str(element) for element in content),
        'next_chapter': next_chapter,
        'title': title,
    }