import re
from urllib.parse import parse_qs, urljoin, urlparse

import aiohttp
import async_timeout
from bs4 import BeautifulSoup

# Project-level helpers -- RULES, LATEST_RULES, LOGGER, MotorBase, target_fetch,
# get_html_by_requests, get_random_user_agent, get_time and
# extract_pre_next_chapter -- are imported from the surrounding owllook modules.


async def cache_owllook_novels_chapter(url, netloc):
    headers = {'user-agent': await get_random_user_agent()}
    html = await target_fetch(headers=headers, url=url)
    if not html:
        # Fall back to a blocking requests-based fetch when the async fetch fails.
        html = get_html_by_requests(url=url, headers=headers)
    if html:
        soup = BeautifulSoup(html, 'html5lib')
        selector = RULES[netloc].chapter_selector
        if selector.get('id', None):
            content = soup.find_all(id=selector['id'])
        elif selector.get('class', None):
            content = soup.find_all(class_=selector['class'])
        else:
            content = soup.find_all(selector.get('tag'))
        # Crude guard against chapters hidden with display:none: dropping the
        # literal "style" disables every inline style attribute.
        return str(content).replace('style', '') if content else None
    return None
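
# Minimal usage sketch for the chapter-list cache above, assuming an event loop
# is available; the URL and netloc are hypothetical and must match a RULES entry.
async def _demo_cache_chapter():
    html = await cache_owllook_novels_chapter(
        url='http://www.example.com/book/1/',  # hypothetical chapter-list URL
        netloc='www.example.com')              # hypothetical key into RULES
    if html:
        print(html[:200])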
async def cache_owllook_novels_content(url, chapter_url, netloc):
    headers = {'user-agent': await get_random_user_agent()}
    html = await target_fetch(headers=headers, url=url)
    if not html:
        html = get_html_by_requests(url=url, headers=headers)
    if html:
        soup = BeautifulSoup(html, 'html5lib')
        selector = RULES[netloc].content_selector
        if selector.get('id', None):
            content = soup.find_all(id=selector['id'])
        elif selector.get('class', None):
            content = soup.find_all(class_=selector['class'])
        else:
            content = soup.find_all(selector.get('tag'))
        if content:
            # Extract the real chapter title from the page <title>.
            title_reg = r'(第?\s*[一二两三四五六七八九十○零百千万亿0-91234567890]{1,6}\s*[章回卷节折篇幕集]\s*.*?)[_,-]'
            title = soup.title.string if soup.title else ''
            extract_title = re.findall(title_reg, title, re.I)
            if extract_title:
                title = extract_title[0]
            else:
                # Guard against pages without an <h1>; the unguarded index
                # lookup used to raise IndexError here.
                h1 = soup.select('h1')
                title = h1[0].get_text() if h1 else None
            if not title and soup.title:
                title = soup.title.string
            # if "_" in title:
            #     title = title.split('_')[0]
            # elif "-" in title:
            #     title = title.split('-')[0]
            next_chapter = extract_pre_next_chapter(url=url, chapter_url=chapter_url, html=str(soup))
            content = [str(i) for i in content]
            data = {
                'content': str(''.join(content)),
                'next_chapter': next_chapter,
                'title': title
            }
        else:
            data = None
        return data
    return None
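
# Worked example (illustrative) of the title extraction above: the regex keeps
# only the "第...章 ..." chapter part of a typical page <title> and drops the
# site-name suffix after the separator. The sample title is hypothetical.
def _demo_extract_title():
    title_reg = r'(第?\s*[一二两三四五六七八九十○零百千万亿0-91234567890]{1,6}\s*[章回卷节折篇幕集]\s*.*?)[_,-]'
    sample = '第一百二十三章 风起_某某小说网'     # hypothetical page title
    print(re.findall(title_reg, sample, re.I))  # ['第一百二十三章 风起']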
async def get_the_latest_chapter(chapter_url, loop=None, timeout=15):
    try:
        # async_timeout 3.x accepts the plain "with" form used here; newer
        # releases require "async with".
        with async_timeout.timeout(timeout):
            url = parse_qs(urlparse(chapter_url).query).get('url', '')
            novels_name = parse_qs(urlparse(chapter_url).query).get('novels_name', '')
            data = None
            if url and novels_name:
                url = url[0]
                novels_name = novels_name[0]
                netloc = urlparse(url).netloc
                if netloc in LATEST_RULES.keys():
                    async with aiohttp.ClientSession(loop=loop) as client:
                        headers = {'user-agent': await get_random_user_agent()}
                        try:
                            html = await target_fetch(client=client, url=url, headers=headers, timeout=timeout)
                            if html is None:
                                html = get_html_by_requests(url=url, headers=headers, timeout=timeout)
                        except TypeError:
                            html = get_html_by_requests(url=url, headers=headers, timeout=timeout)
                        except Exception as e:
                            LOGGER.exception(e)
                            return None
                        try:
                            soup = BeautifulSoup(html, 'html5lib')
                        except Exception as e:
                            LOGGER.exception(e)
                            return None
                        latest_chapter_name, latest_chapter_url = None, None
                        if LATEST_RULES[netloc].plan:
                            # Plan A: read the latest chapter from <meta> tags.
                            meta_value = LATEST_RULES[netloc].meta_value
                            latest_chapter_name = soup.select(
                                'meta[property="{0}"]'.format(meta_value["latest_chapter_name"]))
                            latest_chapter_name = latest_chapter_name[0].get(
                                'content', None) if latest_chapter_name else None
                            latest_chapter_url = soup.select(
                                'meta[property="{0}"]'.format(meta_value["latest_chapter_url"]))
                            latest_chapter_url = urljoin(
                                url, latest_chapter_url[0].get('content', None)) if latest_chapter_url else None
                        else:
                            # Plan B: scrape the latest chapter with a per-site selector.
                            selector = LATEST_RULES[netloc].selector
                            content_url = selector.get('content_url')
                            if selector.get('id', None):
                                latest_chapter_soup = soup.find_all(id=selector['id'])
                            elif selector.get('class', None):
                                latest_chapter_soup = soup.find_all(class_=selector['class'])
                            else:
                                latest_chapter_soup = soup.select(selector.get('tag'))
                            if latest_chapter_soup:
                                if content_url == '1':
                                    # TODO
                                    pass
                                elif content_url == '0':
                                    # TODO
                                    pass
                                else:
                                    latest_chapter_url = content_url + latest_chapter_soup[0].get('href', None)
                                    latest_chapter_name = latest_chapter_soup[0].get('title', None)
                        if latest_chapter_name and latest_chapter_url:
                            time_current = get_time()
                            data = {
                                "latest_chapter_name": latest_chapter_name,
                                "latest_chapter_url": latest_chapter_url,
                                "owllook_chapter_url": chapter_url,
                                "owllook_content_url":
                                    "/owllook_content?url={latest_chapter_url}&name={name}"
                                    "&chapter_url={chapter_url}&novels_name={novels_name}".format(
                                        latest_chapter_url=latest_chapter_url,
                                        name=latest_chapter_name,
                                        chapter_url=url,
                                        novels_name=novels_name,
                                    ),
                            }
                            # Persist the latest chapter so later checks can diff against it.
                            motor_db = MotorBase().get_db()
                            await motor_db.latest_chapter.update_one(
                                {"novels_name": novels_name, 'owllook_chapter_url': chapter_url},
                                {'$set': {'data': data, "finished_at": time_current}},
                                upsert=True)
            return data
    except Exception as e:
        LOGGER.exception(e)
        return None
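
# Hypothetical driver sketch for get_the_latest_chapter. The chapter_url below
# is illustrative: it just mirrors the URL-encoded "url" and "novels_name" query
# parameters the function parses, and the site would need a LATEST_RULES entry.
def _demo_latest_chapter():
    import asyncio
    loop = asyncio.get_event_loop()
    chapter_url = ('/chapter?url=http%3A%2F%2Fwww.example.com%2Fbook%2F1%2F'
                   '&novels_name=example')
    data = loop.run_until_complete(get_the_latest_chapter(chapter_url, loop=loop))
    print(data)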