Exemplo n.º 1
0
def async_get(event_loop):
    """Return a zero-argument coroutine factory that GETs the local fixture page.

    The AsyncHTMLSession is deliberately built inside this fixture rather
    than at module scope: a module-level session would capture a different
    event loop from the one pytest-asyncio supplies for the test.
    """
    session = AsyncHTMLSession()
    session.mount('file://', FileAdapter())
    here = os.path.dirname(os.path.abspath(__file__))
    page_path = os.path.sep.join((here, 'python.html'))
    target = 'file://{}'.format(page_path)
    return partial(session.get, target)
def async_get(event_loop):
    """Yield a partial that fetches the bundled ``python.html`` over file://.

    Creating the AsyncHTMLSession here (per fixture invocation) keeps it on
    the pytest-asyncio event loop instead of a private, mismatched loop.
    """
    fixture_dir = os.path.dirname(os.path.abspath(__file__))
    url = 'file://{}'.format(os.path.sep.join((fixture_dir, 'python.html')))

    session = AsyncHTMLSession()
    session.mount('file://', FileAdapter())
    return partial(session.get, url)
Exemplo n.º 3
0
    async def __get_vm(self, stop: str) -> HTMLResponse:
        """Fetch the virtual-monitor page for *stop*, rotating proxies on failure.

        Builds a fresh AsyncHTMLSession with retrying adapters, then loops
        over proxies from ``self.__get_next_proxy()`` until a request
        succeeds.

        Args:
            stop: Stop identifier interpolated into the query string.

        Returns:
            The successful HTMLResponse.

        Raises:
            Exception: If only one proxy is configured and it fails — there
                is nothing left to rotate to.
        """
        url = '{}/?przystanek={}'.format(Scrapper.VM_URL, stop)
        session = AsyncHTMLSession()
        adapter = adapters.HTTPAdapter(max_retries=5)
        session.mount('http://', adapter)
        session.mount('https://', adapter)

        # try/finally guarantees the session is closed even when we give up
        # and raise below (the original leaked it on that path).
        try:
            result = None
            while not isinstance(result, HTMLResponse):
                proxy = self.__get_next_proxy()
                try:
                    result = await session.get(url,
                                               proxies=Scrapper.__proxies(proxy),
                                               timeout=Scrapper.GET_TIMEOUT)
                except RequestException:
                    # With a single (failing) proxy, rotation cannot help.
                    if len(self.proxies) == 1:
                        raise Exception('no working proxy available')
                    # Otherwise fall through and retry with the next proxy.
        finally:
            await session.close()
        return result
Exemplo n.º 4
0
import json
import os
import re
from dataclasses import dataclass
from itertools import chain
from multiprocessing import cpu_count, Pool, Manager, Queue, TimeoutError
from urllib.parse import urljoin
from urllib.parse import urlparse

import requests
from requests_html import AsyncHTMLSession, HTML

from site_feature import SiteFeatureTransformer
from utils import PASS_DOMAIN, geuss_link_url, rm_slash, has_url_html_been_fetched

# Module-wide async session shared by all fetch helpers below.  A single
# HTTPAdapter with enlarged connection pools (100 connections / 100 per host)
# is mounted for both schemes so many concurrent requests can reuse
# connections instead of opening a new one each time.
asession = AsyncHTMLSession()
adapter = requests.adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
asession.mount('http://', adapter)
asession.mount('https://', adapter)


def save_html(domain, html):
    """Persist *html* to ``html/<domain>.html``.

    Args:
        domain: Site domain, used as the file stem (assumed filesystem-safe
            — TODO confirm callers never pass path separators).
        html: Page markup to write.
    """
    path = os.path.join('html', f'{domain}.html')
    # Explicit UTF-8 so the written file does not depend on the platform's
    # locale encoding (web pages are overwhelmingly UTF-8).
    with open(path, 'w', encoding='utf-8') as f:
        f.write(html)


def get_data(urls, is_zh_i9t_blog=False):
    res = get_frineds_and_res(urls, is_zh_i9t_blog)
    data = []
    for url, friends, r in res:
        site_feature = SiteFeatureTransformer(r=r,
                                              url=url,