Example #1
    async def __http_request__async(self, url, session):
        """ Sends asynchronous http request to URL address and scrapes the webpage. """
        try:
            while True:
                async with session.get(
                        url, headers={'User-Agent':
                                      generate_user_agent()}) as response:
                    page_html = await response.read()

                if page_html.startswith(b'Too many requests'):
                    await asyncio.sleep(0.01)
                    continue
                break

            if self.cssselect is True:
                return self.scrape_function(html.fromstring(page_html),
                                            url=url,
                                            *self.arguments)
            else:
                return self.scrape_function(page_html,
                                            url=url,
                                            *self.arguments)

        except (asyncio.TimeoutError, requests.exceptions.Timeout):
            raise ConnectionTimeout(url)
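
A minimal sketch of how the coroutine above might be fanned out over several URLs from within the same (hypothetical) class, assuming aiohttp and asyncio are imported at module level; the scrape_all name is illustrative and not part of the source.

    async def scrape_all(self, urls):
        """ Runs the request coroutine concurrently for every URL in one shared session. """
        async with aiohttp.ClientSession() as session:
            tasks = [self.__http_request__async(url, session) for url in urls]
            return await asyncio.gather(*tasks)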
Example #2
def http_request_get(url, session=None, payload=None, parse=True):
    """ Sends a GET HTTP request to a website and returns its HTML content and full url address. """

    if payload is None:
        payload = {}

    try:
        if session:
            content = session.get(
                url,
                params=payload,
                verify=False,
                headers={'User-Agent': generate_user_agent()})
        else:
            content = requests.get(
                url,
                params=payload,
                verify=False,
                headers={'User-Agent': generate_user_agent()})

        content.raise_for_status()  # Raise HTTPError for bad requests (4xx or 5xx)
        if parse:
            return html.fromstring(content.text), content.url
        else:
            return content.text, content.url
    except (asyncio.TimeoutError, requests.exceptions.Timeout):
        raise ConnectionTimeout(url)
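
A minimal usage sketch for the function above, assuming it lives in a module where requests and lxml.html are importable; the URL and query parameter below are placeholders.

session = requests.Session()
tree, final_url = http_request_get('https://example.com/search',
                                   session=session,
                                   payload={'q': 'demo'})
print(final_url, tree.tag)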
Example #3
    async def __http_request__async(self, url: str, session: aiohttp.ClientSession, user_agent: str):
        """ Sends asynchronous http request to URL address and scrapes the webpage. """

        try:
            async with session.get(url, headers={'User-Agent': user_agent}) as response:
                page_html = await response.read()

                if self.css_select:
                    return self.scrape_function(html.fromstring(page_html), *self.arguments)
                return self.scrape_function(page_html, *self.arguments)
        except (asyncio.TimeoutError, requests.exceptions.Timeout):
            raise ConnectionTimeout(url)
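
A hedged sketch of a companion method on the same (hypothetical) class, showing one way to build the session with an explicit timeout so the asyncio.TimeoutError branch above can actually fire; the scrape_all name and the 10-second budget are assumptions, not part of the source.

    async def scrape_all(self, urls: list, user_agent: str):
        """ Drives the request coroutine for every URL with a shared, time-limited session. """
        timeout = aiohttp.ClientTimeout(total=10)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            tasks = [self.__http_request__async(url, session, user_agent)
                     for url in urls]
            return await asyncio.gather(*tasks)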