async def __http_request__async(self, url, session):
    """ Sends an asynchronous HTTP request to the URL address and scrapes the webpage. """

    try:
        while True:
            async with session.get(
                    url,
                    headers={'User-Agent': generate_user_agent()}) as response:
                page_html = await response.read()

                if page_html.startswith(b'Too many requests'):
                    # Back off without blocking the event loop, then retry.
                    await asyncio.sleep(0.01)
                    continue
                break

        if self.cssselect is True:
            return self.scrape_function(html.fromstring(page_html),
                                        url=url, *self.arguments)
        else:
            return self.scrape_function(page_html, url=url, *self.arguments)
    except (asyncio.TimeoutError, requests.exceptions.Timeout):
        raise ConnectionTimeout(url)
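# Module-level context sketch (an assumption, not the library's exact header):
# the request helpers in this file rely on asyncio, aiohttp/requests for async
# and sync HTTP, lxml for parsing, and a generate_user_agent helper that is
# assumed to come from the third-party user_agent package. ConnectionTimeout's
# message text below is illustrative only.
import asyncio

import aiohttp
import requests
from lxml import html
from user_agent import generate_user_agent


class ConnectionTimeout(Exception):
    """ Raised when a request to a URL exceeds its timeout. """

    def __init__(self, url):
        super().__init__(f'Connection timed out while requesting {url}')
        self.url = url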
def http_request_get(url, session=None, payload=None, parse=True):
    """ Sends a GET HTTP request to a website and returns its HTML content and full url address. """

    if payload is None:
        payload = {}

    try:
        if session:
            content = session.get(
                url, params=payload, verify=False,
                headers={'User-Agent': generate_user_agent()})
        else:
            content = requests.get(
                url, params=payload, verify=False,
                headers={'User-Agent': generate_user_agent()})

        content.raise_for_status()  # Raise HTTPError for bad requests (4xx or 5xx)

        if parse:
            return html.fromstring(content.text), content.url
        else:
            return content.text, content.url
    except (asyncio.TimeoutError, requests.exceptions.Timeout):
        raise ConnectionTimeout(url)
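# Usage sketch (illustrative only): how http_request_get above is typically
# called. The URL and query parameters are placeholders, not values taken from
# this codebase.
def _example_http_request_get():
    # parse=True returns an lxml element tree plus the final resolved URL.
    tree, resolved_url = http_request_get('https://example.com/search',
                                          payload={'q': 'python'},
                                          parse=True)
    print(resolved_url)                   # URL after query encoding/redirects
    print(tree.xpath('//title/text()'))   # the lxml tree supports XPath queries

    # parse=False returns the raw HTML string instead of a parsed tree.
    raw_html, _ = http_request_get('https://example.com/search',
                                   payload={'q': 'python'},
                                   parse=False)
    return raw_html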
async def __http_request__async(self, url: str, session: aiohttp.ClientSession, user_agent: str):
    """ Sends an asynchronous HTTP request to the URL address and scrapes the webpage. """

    try:
        async with session.get(url, headers={'User-Agent': user_agent}) as response:
            page_html = await response.read()

            if self.css_select:
                return self.scrape_function(html.fromstring(page_html), *self.arguments)
            return self.scrape_function(page_html, *self.arguments)
    except (asyncio.TimeoutError, requests.exceptions.Timeout):
        raise ConnectionTimeout(url)
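# Driver sketch (illustrative only): how a coroutine like __http_request__async
# is usually scheduled, with one shared aiohttp.ClientSession and asyncio.gather
# over the URLs. The standalone _fetch_page below mirrors the request/parse
# pattern of the method above; it is not this library's actual entry point.
async def _fetch_page(session, url, user_agent):
    async with session.get(url, headers={'User-Agent': user_agent}) as response:
        page_html = await response.read()
    return html.fromstring(page_html)


async def _fetch_all(urls, user_agent):
    async with aiohttp.ClientSession() as session:
        # Schedule every request concurrently and wait for all pages.
        return await asyncio.gather(
            *(_fetch_page(session, url, user_agent) for url in urls))

# Example invocation (placeholder URL):
#   pages = asyncio.run(_fetch_all(['https://example.com'], generate_user_agent()))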