def get_session(**kwargs) -> HTMLSession:
    """Build an ``HTMLSession`` configured from keyword flags.

    Keyword Args:
        use_proxy (bool): when truthy, route http/https traffic through
            the ``rproxy:5566`` proxy.
        default_header (bool): when falsy (or omitted), replace the
            session headers with the project's ``get_headers()`` result.

    Returns:
        HTMLSession: the configured session.
    """
    session = HTMLSession()
    # .get() avoids the KeyError the original raised when a flag was omitted
    if kwargs.get('use_proxy'):
        session.proxies = {'http': 'rproxy:5566', 'https': 'rproxy:5566'}
    if not kwargs.get('default_header'):
        session.headers = get_headers()
    return session
def __init__(self, **kwargs):
    """Base class for common scraping tasks.

    Keyword Args:
        delay (int): seconds between requests (default 2).
        expire_hours (int): cache expiry horizon in hours (default 168).
        cookies: cookie jar to attach to the session.
        headers (dict): request headers to attach to the session.
        proxies (dict): proxy mapping to attach to the session.
        cache_name (str): cache file name or path; a bare name is placed
            under /tmp (default 'scraper-cache').
    """
    logging.getLogger(__name__).addHandler(logging.NullHandler())
    self.urls = []

    # use requests-html to aid parsing;
    # has all the same methods as requests.Session
    _s = HTMLSession()

    # delay/expire
    self.delay = kwargs.get('delay') or 2
    self.expire_hours = kwargs.get('expire_hours') or 168

    # add cookies: the py2 `cookielib` fallback was dead code on py3,
    # where the module is http.cookiejar
    if kwargs.get('cookies'):
        _s.cookies = kwargs['cookies']
    else:
        import http.cookiejar
        _s.cookies = http.cookiejar.MozillaCookieJar()

    # add headers
    if kwargs.get('headers'):
        _s.headers = kwargs['headers']
    else:
        ua = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
        _s.headers = {'User-Agent': ua}

    # add proxies
    if kwargs.get('proxies'):
        _s.proxies = kwargs['proxies']

    # add cache: the original raised KeyError when cache_name was omitted
    # and left self.cache_name unset when the name contained '/'
    cache_name = kwargs.get('cache_name') or 'scraper-cache'
    if '/' not in cache_name:
        cache_name = os.path.join('/tmp', cache_name)
    self.cache_name = cache_name
    try:
        from cachecontrol import CacheControlAdapter
        from cachecontrol.heuristics import ExpiresAfter
        from cachecontrol.caches import FileCache
        _s.mount('http://',
                 CacheControlAdapter(
                     cache=FileCache(self.cache_name),
                     cache_etags=False,
                     heuristic=ExpiresAfter(hours=self.expire_hours)))
    except ImportError:
        try:
            import requests_cache
            requests_cache.install_cache(self.cache_name)
        except Exception:
            # narrowed from a bare except, which also swallowed
            # KeyboardInterrupt/SystemExit
            logging.exception('could not install cache')
    self.s = _s
def __init__(self, **kwargs): """ """ logging.getLogger(__name__).addHandler(logging.NullHandler()) self.urls = [] # use requests HTML to aid parsing # has all same methods as requests.Session _s = HTMLSession() self.delay = kwargs.get("delay", 2) self.expire_hours = kwargs.get("expire_hours", 168) # add cookies if kwargs.get("cookies"): _s.cookies = kwargs["cookies"] else: import http.cookiejar _s.cookies = http.cookiejar.MozillaCookieJar() # add headers default_headers = { "User-Agent": random.choice(USER_AGENTS), "accept-encoding": "gzip, deflate, br", "accept-language": "en-US,en;q=0.9", "accept": "application/json, text/plain, */*", } _s.headers.update(default_headers) if kwargs.get("headers"): _s.headers.update(kwargs["headers"]) # add proxies if kwargs.get("proxies"): _s.proxies = kwargs["proxies"] # add cache if not kwargs.get("cache_name"): self.cache_name = os.path.join("/tmp", random_string(32)) elif "/" not in kwargs.get("cache_name", ""): self.cache_name = os.path.join("/tmp", kwargs["cache_name"]) else: self.cache_name = kwargs.get("cache_name") try: from cachecontrol import CacheControlAdapter from cachecontrol.heuristics import ExpiresAfter from cachecontrol.caches import FileCache _s.mount( "http://", CacheControlAdapter( cache=FileCache(self.cache_name), cache_etags=False, heuristic=ExpiresAfter(hours=self.expire_hours), ), ) except ImportError: try: import requests_cache requests_cache.install_cache(self.cache_name) except BaseException: logging.exception("could not install cache") self.session = _s
def auth_html(self, order_id: str):
    """Fetch and JavaScript-render the login page.

    Retries up to ``self.number_attempts`` times when the target answers
    HTTP 403, rotating the proxy via the system API first when
    ``self.is_update_proxy`` is set.

    :param order_id: order identifier included in task reports
    :return: dict describing the outcome; on success ``page_content``
        holds the rendered HTML
    """
    count: int = 0
    session = HTMLSession()
    session.proxies = self.proxy_worker.get_proxy_dict()
    session.headers = self.headers_work.get_headers()
    cookies = self.cookies_work.get_cookies()
    try:
        while count < self.number_attempts:
            try:
                response = session.get(settings.LOGIN_PAGE, cookies=cookies)
                # execute page JS so dynamic markup ends up in the HTML
                response.html.render()
                data = response.html.html
            except requests.exceptions.ConnectionError as error:
                self._send_task_report("target_connect_error", data={
                    "message": repr(error),
                    "code": '',
                    "order": order_id
                })
                return {
                    "status": False,
                    "error": True,
                    "status_code": '0',
                    "message": repr(error),
                    "type_res": "request_module",
                    "proxy": (self.proxy_worker.get_proxy_id(),
                              self.proxy_worker.get_proxy_dict())
                }
            try:
                response.raise_for_status()
            except requests.HTTPError as error:
                if response.status_code == 403:
                    if self.is_update_proxy:
                        # ask the system API for a fresh proxy before retrying
                        proxy = self.api_worker.update_proxy(
                            self.proxy_worker.get_proxy_id())
                        if proxy:
                            self.proxy_worker.set_proxy_data(
                                proxy[1], proxy[0])
                            session.proxies = \
                                self.proxy_worker.get_proxy_dict()
                    count += 1
                    time.sleep(config.DELAY_REQUESTS)
                    self._send_task_report("main_content_error", data={
                        "message": repr(error),
                        "code": str(response.status_code),
                        "order": order_id
                    })
                    continue
                # non-403 HTTP error: report and give up
                self._send_task_report("main_content_error", data={
                    "message": repr(error),
                    "code": str(response.status_code),
                    "order": order_id
                })
                return {
                    "status": False,
                    "error": True,
                    "status_code": str(response.status_code),
                    "message": repr(error),
                    "type_res": "request_module",
                    "proxy": (self.proxy_worker.get_proxy_id(),
                              self.proxy_worker.get_proxy_dict())
                }
            except requests.exceptions.RequestException as error:
                self._send_task_report("main_content_error", data={
                    "message": repr(error),
                    "code": str(response.status_code),
                    "order": order_id
                })
                return {
                    "status": False,
                    "error": True,
                    "status_code": str(response.status_code),
                    "message": repr(error),
                    "type_res": "request_module",
                    "proxy": (self.proxy_worker.get_proxy_id(),
                              self.proxy_worker.get_proxy_dict())
                }
            # success: hand back the rendered page
            return {
                "status": True,
                "error": False,
                "status_code": str(response.status_code),
                "page_content": data,
                "type_res": "request_module",
                "proxy": (self.proxy_worker.get_proxy_id(),
                          self.proxy_worker.get_proxy_dict())
            }
        # retries exhausted without a non-403 answer
        return {
            "status": False,
            "error": True,
            "status_code": "403",
            "message": "Perhaps the proxy server did not respond in time. 403 HTTPError",
            "type_res": "request_module",
            "proxy": (self.proxy_worker.get_proxy_id(),
                      self.proxy_worker.get_proxy_dict())
        }
    finally:
        # original never closed the session (unlike get_content);
        # release its connections on every exit path
        session.close()
def get_content(self, link: str, order_id: str):
    """Request page content for a given link.

    If the request status is 403, it requests an updated proxy server
    from the system api (when ``self.is_update_proxy`` is set) and
    retries up to ``self.number_attempts`` times.

    :param order_id: order identifier included in task reports
    :param link: URL to fetch
    :return: dict describing the outcome; on success ``message`` holds
        the response body text
    """
    count: int = 0
    session = HTMLSession()
    session.proxies = self.proxy_worker.get_proxy_dict()
    session.headers = settings.LOGIN_HEADERS
    cookies = self.cookies_work.get_cookies()
    try:
        while count < self.number_attempts:
            try:
                response = session.get(
                    link,
                    timeout=(config.REQUEST_TIMEOUT,
                             config.RESPONSE_TIMEOUT),
                    cookies=cookies)
            except requests.exceptions.ConnectionError as error:
                self._send_task_report("target_connect_error", data={
                    "message": repr(error),
                    "code": '',
                    "order": order_id
                })
                return {
                    "status": False,
                    "error": True,
                    "status_code": '0',
                    "message": repr(error),
                    "type_res": "request_module",
                    "proxy": (self.proxy_worker.get_proxy_id(),
                              self.proxy_worker.get_proxy_dict())
                }
            try:
                response.raise_for_status()
            except requests.HTTPError as error:
                if response.status_code == 403:
                    if self.is_update_proxy:
                        # ask the system API for a fresh proxy before retrying
                        proxy = self.api_worker.update_proxy(
                            self.proxy_worker.get_proxy_id())
                        if proxy:
                            self.proxy_worker.set_proxy_data(
                                proxy[1], proxy[0])
                            session.proxies = \
                                self.proxy_worker.get_proxy_dict()
                    count += 1
                    time.sleep(config.DELAY_REQUESTS)
                    self._send_task_report("main_content_error", data={
                        "message": repr(error),
                        "code": str(response.status_code),
                        "order": order_id
                    })
                    continue
                # non-403 HTTP error: report and give up
                self._send_task_report("main_content_error", data={
                    "message": repr(error),
                    "code": str(response.status_code),
                    "order": order_id
                })
                return {
                    "status": False,
                    "error": True,
                    "status_code": str(response.status_code),
                    "message": repr(error),
                    "type_res": "request_module",
                    "proxy": (self.proxy_worker.get_proxy_id(),
                              self.proxy_worker.get_proxy_dict())
                }
            except requests.exceptions.RequestException as error:
                self._send_task_report("main_content_error", data={
                    "message": repr(error),
                    "code": str(response.status_code),
                    "order": order_id
                })
                return {
                    "status": False,
                    "error": True,
                    "status_code": str(response.status_code),
                    "message": repr(error),
                    "type_res": "request_module",
                    "proxy": (self.proxy_worker.get_proxy_id(),
                              self.proxy_worker.get_proxy_dict())
                }
            # success: hand back the response body
            return {
                "status": True,
                "error": False,
                "status_code": str(response.status_code),
                "message": response.text,
                "type_res": "request_module",
                "proxy": (self.proxy_worker.get_proxy_id(),
                          self.proxy_worker.get_proxy_dict())
            }
        # retries exhausted without a non-403 answer
        return {
            "status": False,
            "error": True,
            "status_code": "403",
            "message": "Perhaps the proxy server did not respond in time. 403 HTTPError",
            "type_res": "request_module",
            "proxy": (self.proxy_worker.get_proxy_id(),
                      self.proxy_worker.get_proxy_dict())
        }
    finally:
        # original closed the session inside the loop right after the
        # first request, so 403 retries reused a closed session; close
        # once, on every exit path
        session.close()