def __blacklist_request_if_needed(self, request: Request) -> Request:
    """If request's URL matches the configured blacklist pattern, rewrite the request to a blacklisted URL."""
    # FIXME there should be a better way to block those unwanted requests
    if request is None:
        raise McRequestException("Request is None.")

    url = request.url()
    if url is None:
        raise McRequestException("URL is None.")
    if not len(url):
        raise McRequestException("URL is empty.")

    pattern = self._user_agent_config.blacklist_url_pattern()
    if pattern:

        # MC_REWRITE_TO_PYTHON: a string might be coming from Perl
        if isinstance(pattern, bytes):
            pattern = decode_object_from_bytes_if_needed(pattern)
        if isinstance(pattern, str):
            pattern = re.compile(pattern, flags=re.IGNORECASE | re.UNICODE)

        if re.search(pattern=pattern, string=url) is not None:
            # Point the request at an unroutable placeholder host so the fetch cannot succeed
            request.set_url("http://0.0.0.1/%s" % url)

    return request
def __blacklist_request_if_needed(request: Request) -> Request:
    """If request's URL is blacklisted, update the request to point to a blacklisted URL."""
    # FIXME there should be a better way to block those unwanted requests
    if request is None:
        raise McRequestException("Request is None.")

    url = request.url()
    if url is None:
        raise McRequestException("URL is None.")
    if not len(url):
        raise McRequestException("URL is empty.")

    config = py_get_config()
    blacklist_pattern = config['mediawords'].get('blacklist_url_pattern', None)

    if blacklist_pattern:
        if re.search(pattern=blacklist_pattern, string=url, flags=re.IGNORECASE | re.UNICODE):
            # Redirect the request to a host that will never resolve to real content
            request.set_url("http://blacklistedsite.localhost/%s" % url)

    return request
def target_request_from_alarabiya_url(content: str, archive_site_url: str) -> Union[Request, None]:
    """alarabiya uses an interstitial that requires JavaScript.

    If the download URL matches alarabiya and returns the 'requires JavaScript' page, manually parse out the
    necessary cookie and add it to the $ua so that the request will work."""
    content = decode_object_from_bytes_if_needed(content)
    archive_site_url = decode_object_from_bytes_if_needed(archive_site_url)

    if not is_http_url(archive_site_url):
        log.error("Archive site URL is not HTTP(s): %s" % archive_site_url)
        return None

    if content is None:
        return None

    # Only applies to alarabiya URLs that served the JavaScript interstitial
    if not re.search(pattern='alarabiya', string=archive_site_url, flags=re.IGNORECASE):
        return None
    if not re.search(pattern='This site requires JavaScript and Cookies to be enabled',
                     string=content,
                     flags=re.IGNORECASE):
        return None

    cookie_match = re.search(
        pattern=r"setCookie\('(?P<cookie_name>[^']+)', '(?P<cookie_value>[^']+)'",
        string=content,
        flags=re.IGNORECASE,
    )
    if not cookie_match:
        log.warning("Unable to parse cookie from alarabiya URL %s: %s" % (archive_site_url, content,))
        return None

    # Re-request the same URL, but carrying the cookie the interstitial would have set
    request = Request(method='GET', url=archive_site_url)
    request.set_header(
        name='Cookie',
        value="%s=%s" % (cookie_match.group('cookie_name'), cookie_match.group('cookie_value'),),
    )
    return request
def request(self, request: Request) -> Response:
    """
    Execute domain throttled version of mediawords.util.web.user_agent.UserAgent.request.

    Before executing the request, the method will check whether a request has been made for this domain within
    the last self.domain_timeout seconds. If so, the call will raise a McThrottledDomainException. Otherwise,
    the method will mark the time for this domain request in a postgres table and then execute
    UserAgent.request().

    The throttling routine will not be applied after the first successful request, to allow for redirects and
    other followup requests to succeed. To ensure proper throttling, a new object should be create for each top
    level request.

    Accelerated domains and shortened links (eg. http://bit.ly/EFGDfrTg) get their timeout divided by
    _ACCELERATED_DOMAIN_SPEEDUP_FACTOR.
    """
    if self._use_throttling:
        domain = mediawords.util.url.get_url_distinctive_domain(request.url())

        domain_timeout = self.domain_timeout

        # Shortened links and accelerated domains get a reduced timeout, but never below 1 second
        if domain_timeout > 1 and (is_shortened_url(request.url()) or domain in _ACCELERATED_DOMAINS):
            domain_timeout = max(1, int(self.domain_timeout / _ACCELERATED_DOMAIN_SPEEDUP_FACTOR))

        # this postgres function returns true if we are allowed to make the request and false otherwise. this
        # function does not use a table lock, so some extra requests might sneak through, but that's better than
        # dealing with a lock. we use a postgres function to make the the race condition as rare as possible.
        got_domain_lock = self.db.query(
            "select get_domain_web_requests_lock(%s, %s)",
            (domain, domain_timeout)).flat()[0]

        log.debug("domain lock obtained for %s: %s" % (str(request.url()), str(got_domain_lock)))

        if not got_domain_lock:
            raise McThrottledDomainException("domain " + str(domain) + " is locked.")

    else:
        log.debug("domain lock obtained for %s: skipped" % str(request.url()))

    # Throttle only the first request made through this object; followups (redirects etc.) pass straight through
    self._use_throttling = False

    return super().request(request)
def get_seeded_content(db: DatabaseHandler, topic_fetch_url: dict) -> typing.Optional[Response]:
    """Return content for this url and topic in topic_seed_urls.

    Arguments:
    db - db handle
    topic_fetch_url - topic_fetch_url dict from db

    Returns:
    dummy Response object carrying the seeded content, or None if no content is stored for this topic + URL

    """
    # Fixed the return annotation: the function returns a Response (or None), not a str
    r = db.query(
        "select content from topic_seed_urls where topics_id = %(a)s and url = %(b)s and content is not null",
        {'a': topic_fetch_url['topics_id'], 'b': topic_fetch_url['url']}).flat()

    if not r:
        return None

    # Fake a successful HTTP response that carries the stored content
    response = Response(code=200, message='OK', headers={}, data=r[0])
    response.set_request(Request('GET', topic_fetch_url['url']))

    return response
def request(self, request: Request) -> Response:
    """
    Execute domain throttled version of mediawords.util.web.user_agent.UserAgent.request.

    Before executing the request, the method will check whether a request has been made for this domain within
    the last self.domain_timeout seconds. If so, the call will raise a McThrottledDomainException. Otherwise,
    the method will mark the time for this domain request in a postgres table and then execute
    UserAgent.request().

    The throttling routine will not be applied after the first successful request, to allow for redirects and
    other followup requests to succeed. To ensure proper throttling, a new object should be create for each top
    level request.

    Accelerated domains and shortened links (eg. http://bit.ly/EFGDfrTg) get their timeout divided by
    _ACCELERATED_DOMAIN_SPEEDUP_FACTOR.
    """
    if self._use_throttling:
        domain = mediawords.util.url.get_url_distinctive_domain(request.url())

        domain_timeout = self.domain_timeout

        # Shortened links and accelerated domains are throttled less aggressively (floor of 1 second)
        if domain_timeout > 1 and (is_shortened_url(request.url()) or domain in _ACCELERATED_DOMAINS):
            domain_timeout = max(1, int(self.domain_timeout / _ACCELERATED_DOMAIN_SPEEDUP_FACTOR))

        # this postgres function returns true if we are allowed to make the request and false otherwise. this
        # function does not use a table lock, so some extra requests might sneak through, but that's better than
        # dealing with a lock. we use a postgres function to make the the race condition as rare as possible.
        got_domain_lock = self.db.query(
            "select get_domain_web_requests_lock(%s, %s)",
            (domain, domain_timeout)).flat()[0]

        log.debug("domain lock obtained for %s: %s" % (str(request.url()), str(got_domain_lock)))

        if not got_domain_lock:
            raise McThrottledDomainException("domain " + str(domain) + " is locked.")

    else:
        log.debug("domain lock obtained for %s: skipped" % str(request.url()))

    # Only the first request per object is throttled; subsequent calls (e.g. redirects) skip the lock
    self._use_throttling = False

    return super().request(request)
def __log_request(request: Request) -> None: """Log HTTP request.""" # FIXME use Python's logging facilities if request is None: raise McRequestException("Request is None.") url = request.url() if url is None: raise McRequestException("URL is None.") if len(url) == 0: raise McRequestException("URL is empty.") config = py_get_config() http_request_log_path = os.path.join(config['mediawords']['data_dir'], 'logs', 'http_request.log') with open(http_request_log_path, 'a') as f: while True: try: fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB) break except IOError as e: # raise on unrelated IOErrors if e.errno != errno.EAGAIN: raise else: log.warning("Waiting for HTTP request log lock...") time.sleep(0.1) f.write("%s %s\n" % ( sql_now(), url, )) # Doesn't write "invalidating blacklist url <...> because it's apparent from the URL itself fcntl.flock(f, fcntl.LOCK_UN) # Processes from various users (web service, workers, ...) will want to write to the same file try: os.chmod(http_request_log_path, 0o666) except PermissionError as ex: # Web server process might attempt at chmodding the file without the appropriate permissions log.debug("Failed to chmod %s: %s" % ( http_request_log_path, str(ex), )) pass
def __log_request(request: Request) -> None:
    """Write a debug-level log entry for an outgoing HTTP request."""
    # FIXME use Python's logging facilities
    if request is None:
        raise McRequestException("Request is None.")

    request_url = request.url()
    if request_url is None:
        raise McRequestException("URL is None.")
    if not len(request_url):
        raise McRequestException("URL is empty.")

    log.debug("HTTP request: %s %s\n" % (sql_now(), request_url,))
def __blacklist_request_if_needed(request: Request) -> Request:
    """If request's URL is blacklisted, update the request to point to a blacklisted URL."""
    # FIXME there should be a better way to block those unwanted requests
    if request is None:
        raise McRequestException("Request is None.")

    url = request.url()
    if url is None:
        raise McRequestException("URL is None.")
    if not len(url):
        raise McRequestException("URL is empty.")

    mediawords_config = py_get_config()['mediawords']
    pattern = mediawords_config.get('blacklist_url_pattern', None)

    if pattern:
        if re.search(pattern=pattern, string=url, flags=re.IGNORECASE | re.UNICODE) is not None:
            # Rewrite to an unroutable placeholder host so the fetch can never succeed
            request.set_url("http://0.0.0.1/%s" % url)

    return request
def target_request_from_meta_refresh_url(content: str, archive_site_url: str) -> Union[Request, None]:
    """Given a URL and content from website with META refresh, return a request for the original URL."""
    content = decode_object_from_bytes_if_needed(content)
    archive_site_url = decode_object_from_bytes_if_needed(archive_site_url)

    if content is None:
        return None

    # Let the HTML helper resolve the META refresh target against the page's base URL
    target_url = meta_refresh_url_from_html(html=content, base_url=archive_site_url)

    return None if target_url is None else Request(method='GET', url=target_url)
def __prepare_request(self, request: Request) -> requests.PreparedRequest:
    """Build a requests.PreparedRequest out of UserAgent's Request.

    Raises if one or more parameters are invalid."""
    method = request.method()
    if method is None:
        raise McRequestException("Request's method is None.")

    url = request.url()
    if url is None:
        raise McRequestException("Request's URL is None.")

    headers = request.headers()
    if headers is None:
        raise McRequestException("Request's headers is None.")

    auth_username = request.auth_username()
    auth_password = request.auth_password()

    # Username and password must be either both set or both unset (XOR)
    if (auth_username is None) != (auth_password is None):
        raise McRequestException("Either both or none of HTTP authentication credentials must be not None.")

    auth = None
    if auth_username is not None and auth_password is not None:
        # Likewise, either both or neither may be empty strings
        if (len(auth_username) == 0) != (len(auth_password) == 0):
            raise McRequestException("Either both or none of HTTP authentication credentials must be not Empty.")
        auth = HTTPBasicAuth(auth_username, auth_password)

    data = request.content()

    try:
        unprepared = requests.Request(
            method=method,
            url=url,
            data=data,
            headers=headers,
            auth=auth,
        )
        prepared = self.__session.prepare_request(unprepared)
    except Exception as ex:
        raise McRequestException("Unable to prepare request %s: %s" % (str(request), str(ex),))

    return prepared
def get(self, url: str) -> Response:
    """GET an URL."""
    url = decode_object_from_bytes_if_needed(url)

    if url is None:
        raise McGetException("URL is None.")

    url = fix_common_url_mistakes(url)
    if not is_http_url(url):
        raise McGetException("URL is not HTTP(s): %s" % url)

    # Add HTTP authentication credentials to the URL if configured for this host
    url = self.__url_with_http_auth(url=url)

    return self.request(Request(method='GET', url=url))
def __log_request(request: Request) -> None: """Log HTTP request.""" # FIXME use Python's logging facilities if request is None: raise McRequestException("Request is None.") url = request.url() if url is None: raise McRequestException("URL is None.") if len(url) == 0: raise McRequestException("URL is empty.") config = py_get_config() http_request_log_path = os.path.join(config['mediawords']['data_dir'], 'logs', 'http_request.log') with open(http_request_log_path, encoding='utf-8', mode='a') as f: while True: try: fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB) break except IOError as e: # raise on unrelated IOErrors if e.errno != errno.EAGAIN: raise else: log.warning("Waiting for HTTP request log lock...") time.sleep(0.1) f.write("%s %s\n" % (sql_now(), url,)) # Doesn't write "invalidating blacklist url <...> because it's apparent from the URL itself fcntl.flock(f, fcntl.LOCK_UN) # Processes from various users (web service, workers, ...) will want to write to the same file try: os.chmod(http_request_log_path, 0o666) except PermissionError as ex: # Web server process might attempt at chmodding the file without the appropriate permissions log.debug("Failed to chmod %s: %s" % (http_request_log_path, str(ex),)) pass
def target_request_from_archive_is_url(content: str, archive_site_url: str) -> Union[Request, None]:
    """Given a URL and content from archive.is, return a request for the original URL."""
    content = decode_object_from_bytes_if_needed(content)
    archive_site_url = decode_object_from_bytes_if_needed(archive_site_url)

    if content is None:
        return None

    # Not an archive.is URL at all -- nothing to do
    if not re.match(pattern=r'^https?://archive\.is/(.+?)$', string=archive_site_url, flags=re.IGNORECASE):
        return None

    canonical_link = link_canonical_url_from_html(html=content)
    if canonical_link is None:
        log.error("Unable to parse original URL from archive.is response '%s'" % archive_site_url)
        return None

    # The canonical link embeds the original URL after a numeric snapshot ID
    matches = re.match(
        pattern=r'^https?://archive\.is/\d+?/(?P<target_url>https?://.+?)$',
        string=canonical_link,
        flags=re.IGNORECASE,
    )
    if not matches:
        log.error("Unable to parse original URL from archive.is response '%s': %s" % (
            archive_site_url,
            canonical_link,
        ))
        return None

    target_url = matches.group('target_url')
    if not is_http_url(target_url):
        log.error("URL matched, but is not HTTP(s): %s" % target_url)
        return None

    return Request(method='GET', url=target_url)
def target_request_from_archive_org_url(content: Union[str, None], archive_site_url: str) -> Union[Request, None]:
    """Given a URL and content from archive.org, return a request for the original URL."""
    content = decode_object_from_bytes_if_needed(content)
    archive_site_url = decode_object_from_bytes_if_needed(archive_site_url)

    # Wayback Machine URLs embed the original URL after an optional numeric date component
    wayback_match = re.match(
        pattern=r'^https?://web\.archive\.org/web/(?P<date>\d+?/)?(?P<target_url>https?://.+?)$',
        string=archive_site_url,
        flags=re.IGNORECASE,
    )
    if not wayback_match:
        return None

    target_url = wayback_match.group('target_url')
    if not is_http_url(target_url):
        log.error("URL matched, but is not HTTP(s): %s" % target_url)
        return None

    return Request(method='GET', url=target_url)
def request(self, request: Request) -> Response:
    """
    Execute domain throttled version of mediawords.util.web.user_agent.UserAgent.request.

    Before executing the request, the method will check whether a request has been made for this domain within
    the last self.domain_timeout seconds. If so, the call will raise a McThrottledUserAgentTimeoutException.
    Otherwise, the method will mark the time for this domain request in a postgres table and then execute
    UserAgent.request().
    """
    domain = mediawords.util.url.get_url_distinctive_domain(request.url())

    # this postgres function returns true if we are allowed to make the request and false otherwise.
    # this function does not use a table lock, so some extra requests might sneak through, but that's better than
    # dealing with a lock. we use a postgres function to make the the race condition as rare as possible.
    lock_acquired = self.db.query(
        "select get_domain_web_requests_lock(%s, %s)",
        (domain, self.domain_timeout),
    ).flat()[0]

    if not lock_acquired:
        raise McThrottledUserAgentTimeoutException("domain " + str(domain) + " is locked.")

    return super(ThrottledUserAgent, self).request(request)
def __prepare_request(self, request: Request) -> requests.PreparedRequest:
    """Create PreparedRequest from UserAgent's Request.

    Raises if one or more parameters are invalid."""
    method = request.method()
    if method is None:
        raise McRequestException("Request's method is None.")

    url = request.url()
    if url is None:
        raise McRequestException("Request's URL is None.")

    headers = request.headers()
    if headers is None:
        raise McRequestException("Request's headers is None.")

    auth_username = request.auth_username()
    auth_password = request.auth_password()

    have_username = auth_username is not None
    have_password = auth_password is not None

    # Credentials come as a pair: neither or both
    if have_username != have_password:
        raise McRequestException("Either both or none of HTTP authentication credentials must be not None.")

    auth = None
    if have_username and have_password:
        # Same rule for emptiness: neither or both may be empty strings
        if bool(len(auth_username)) != bool(len(auth_password)):
            raise McRequestException("Either both or none of HTTP authentication credentials must be not Empty.")
        auth = HTTPBasicAuth(auth_username, auth_password)

    data = request.content()

    try:
        bare_request = requests.Request(
            method=method,
            url=url,
            data=data,
            headers=headers,
            auth=auth,
        )
        prepared_request = self.__session.prepare_request(bare_request)
    except Exception as ex:
        raise McRequestException("Unable to prepare request %s: %s" % (str(request), str(ex),))

    return prepared_request
def _get_api_key() -> str:
    """Fetch the bw api key or use the cached one.

    To get a bw api key, you have to make an api call with the user and password, but the api key only lasts for a
    year, so we just get it and then cache it in a static variable, assuming that each run time will restart at least
    once a year.

    Raises McPostsBWTwitterDataException if the token request fails or the response lacks an access token.
    """
    if hasattr(_get_api_key, "api_key"):
        return _get_api_key.api_key

    user = env_value('MC_BRANDWATCH_USER')
    password = env_value('MC_BRANDWATCH_PASSWORD')

    # Security: log the user only -- never write the password (a secret) to the logs
    log.debug(f"user: {user}")

    ua = _get_user_agent()

    url = (
        "https://api.brandwatch.com/oauth/token?username=%s&grant_type=api-password&client_id=brandwatch-api-client" %
        (quote(user)))

    request = Request(method='POST', url=url)
    request.set_content_type('application/x-www-form-urlencoded; charset=utf-8')
    request.set_content({'password': password})

    response = ua.request(request)

    if not response.is_success():
        raise McPostsBWTwitterDataException("error fetching posts: " + response.decoded_content())

    json = response.decoded_content()
    data = dict(decode_json(json))

    try:
        _get_api_key.api_key = data['access_token']
    except KeyError as ex:
        # Narrowed from a bare `except:` (which would swallow even KeyboardInterrupt) and chained the cause;
        # also fixed the "ouath" typo in the error message
        raise McPostsBWTwitterDataException("error parsing oauth response: '%s'" % json) from ex

    return _get_api_key.api_key
def request(self, request: Request) -> Response:
    """Execute a request, return a response.

    All other helpers are supposed to use request() internally as it implements max. size, callbacks, blacklisted
    URLs etc."""
    if request is None:
        raise McRequestException("Request is None.")

    # Rewrite blacklisted URLs and log the request before doing anything else
    request = self.__blacklist_request_if_needed(request=request)
    self.__log_request(request=request)

    method = request.method()
    if method is None:
        raise McRequestException("Request's method is None.")

    url = request.url()
    if url is None:
        raise McRequestException("Request's URL is None.")

    headers = request.headers()
    if headers is None:
        raise McRequestException("Request's headers is None.")

    auth_username = request.auth_username()
    auth_password = request.auth_password()

    # HTTP auth credentials must be supplied as a pair (or not at all)
    if ((auth_username is None and auth_password is not None) or (auth_username is not None and auth_password is None)):
        raise McRequestException("Either both or none of HTTP authentication credentials must be not None.")

    auth = None
    if auth_username is not None and auth_password is not None:
        if ((len(auth_username) == 0 and len(auth_password) > 0) or (len(auth_username) > 0 and len(auth_password) == 0)):
            raise McRequestException("Either both or none of HTTP authentication credentials must be not Empty.")
        auth = HTTPBasicAuth(auth_username, auth_password)

    data = request.content()

    try:
        requests_request = requests.Request(
            method=method,
            url=url,
            data=data,
            headers=headers,
            auth=auth,
        )
        requests_prepared_request = self.__session.prepare_request(requests_request)
    except Exception as ex:
        raise McRequestException("Unable to prepare request %s: %s" % (str(request), str(ex),))

    error_is_client_side = False

    try:
        requests_response = self.__session.send(
            request=requests_prepared_request,
            timeout=self.timeout(),

            # To be able to enforce max_size
            stream=True,
        )

    except requests.TooManyRedirects as ex:

        # On too many redirects, return the last fetched page (just like LWP::UserAgent does)
        log.warning("Exceeded max. redirects for URL %s" % request.url())
        requests_response = ex.response
        response_data = str(ex)

    except requests.Timeout as ex:

        log.warning("Timeout for URL %s" % request.url())

        # We treat timeouts as client-side errors too because we can retry on them
        error_is_client_side = True

        # Fake a 408 response so the caller still receives a Response object
        requests_response = requests.Response()
        requests_response.status_code = HTTPStatus.REQUEST_TIMEOUT.value
        requests_response.reason = HTTPStatus.REQUEST_TIMEOUT.phrase
        requests_response.request = requests_prepared_request

        requests_response.history = []

        response_data = str(ex)

    except Exception as ex:

        # Client-side error
        log.warning("Client-side error while processing request %s: %s" % (str(request), str(ex),))

        error_is_client_side = True

        # Fake a 400 response so the caller still receives a Response object
        requests_response = requests.Response()
        requests_response.status_code = HTTPStatus.BAD_REQUEST.value
        requests_response.reason = "Client-side error"
        requests_response.request = requests_prepared_request

        # Previous request / response chain is not built for client-side errored requests
        requests_response.history = []

        requests_response.headers = {
            # LWP::UserAgent compatibility
            'Client-Warning': 'Client-side error',
        }

        response_data = str(ex)

    else:

        try:

            max_size = self.max_size()

            response_data = ""
            read_response_data = True

            if max_size is not None:
                content_length = requests_response.headers.get('Content-Length', None)

                if content_length is not None:
                    content_length = int(content_length)

                    if content_length > max_size:
                        log.warning("Content-Length exceeds %d for URL %s" % (max_size, url,))

                        # Release the response to return connection back to the pool
                        # (http://docs.python-requests.org/en/master/user/advanced/#body-content-workflow)
                        requests_response.close()

                        read_response_data = False

            if read_response_data:

                if requests_response.encoding is None:

                    if requests_response.apparent_encoding is None:
                        # If encoding is not in HTTP headers nor can be determined from content itself, assume that
                        # it's UTF-8
                        requests_response.encoding = 'UTF-8'

                    else:
                        # Test the encoding guesser's opinion, just like browsers do
                        requests_response.encoding = requests_response.apparent_encoding

                else:

                    # If "Content-Type" HTTP header contains a string "text" and doesn't have "charset" property,
                    # "requests" falls back to setting the encoding to ISO-8859-1, which is probably not right
                    # (encoding might have been defined in the HTML content itself via <meta> tag), so we use the
                    # "apparent encoding" instead
                    if requests_response.encoding.lower() == 'iso-8859-1':
                        if requests_response.apparent_encoding is not None:
                            requests_response.encoding = requests_response.apparent_encoding

                # Some pages report some funky encoding; in that case, fallback to UTF-8
                try:
                    codecs.lookup(requests_response.encoding)
                except LookupError:
                    log.warning("Invalid encoding %s for URL %s" % (requests_response.encoding, requests_response.url))
                    requests_response.encoding = 'UTF-8'

                response_data_size = 0
                for chunk in requests_response.iter_content(chunk_size=None, decode_unicode=True):
                    response_data += chunk
                    response_data_size += len(chunk)

                    # Content-Length might be missing / lying, so we measure size while fetching the data too
                    if max_size is not None:
                        if response_data_size > max_size:
                            log.warning("Data size exceeds %d for URL %s" % (max_size, url,))

                            # Release the response to return connection back to the pool
                            # (http://docs.python-requests.org/en/master/user/advanced/#body-content-workflow)
                            requests_response.close()

                            break

        except requests.RequestException as ex:

            log.warning("Error reading data for URL %s" % request.url())

            # We treat timeouts as client-side errors too because we can retry on them
            error_is_client_side = True

            requests_response = requests.Response()
            requests_response.status_code = HTTPStatus.REQUEST_TIMEOUT.value
            requests_response.reason = HTTPStatus.REQUEST_TIMEOUT.phrase
            requests_response.request = requests_prepared_request

            requests_response.history = []

            response_data = str(ex)

    if requests_response is None:
        raise McRequestException("Response from 'requests' is None.")

    if response_data is None:
        # Probably a programming error
        raise McRequestException("Response data is None.")

    response = Response.from_requests_response(
        requests_response=requests_response,
        data=response_data,
    )

    if error_is_client_side:
        response.set_error_is_client_side(error_is_client_side=error_is_client_side)

    # Build the previous request / response chain from the redirects
    current_response = response
    for previous_rq_response in reversed(requests_response.history):
        previous_rq_request = previous_rq_response.request
        previous_response_request = Request.from_requests_prepared_request(
            requests_prepared_request=previous_rq_request)

        previous_response = Response.from_requests_response(requests_response=previous_rq_response)
        previous_response.set_request(request=previous_response_request)

        current_response.set_previous(previous=previous_response)
        current_response = previous_response

    # Redirects might have happened, so we have to recreate the request object from the latest page that was
    # redirected to
    response_request = Request.from_requests_prepared_request(
        requests_prepared_request=requests_response.request)
    response.set_request(response_request)

    return response
def request(self, request: Request) -> Response:
    """Execute a request, return a response.

    All other helpers are supposed to use request() internally as it implements max. size, callbacks, blacklisted
    URLs etc."""
    if request is None:
        raise McRequestException("Request is None.")

    # Rewrite blacklisted URLs and log the request before doing anything else
    request = self.__blacklist_request_if_needed(request=request)
    self.__log_request(request=request)

    try:
        requests_prepared_request = self.__prepare_request(request)
    except Exception as ex:
        raise McRequestException("Unable to prepare request %s: %s" % (str(request), str(ex),))

    try:
        user_agent_response = self.__execute_request(requests_prepared_request)
    except Exception as ex:
        raise McRequestException("Unable to execute request %s: %s" % (str(requests_prepared_request), str(ex),))

    try:
        response_data = self.__read_response_data(user_agent_response.requests_response)
    except Exception as ex:
        log.warning("Error reading data for URL %s" % request.url())

        # Replace the broken response with a fake 408 one so the caller still gets a Response object
        user_agent_response.requests_response = requests.Response()
        user_agent_response.requests_response.status_code = HTTPStatus.REQUEST_TIMEOUT.value
        user_agent_response.requests_response.reason = HTTPStatus.REQUEST_TIMEOUT.phrase
        user_agent_response.requests_response.request = requests_prepared_request
        user_agent_response.requests_response.history = []

        # We treat timeouts as client-side errors too because we can retry on them
        user_agent_response.error_is_client_side = True

        response_data = str(ex)

    if user_agent_response.requests_response is None:
        raise McRequestException("Response from 'requests' is None.")

    if response_data is None:
        # Probably a programming error
        raise McRequestException("Response data is None.")

    response = Response.from_requests_response(
        requests_response=user_agent_response.requests_response,
        data=response_data,
    )

    if user_agent_response.error_is_client_side is True:
        response.set_error_is_client_side(error_is_client_side=user_agent_response.error_is_client_side)

    # Build the previous request / response chain from the redirects
    current_response = response
    for previous_rq_response in reversed(user_agent_response.requests_response.history):
        previous_rq_request = previous_rq_response.request
        previous_response_request = Request.from_requests_prepared_request(
            requests_prepared_request=previous_rq_request)

        # Sometimes reading the (chunked?) previous response's data fails with:
        #
        # AttributeError: 'NoneType' object has no attribute 'readline'
        #
        # Previous response's data is not that important, so fail rather silently.
        try:
            previous_rq_response_data = previous_rq_response.text
        except Exception as ex:
            log.warning("Reading previous response's data failed: %s" % str(ex))
            previous_rq_response_data = ''

        previous_response = Response.from_requests_response(
            requests_response=previous_rq_response,
            data=previous_rq_response_data)
        previous_response.set_request(request=previous_response_request)

        current_response.set_previous(previous=previous_response)
        current_response = previous_response

    # Redirects might have happened, so we have to recreate the request object from the latest page that was
    # redirected to
    response_request = Request.from_requests_prepared_request(
        requests_prepared_request=user_agent_response.requests_response.request)
    response.set_request(response_request)

    return response
def request(self, request: Request) -> Response:
    """Execute a request, return a response.

    All other helpers are supposed to use request() internally as it implements max. size, callbacks, blacklisted
    URLs etc."""
    if request is None:
        raise McRequestException("Request is None.")

    # Rewrite blacklisted URLs and log the request before doing anything else
    request = self.__blacklist_request_if_needed(request=request)
    self.__log_request(request=request)

    try:
        requests_prepared_request = self.__prepare_request(request)
    except Exception as ex:
        raise McRequestException("Unable to prepare request %s: %s" % (str(request), str(ex),))

    try:
        user_agent_response = self.__execute_request(requests_prepared_request)
    except Exception as ex:
        raise McRequestException("Unable to execute request %s: %s" % (str(requests_prepared_request), str(ex),))

    if user_agent_response.requests_response is None:
        raise McRequestException("Response from 'requests' is None.")

    response = Response(
        requests_response=user_agent_response.requests_response,
        max_size=self.max_size(),
        error_is_client_side=user_agent_response.error_is_client_side,
    )

    # Build the previous request / response chain from the redirects
    current_response = response
    for previous_rq_response in reversed(user_agent_response.requests_response.history):
        previous_rq_request = previous_rq_response.request
        previous_response_request = Request.from_requests_prepared_request(
            requests_prepared_request=previous_rq_request
        )

        # Sometimes reading the (chunked?) previous response's data fails with:
        #
        # AttributeError: 'NoneType' object has no attribute 'readline'
        #
        # Previous response's data is not that important, so fail rather silently.
        try:
            previous_rq_response.text
        except Exception as ex:
            log.warning("Reading previous response's data failed: %s" % str(ex))
            previous_rq_response.raw_data = io.StringIO('')

        previous_response = Response(requests_response=previous_rq_response, max_size=self.max_size())
        previous_response.set_request(request=previous_response_request)

        current_response.set_previous(previous=previous_response)
        current_response = previous_response

    # Redirects might have happened, so we have to recreate the request object from the latest page that was
    # redirected to
    response_request = Request.from_requests_prepared_request(
        requests_prepared_request=user_agent_response.requests_response.request
    )
    response.set_request(response_request)

    return response
def target_request_from_linkis_com_url(content: str, archive_site_url: str) -> Union[Request, None]:
    """Given the content of a linkis.com web page, find the original URL in the content,
    which may be in one of several places in the DOM, and return a request for said URL.

    :param content: HTML content of the linkis.com page (may be None).
    :param archive_site_url: URL the content was fetched from; must be a linkis.com URL.
    :return: Request for the original (unwrapped) URL, or None if it can't be found.
    """
    content = decode_object_from_bytes_if_needed(content)
    archive_site_url = decode_object_from_bytes_if_needed(archive_site_url)

    if content is None:
        return None

    # FIX: dots are now escaped; the previous patterns ('linkis.com') would have
    # matched any character in place of the dot, e.g. "linkisXcom".
    if not re.match(pattern=r'^https?://[^/]*linkis\.com/', string=archive_site_url, flags=re.IGNORECASE):
        return None

    # Pattern shared by the DOM and JavaScript fallbacks below: a candidate URL
    # still pointing back at linkis.com is not a useful target.
    linkis_url_pattern = re.compile(r'^https?://linkis\.com', flags=re.IGNORECASE)

    # list of dom search patterns to find nodes with a url and the
    # attributes to use from those nodes as the url.
    #
    # for instance the first item matches:
    #
    # <meta property="og:url" content="http://foo.bar">
    #
    try:
        html_parser = etree.HTMLParser()
        html_tree = etree.parse(StringIO(content), html_parser)

        dom_maps = [
            ('//meta[@property="og:url"]', 'content'),
            ('//a[@class="js-youtube-ln-event"]', 'href'),
            ('//iframe[@id="source_site"]', 'src'),
        ]

        for xpath, url_attribute in dom_maps:
            nodes = html_tree.xpath(xpath)
            if len(nodes) > 0:
                matched_url = nodes[0].get(url_attribute)
                if matched_url is not None:
                    if not linkis_url_pattern.match(matched_url):
                        if is_http_url(matched_url):
                            return Request(method='GET', url=matched_url)
                        else:
                            log.error("URL matched, but is not HTTP(s): %s" % matched_url)

    except Exception as ex:
        log.warning("Unable to parse HTML for URL %s: %s" % (archive_site_url, str(ex),))

    # As a last resort, look for the longUrl key in a JavaScript array
    matches = re.search(pattern=r'"longUrl":\s*"(?P<target_url>[^"]+)"', string=content, flags=re.IGNORECASE)
    if matches:
        target_url = matches.group('target_url')

        # kludge to de-escape \'d characters in javascript -- 99% of urls
        # are captured by the dom stuff above, we shouldn't get to this
        # point often
        target_url = target_url.replace('\\', '')

        if not linkis_url_pattern.match(target_url):
            if is_http_url(target_url):
                return Request(method='GET', url=target_url)
            else:
                log.error("URL matched, but is not HTTP(s): %s" % target_url)

    log.warning("No URL found for linkis URL: %s" % archive_site_url)

    return None
def _make_dummy_bypassed_response(url: str) -> Response:
    """Given a url, make and return a response object with that url and empty content."""
    dummy_request = Request('GET', url)
    dummy_response = Response(code=200, message='OK', headers={}, data='')
    dummy_response.set_request(dummy_request)
    return dummy_response
def request(self, request: Request) -> Response:
    """Execute a request, return a response.

    All other helpers are supposed to use request() internally as it implements
    max. size, callbacks, blacklisted URLs etc.

    :param request: Request object to execute; must not be None.
    :return: Response object, with its previous request/response redirect chain attached.
    :raises McRequestException: if the request is None, cannot be prepared or
        executed, or if the underlying 'requests' response is None.
    """
    if request is None:
        raise McRequestException("Request is None.")

    request = self.__blacklist_request_if_needed(request=request)

    self.__log_request(request=request)

    try:
        requests_prepared_request = self.__prepare_request(request)
    except Exception as ex:
        raise McRequestException("Unable to prepare request %s: %s" % (
            str(request),
            str(ex),
        ))

    try:
        user_agent_response = self.__execute_request(requests_prepared_request)
    except Exception as ex:
        # BUG FIX: a stray 'raise ex' used to precede this statement, which
        # re-raised the original exception and made the McRequestException
        # wrapping below unreachable dead code. Wrap the failure as intended.
        raise McRequestException("Unable to execute request %s: %s" % (
            str(requests_prepared_request),
            str(ex),
        ))

    if user_agent_response.requests_response is None:
        raise McRequestException("Response from 'requests' is None.")

    # Wrap the raw 'requests' response into the project's Response type,
    # enforcing the configured maximum download size.
    response = Response(
        requests_response=user_agent_response.requests_response,
        max_size=self.max_size(),
        error_is_client_side=user_agent_response.error_is_client_side,
    )

    # Build the previous request / response chain from the redirects; 'history'
    # is walked in reverse so each hop is linked as the "previous" of the one
    # that followed it.
    current_response = response
    for previous_rq_response in reversed(user_agent_response.requests_response.history):
        previous_rq_request = previous_rq_response.request
        previous_response_request = Request.from_requests_prepared_request(
            requests_prepared_request=previous_rq_request)

        # Sometimes reading the (chunked?) previous response's data fails with:
        #
        # AttributeError: 'NoneType' object has no attribute 'readline'
        #
        # Previous response's data is not that important, so fail rather silently.
        try:
            # Accessed only for its side effect of forcing the body to be read.
            previous_rq_response.text
        except Exception as ex:
            log.warning("Reading previous response's data failed: %s" % str(ex))
            # Substitute an empty body so later reads don't blow up.
            previous_rq_response.raw_data = io.StringIO('')

        previous_response = Response(
            requests_response=previous_rq_response, max_size=self.max_size())
        previous_response.set_request(request=previous_response_request)

        current_response.set_previous(previous=previous_response)
        current_response = previous_response

    # Redirects might have happened, so we have to recreate the request object from the latest page that was
    # redirected to
    response_request = Request.from_requests_prepared_request(
        requests_prepared_request=user_agent_response.requests_response.request)
    response.set_request(response_request)

    return response