def test_encode_uri():
    """Check that encode_uri() percent-encodes the non-ASCII parts of a URI."""
    expected = (u'https://ru.wikipedia.org/wiki/%D0%A1%D0%B8%D1%81%D1%82%D0%B5%D0%BC%D0%B0_'
                u'%D1%83%D0%BF%D1%80%D0%B0%D0%B2%D0%BB%D0%B5%D0%BD%D0%B8%D1%8F_'
                u'%D0%B1%D0%B0%D0%B7%D0%B0%D0%BC%D0%B8_%D0%B4%D0%B0%D0%BD%D0%BD%D1%8B%D1%85')
    uri = u'https://ru.wikipedia.org/wiki/Система_управления_базами_данных'
    # Compare with ==; "assert expected, msg" would only test that
    # `expected` is truthy and could never fail.
    assert expected == encode_uri(uri)

    # Already percent-encoded query parts must be left untouched; only the
    # raw check mark is expected to be encoded.
    expected = (u'https://github.com/search?utf8=%E2%9C%93&q=is%3Aissue+is%3Aopen+is%3A'
                u'sprint-friendly+user%3Ajupyter&type=Issues&ref=searchresults')
    uri = (u'https://github.com/search?utf8=✓&q=is%3Aissue+is%3Aopen+is%3A'
           u'sprint-friendly+user%3Ajupyter&type=Issues&ref=searchresults')
    assert expected == encode_uri(uri)
def test_encode_uri():
    """Verify encode_uri() against known (raw URI, encoded URI) pairs."""
    cases = [
        # Cyrillic path segment.
        (
            ('https://ru.wikipedia.org/wiki'
             '/Система_управления_базами_данных'),
            ('https://ru.wikipedia.org/wiki/%D0%A1%D0%B8%D1%81%D1%82%D0%B5%D0%BC%D0%B0_'
             '%D1%83%D0%BF%D1%80%D0%B0%D0%B2%D0%BB%D0%B5%D0%BD%D0%B8%D1%8F_'
             '%D0%B1%D0%B0%D0%B7%D0%B0%D0%BC%D0%B8_%D0%B4%D0%B0%D0%BD%D0%BD%D1%8B%D1%85'),
        ),
        # Query string mixing a raw check mark with pre-encoded escapes.
        (
            ('https://github.com/search?utf8=✓&q=is%3Aissue+is%3Aopen+is%3A'
             'sprint-friendly+user%3Ajupyter&type=Issues&ref=searchresults'),
            ('https://github.com/search?utf8=%E2%9C%93&q=is%3Aissue+is%3Aopen+is%3A'
             'sprint-friendly+user%3Ajupyter&type=Issues&ref=searchresults'),
        ),
    ]
    for raw_uri, encoded_uri in cases:
        assert encoded_uri == encode_uri(raw_uri)
def check_uri():
    """Request the URI and classify the outcome.

    ``uri``, ``kwargs`` and ``self`` are not parameters; they are read from
    the surrounding scope.

    Returns a ``(status, info, code)`` tuple where ``status`` is one of
    ``'working'``, ``'broken'``, ``'ignored'`` or ``'redirected'``.
    """
    # split off anchor
    if '#' in uri:
        req_url, anchor = uri.split('#', 1)
        # an anchor matching any ignore pattern is not checked
        for rex in self.anchors_ignore:
            if rex.match(anchor):
                anchor = None
                break
    else:
        req_url = uri
        anchor = None

    # handle non-ASCII URIs
    try:
        req_url.encode('ascii')
    except UnicodeError:
        req_url = encode_uri(req_url)

    try:
        if anchor and self.app.config.linkcheck_anchors:
            # Read the whole document and see if #anchor exists
            response = requests.get(req_url, stream=True, headers=self.headers,
                                    **kwargs)
            found = check_anchor(response, unquote(anchor))

            if not found:
                raise Exception("Anchor '%s' not found" % anchor)
        else:
            try:
                # try a HEAD request first, which should be easier on
                # the server and the network
                response = requests.head(req_url, headers=self.headers, **kwargs)
                response.raise_for_status()
            except HTTPError:
                # retry with GET request if that fails, some servers
                # don't like HEAD requests.
                response = requests.get(req_url, stream=True, headers=self.headers,
                                        **kwargs)
                response.raise_for_status()
    except HTTPError as err:
        if err.response.status_code == 401:
            # We'll take "Unauthorized" as working.
            return 'working', ' - unauthorized', 0
        else:
            return 'broken', str(err), 0
    except Exception as err:
        if is_ssl_error(err):
            return 'ignored', str(err), 0
        else:
            return 'broken', str(err), 0

    if response.url.rstrip('/') == req_url.rstrip('/'):
        return 'working', '', 0

    new_url = response.url
    if anchor:
        new_url += '#' + anchor
    # history contains any redirects, get last
    if response.history:
        code = response.history[-1].status_code
        return 'redirected', new_url, code
    # The final URL differs but no redirect was recorded: still report a
    # redirect (with code 0) instead of implicitly returning None.
    return 'redirected', new_url, 0
def check_uri():
    """Request the URI through ``self.session`` and classify the outcome.

    ``uri``, ``kwargs`` and ``self`` are not parameters; they are read from
    the surrounding scope.

    Returns a ``(status, info, code)`` tuple where ``status`` is one of
    ``'working'``, ``'broken'`` or ``'redirected'``.
    """
    # split off anchor
    if '#' in uri:
        req_url, anchor = uri.split('#', 1)
    else:
        req_url = uri
        anchor = None

    # handle non-ASCII URIs
    try:
        req_url.encode('ascii')
    except UnicodeError:
        req_url = encode_uri(req_url)

    try:
        if anchor and self.app.config.linkcheck_anchors and \
           not anchor.startswith('!'):
            # Read the whole document and see if #anchor exists
            # (Anchors starting with ! are ignored since they are
            # commonly used for dynamic pages)
            response = self.session.get(req_url, stream=True, **kwargs)
            found = check_anchor(response, unquote(anchor))

            if not found:
                raise Exception("Anchor '%s' not found" % anchor)
        else:
            try:
                # try a HEAD request, which should be easier on
                # the server and the network
                response = self.session.head(req_url, **kwargs)
                response.raise_for_status()
            except HTTPError as err:
                if err.response.status_code not in (403, 405):
                    raise
                # retry with GET if that fails, some servers
                # don't like HEAD requests and reply with 403 or 405
                response = self.session.get(req_url, stream=True, **kwargs)
                response.raise_for_status()
    except HTTPError as err:
        if err.response.status_code == 401:
            # We'll take "Unauthorized" as working.
            return 'working', ' - unauthorized', 0
        else:
            return 'broken', str(err), 0
    except Exception as err:
        return 'broken', str(err), 0

    if response.url.rstrip('/') == req_url.rstrip('/'):
        return 'working', '', 0

    new_url = response.url
    if anchor:
        new_url += '#' + anchor
    # history contains any redirects, get last
    if response.history:
        code = response.history[-1].status_code
        return 'redirected', new_url, code
    # The final URL differs but no redirect was recorded: still report a
    # redirect (with code 0) instead of implicitly returning None.
    return 'redirected', new_url, 0
def check_uri():
    """Request the URI with ``requests`` and classify the outcome.

    ``uri``, ``kwargs`` and ``self`` are not parameters; they are read from
    the surrounding scope.

    Returns a ``(status, info, code)`` tuple where ``status`` is one of
    ``'working'``, ``'broken'`` or ``'redirected'``.
    """
    # split off anchor
    if '#' in uri:
        req_url, anchor = uri.split('#', 1)
    else:
        req_url = uri
        anchor = None

    # handle non-ASCII URIs
    try:
        req_url.encode('ascii')
    except UnicodeError:
        req_url = encode_uri(req_url)

    try:
        if anchor and self.app.config.linkcheck_anchors and \
           not anchor.startswith('!'):
            # Read the whole document and see if #anchor exists
            # (Anchors starting with ! are ignored since they are
            # commonly used for dynamic pages)
            response = requests.get(req_url, stream=True, **kwargs)
            found = check_anchor(response, unquote(anchor))

            if not found:
                raise Exception("Anchor '%s' not found" % anchor)
        else:
            try:
                # try a HEAD request, which should be easier on
                # the server and the network
                response = requests.head(req_url, **kwargs)
                response.raise_for_status()
            except HTTPError as err:
                if err.response.status_code not in (403, 405):
                    raise
                # retry with GET if that fails, some servers
                # don't like HEAD requests and reply with 403 or 405
                response = requests.get(req_url, stream=True, **kwargs)
                response.raise_for_status()
    except HTTPError as err:
        if err.response.status_code == 401:
            # We'll take "Unauthorized" as working.
            return 'working', ' - unauthorized', 0
        else:
            return 'broken', str(err), 0
    except Exception as err:
        return 'broken', str(err), 0

    if response.url.rstrip('/') == req_url.rstrip('/'):
        return 'working', '', 0

    new_url = response.url
    if anchor:
        new_url += '#' + anchor
    # history contains any redirects, get last
    if response.history:
        code = response.history[-1].status_code
        return 'redirected', new_url, code
    # The final URL differs but no redirect was recorded: still report a
    # redirect (with code 0) instead of implicitly returning None.
    return 'redirected', new_url, 0
def check_uri() -> Tuple[str, str, int]:
    """Fetch the URI and report it as a ``(status, info, code)`` tuple.

    Works on ``uri``, ``kwargs`` and ``self`` taken from the surrounding
    scope. ``status`` is one of ``'working'``, ``'broken'``, ``'ignored'``
    or ``'redirected'``.
    """
    # Separate the fragment from the URL that is actually requested.
    req_url, hash_sep, anchor = uri.partition('#')
    if not hash_sep:
        anchor = None
    elif any(rex.match(anchor) for rex in self.anchors_ignore):
        # Anchors matching an ignore pattern are not verified.
        anchor = None

    # Percent-encode URIs that contain non-ASCII characters.
    try:
        req_url.encode('ascii')
    except UnicodeError:
        req_url = encode_uri(req_url)

    # Credentials of the first auth pattern matching the URI, if any.
    auth_info = next(
        (info for pattern, info in self.auth if pattern.match(uri)), None)

    # Refresh the request headers for this URL.
    kwargs['headers'] = get_request_headers()

    try:
        if anchor and self.app.config.linkcheck_anchors:
            # The whole document is needed to look for #anchor.
            response = requests.get(req_url, stream=True,
                                    config=self.app.config,
                                    auth=auth_info, **kwargs)
            response.raise_for_status()
            if not check_anchor(response, unquote(anchor)):
                raise Exception(__("Anchor '%s' not found") % anchor)
        else:
            try:
                # HEAD first: lighter on the server and the network.
                response = requests.head(req_url, allow_redirects=True,
                                         config=self.app.config,
                                         auth=auth_info, **kwargs)
                response.raise_for_status()
            except HTTPError:
                # Some servers dislike HEAD requests; fall back to GET.
                response = requests.get(req_url, stream=True,
                                        config=self.app.config,
                                        auth=auth_info, **kwargs)
                response.raise_for_status()
    except HTTPError as err:
        status_code = err.response.status_code
        if status_code == 401:
            # "Unauthorized" still counts as a working link.
            return 'working', ' - unauthorized', 0
        if status_code == 503:
            # "Service Unavailable" is reported as ignored.
            return 'ignored', str(err), 0
        return 'broken', str(err), 0
    except Exception as err:
        return 'broken', str(err), 0

    final_url = response.url
    if final_url.rstrip('/') == req_url.rstrip('/'):
        return 'working', '', 0

    if anchor:
        final_url += '#' + anchor
    # The last history entry carries the status code of the final redirect.
    code = response.history[-1].status_code if response.history else 0
    return 'redirected', final_url, code
def check_uri() -> Tuple[str, str, int]:
    """Fetch the URI and report it as a ``(status, info, code)`` tuple.

    Works on ``uri``, ``hyperlink``, ``kwargs`` and ``self`` taken from the
    surrounding scope. ``status`` is one of ``'working'``, ``'broken'``,
    ``'ignored'``, ``'rate-limited'`` or ``'redirected'``.
    """
    # Separate the fragment from the URL that is actually requested.
    req_url, hash_sep, anchor = uri.partition('#')
    if not hash_sep:
        anchor = None
    elif any(rex.match(anchor) for rex in self.anchors_ignore):
        # Anchors matching an ignore pattern are not verified.
        anchor = None

    # Percent-encode URIs that contain non-ASCII characters.
    try:
        req_url.encode('ascii')
    except UnicodeError:
        req_url = encode_uri(req_url)

    # Credentials of the first auth pattern matching the URI, if any.
    auth_info = next(
        (info for pattern, info in self.auth if pattern.match(uri)), None)

    # Refresh the request headers for this URL.
    kwargs['headers'] = get_request_headers()

    try:
        if anchor and self.config.linkcheck_anchors:
            # The whole document is needed to look for #anchor.
            response = requests.get(req_url, stream=True, config=self.config,
                                    auth=auth_info, **kwargs)
            response.raise_for_status()
            if not check_anchor(response, unquote(anchor)):
                raise Exception(__("Anchor '%s' not found") % anchor)
        else:
            try:
                # HEAD first: lighter on the server and the network.
                response = requests.head(req_url, allow_redirects=True,
                                         config=self.config, auth=auth_info,
                                         **kwargs)
                response.raise_for_status()
            # Servers drop the connection on HEAD requests, causing
            # ConnectionError.
            except (ConnectionError, HTTPError, TooManyRedirects) as err:
                too_many_requests = (isinstance(err, HTTPError) and
                                     err.response.status_code == 429)
                if too_many_requests:
                    # Leave 429 to the outer handler; do not retry with GET.
                    raise
                # Some servers dislike HEAD requests; fall back to GET.
                response = requests.get(req_url, stream=True,
                                        config=self.config,
                                        auth=auth_info, **kwargs)
                response.raise_for_status()
    except HTTPError as err:
        status_code = err.response.status_code
        if status_code == 401:
            # "Unauthorized" still counts as a working link.
            return 'working', ' - unauthorized', 0
        if status_code == 429:
            next_check = self.limit_rate(err.response)
            if next_check is None:
                return 'broken', str(err), 0
            # Queue the link again for the time limit_rate() returned.
            self.wqueue.put(CheckRequest(next_check, hyperlink), False)
            return 'rate-limited', '', 0
        if status_code == 503:
            # "Service Unavailable" is reported as ignored.
            return 'ignored', str(err), 0
        return 'broken', str(err), 0
    except Exception as err:
        return 'broken', str(err), 0

    # Every handler above returns, so this point is reached only after a
    # successful request: forget any rate limit stored for this host.
    netloc = urlparse(req_url).netloc
    try:
        del self.rate_limits[netloc]
    except KeyError:
        pass

    final_url = response.url
    if final_url.rstrip('/') == req_url.rstrip('/'):
        return 'working', '', 0

    if anchor:
        final_url += '#' + anchor

    if allowed_redirect(req_url, final_url):
        return 'working', '', 0
    # The last history entry carries the status code of the final redirect.
    code = response.history[-1].status_code if response.history else 0
    return 'redirected', final_url, code
def check_uri():
    # type: () -> Tuple[unicode, unicode, int]
    """Request the URI and classify the outcome.

    ``uri``, ``kwargs`` and ``self`` are not parameters; they are read from
    the surrounding scope.

    Returns a ``(status, info, code)`` tuple where ``status`` is one of
    ``'working'``, ``'broken'``, ``'ignored'`` or ``'redirected'``.
    """
    # split off anchor
    if '#' in uri:
        req_url, anchor = uri.split('#', 1)
        # an anchor matching any ignore pattern is not checked
        for rex in self.anchors_ignore:
            if rex.match(anchor):
                anchor = None
                break
    else:
        req_url = uri
        anchor = None

    # handle non-ASCII URIs
    try:
        req_url.encode('ascii')
    except UnicodeError:
        req_url = encode_uri(req_url)

    try:
        if anchor and self.app.config.linkcheck_anchors:
            # Read the whole document and see if #anchor exists
            response = requests.get(req_url, stream=True, config=self.app.config,
                                    **kwargs)
            found = check_anchor(response, unquote(anchor))

            if not found:
                raise Exception("Anchor '%s' not found" % anchor)
        else:
            try:
                # try a HEAD request first, which should be easier on
                # the server and the network
                response = requests.head(req_url, config=self.app.config, **kwargs)
                response.raise_for_status()
            except HTTPError:
                # retry with GET request if that fails, some servers
                # don't like HEAD requests.
                response = requests.get(req_url, stream=True, config=self.app.config,
                                        **kwargs)
                response.raise_for_status()
    except HTTPError as err:
        if err.response.status_code == 401:
            # We'll take "Unauthorized" as working.
            return 'working', ' - unauthorized', 0
        else:
            return 'broken', str(err), 0
    except Exception as err:
        if is_ssl_error(err):
            return 'ignored', str(err), 0
        else:
            return 'broken', str(err), 0

    if response.url.rstrip('/') == req_url.rstrip('/'):
        return 'working', '', 0

    new_url = response.url
    if anchor:
        new_url += '#' + anchor
    # history contains any redirects, get last
    if response.history:
        code = response.history[-1].status_code
        return 'redirected', new_url, code
    # The final URL differs but no redirect was recorded: still report a
    # redirect (with code 0) instead of implicitly returning None.
    return 'redirected', new_url, 0
def check_uri():
    """Open the URI through ``opener`` and classify the outcome.

    ``uri``, ``kwargs``, ``opener`` and ``self`` are not parameters; they
    are read from the surrounding scope.

    Returns a ``(status, info, code)`` tuple where ``status`` is one of
    ``'working'``, ``'broken'`` or ``'redirected'``.
    """
    # split off anchor
    if '#' in uri:
        req_url, anchor = uri.split('#', 1)
    else:
        req_url = uri
        anchor = None

    # handle non-ASCII URIs
    try:
        req_url.encode('ascii')
    except UnicodeError:
        req_url = encode_uri(req_url)

    try:
        if anchor and self.app.config.linkcheck_anchors and \
           not anchor.startswith('!'):
            # Read the whole document and see if #anchor exists
            # (Anchors starting with ! are ignored since they are
            # commonly used for dynamic pages)
            req = Request(req_url)
            f = opener.open(req, **kwargs)
            try:
                encoding = 'utf-8'
                if hasattr(f.headers, 'get_content_charset'):
                    encoding = f.headers.get_content_charset() or encoding
                else:
                    encoding = get_content_charset(f) or encoding
                found = check_anchor(TextIOWrapper(f, encoding),
                                     unquote(anchor))
            finally:
                # Close the response even if decoding or parsing raises.
                f.close()

            if not found:
                raise Exception("Anchor '%s' not found" % anchor)
        else:
            try:
                # try a HEAD request, which should be easier on
                # the server and the network
                req = HeadRequest(req_url)
                f = opener.open(req, **kwargs)
                f.close()
            except HTTPError as err:
                if err.code not in (403, 405):
                    raise
                # retry with GET if that fails, some servers
                # don't like HEAD requests and reply with 403 or 405
                req = Request(req_url)
                f = opener.open(req, **kwargs)
                f.close()
    except HTTPError as err:
        if err.code == 401:
            # We'll take "Unauthorized" as working.
            return 'working', ' - unauthorized', 0
        else:
            return 'broken', str(err), 0
    except Exception as err:
        return 'broken', str(err), 0

    if f.url.rstrip('/') == req_url.rstrip('/'):
        return 'working', '', 0

    new_url = f.url
    if anchor:
        new_url += '#' + anchor
    # redirect_code is read from the request object; 0 when it is absent
    code = getattr(req, 'redirect_code', 0)
    return 'redirected', new_url, code
def check():
    """Classify the URI, consulting and updating the per-builder caches.

    ``uri``, ``kwargs``, ``opener`` and ``self`` are not parameters; they
    are read from the surrounding scope. Results of network checks are
    stored in ``self.good``, ``self.broken`` and ``self.redirected``.

    Returns a ``(status, info, code)`` tuple where ``status`` is one of
    ``'unchecked'``, ``'local'``, ``'working'``, ``'broken'``,
    ``'redirected'`` or ``'ignored'``.
    """
    # check for various conditions without bothering the network
    if not uri or uri.startswith(('#', 'mailto:', 'ftp:')):
        return 'unchecked', '', 0
    elif not uri.startswith(('http:', 'https:')):
        return 'local', '', 0
    elif uri in self.good:
        return 'working', 'old', 0
    elif uri in self.broken:
        return 'broken', self.broken[uri], 0
    elif uri in self.redirected:
        return 'redirected', self.redirected[uri][0], self.redirected[uri][1]
    for rex in self.to_ignore:
        if rex.match(uri):
            return 'ignored', '', 0

    # split off anchor (named `anchor` so the builtin hash() is not shadowed)
    if '#' in uri:
        req_url, anchor = uri.split('#', 1)
    else:
        req_url = uri
        anchor = None

    # handle non-ASCII URIs
    try:
        req_url.encode('ascii')
    except UnicodeError:
        req_url = encode_uri(req_url)

    # need to actually check the URI
    try:
        if anchor and self.app.config.linkcheck_anchors:
            # Read the whole document and see if #anchor exists
            req = Request(req_url)
            f = opener.open(req, **kwargs)
            try:
                encoding = 'utf-8'
                if hasattr(f.headers, 'get_content_charset'):
                    encoding = f.headers.get_content_charset() or encoding
                found = check_anchor(TextIOWrapper(f, encoding),
                                     unquote(anchor))
            finally:
                # Close the response even if decoding or parsing raises.
                f.close()

            if not found:
                raise Exception("Anchor '%s' not found" % anchor)
        else:
            try:
                # try a HEAD request, which should be easier on
                # the server and the network
                req = HeadRequest(req_url)
                f = opener.open(req, **kwargs)
                f.close()
            except HTTPError as err:
                if err.code != 405:
                    raise
                # retry with GET if that fails, some servers
                # don't like HEAD requests and reply with 405
                req = Request(req_url)
                f = opener.open(req, **kwargs)
                f.close()
    except HTTPError as err:
        if err.code == 401:
            # We'll take "Unauthorized" as working.
            self.good.add(uri)
            return 'working', ' - unauthorized', 0
        else:
            self.broken[uri] = str(err)
            return 'broken', str(err), 0
    except Exception as err:
        self.broken[uri] = str(err)
        return 'broken', str(err), 0

    if f.url.rstrip('/') == req_url.rstrip('/'):
        self.good.add(uri)
        return 'working', '', 0

    new_url = f.url
    if anchor:
        new_url += '#' + anchor
    # redirect_code is read from the request object; 0 when it is absent
    code = getattr(req, 'redirect_code', 0)
    self.redirected[uri] = (new_url, code)
    return 'redirected', new_url, code
def check():
    """Classify the URI, consulting and updating the per-builder caches.

    ``uri``, ``kwargs``, ``opener`` and ``self`` are not parameters; they
    are read from the surrounding scope. Results of network checks are
    stored in ``self.good``, ``self.broken`` and ``self.redirected``.

    Returns a ``(status, info, code)`` tuple where ``status`` is one of
    ``'unchecked'``, ``'local'``, ``'working'``, ``'broken'``,
    ``'redirected'`` or ``'ignored'``.
    """
    # check for various conditions without bothering the network
    if not uri or uri.startswith(('#', 'mailto:', 'ftp:')):
        return 'unchecked', '', 0
    elif not uri.startswith(('http:', 'https:')):
        return 'local', '', 0
    elif uri in self.good:
        return 'working', 'old', 0
    elif uri in self.broken:
        return 'broken', self.broken[uri], 0
    elif uri in self.redirected:
        cached_url, cached_code = self.redirected[uri][0], self.redirected[uri][1]
        return 'redirected', cached_url, cached_code
    for rex in self.to_ignore:
        if rex.match(uri):
            return 'ignored', '', 0

    # split off anchor (named `anchor` so the builtin hash() is not shadowed)
    if '#' in uri:
        req_url, anchor = uri.split('#', 1)
    else:
        req_url = uri
        anchor = None

    # handle non-ASCII URIs
    try:
        req_url.encode('ascii')
    except UnicodeError:
        req_url = encode_uri(req_url)

    # need to actually check the URI
    try:
        if anchor and self.app.config.linkcheck_anchors:
            # Read the whole document and see if #anchor exists
            req = Request(req_url)
            f = opener.open(req, **kwargs)
            try:
                encoding = 'utf-8'
                if hasattr(f.headers, 'get_content_charset'):
                    encoding = f.headers.get_content_charset() or encoding
                found = check_anchor(TextIOWrapper(f, encoding),
                                     unquote(anchor))
            finally:
                # Close the response even if decoding or parsing raises.
                f.close()

            if not found:
                raise Exception("Anchor '%s' not found" % anchor)
        else:
            try:
                # try a HEAD request, which should be easier on
                # the server and the network
                req = HeadRequest(req_url)
                f = opener.open(req, **kwargs)
                f.close()
            except HTTPError as err:
                if err.code != 405:
                    raise
                # retry with GET if that fails, some servers
                # don't like HEAD requests and reply with 405
                req = Request(req_url)
                f = opener.open(req, **kwargs)
                f.close()
    except HTTPError as err:
        if err.code == 401:
            # We'll take "Unauthorized" as working.
            self.good.add(uri)
            return 'working', ' - unauthorized', 0
        else:
            self.broken[uri] = str(err)
            return 'broken', str(err), 0
    except Exception as err:
        self.broken[uri] = str(err)
        return 'broken', str(err), 0

    if f.url.rstrip('/') == req_url.rstrip('/'):
        self.good.add(uri)
        return 'working', '', 0

    new_url = f.url
    if anchor:
        new_url += '#' + anchor
    # redirect_code is read from the request object; 0 when it is absent
    code = getattr(req, 'redirect_code', 0)
    self.redirected[uri] = (new_url, code)
    return 'redirected', new_url, code
def check_uri() -> Tuple[str, str, int]:
    """Fetch the URI and report it as a ``(status, info, code)`` tuple.

    Works on ``uri``, ``hyperlink``, ``kwargs`` and ``self`` taken from the
    surrounding scope. ``status`` is one of ``'working'``, ``'broken'``,
    ``'ignored'``, ``'rate-limited'`` or ``'redirected'``.
    """
    # Separate the fragment from the URL that is actually requested.
    req_url, hash_sep, anchor = uri.partition('#')
    if not hash_sep:
        anchor = None
    elif any(rex.match(anchor) for rex in self.anchors_ignore):
        # Anchors matching an ignore pattern are not verified.
        anchor = None

    # Percent-encode URIs that contain non-ASCII characters.
    try:
        req_url.encode('ascii')
    except UnicodeError:
        req_url = encode_uri(req_url)

    # Credentials of the first auth pattern matching the URI, if any.
    auth_info = next(
        (info for pattern, info in self.auth if pattern.match(uri)), None)

    # Refresh the request headers for this URL.
    kwargs['headers'] = get_request_headers()

    try:
        if anchor and self.config.linkcheck_anchors:
            # The whole document is needed to look for #anchor.
            response = requests.get(req_url, stream=True, config=self.config,
                                    auth=auth_info, **kwargs)
            response.raise_for_status()

            # Hack (?): on GitHub a link such as
            # https://github.com/container-storage-interface/spec/blob/master/spec.md#getplugininfo
            # is valid, but the id in the page is "user-content-getplugininfo":
            # <a id="user-content-getplugininfo" class="anchor" aria-hidden="true" href="#getplugininfo">
            # so the prefix is added before searching.
            #
            # Might have to be fixed in AnchorCheckParser instead?
            anchor_str = unquote(anchor)
            on_github = req_url.startswith('https://github.com/')
            if on_github:
                anchor_str = "user-content-" + anchor_str

            if not check_anchor(response, anchor_str):
                raise Exception(__("Anchor '%s' not found") % anchor)
        else:
            try:
                # HEAD first: lighter on the server and the network.
                response = requests.head(req_url, allow_redirects=True,
                                         config=self.config, auth=auth_info,
                                         **kwargs)
                response.raise_for_status()
            except (HTTPError, TooManyRedirects) as err:
                too_many_requests = (isinstance(err, HTTPError) and
                                     err.response.status_code == 429)
                if too_many_requests:
                    # Leave 429 to the outer handler; do not retry with GET.
                    raise
                # Some servers dislike HEAD requests; fall back to GET.
                response = requests.get(req_url, stream=True,
                                        config=self.config,
                                        auth=auth_info, **kwargs)
                response.raise_for_status()
    except HTTPError as err:
        status_code = err.response.status_code
        if status_code == 401:
            # "Unauthorized" still counts as a working link.
            return 'working', ' - unauthorized', 0
        if status_code == 429:
            next_check = self.limit_rate(err.response)
            if next_check is None:
                return 'broken', str(err), 0
            # Queue the link again for the time limit_rate() returned.
            self.wqueue.put(CheckRequest(next_check, hyperlink), False)
            return 'rate-limited', '', 0
        if status_code == 503:
            # "Service Unavailable" is reported as ignored.
            return 'ignored', str(err), 0
        return 'broken', str(err), 0
    except Exception as err:
        return 'broken', str(err), 0

    # Every handler above returns, so this point is reached only after a
    # successful request: forget any rate limit stored for this host.
    netloc = urlparse(req_url).netloc
    try:
        del self.rate_limits[netloc]
    except KeyError:
        pass

    final_url = response.url
    if final_url.rstrip('/') == req_url.rstrip('/'):
        return 'working', '', 0

    if anchor:
        final_url += '#' + anchor
    # The last history entry carries the status code of the final redirect.
    code = response.history[-1].status_code if response.history else 0
    return 'redirected', final_url, code
def check_uri():
    """Open the URI through ``opener`` and classify the outcome.

    ``uri``, ``kwargs``, ``opener`` and ``self`` are not parameters; they
    are read from the surrounding scope.

    Returns a ``(status, info, code)`` tuple where ``status`` is one of
    ``'working'``, ``'broken'`` or ``'redirected'``.
    """
    # split off anchor
    if '#' in uri:
        req_url, anchor = uri.split('#', 1)
    else:
        req_url = uri
        anchor = None

    # handle non-ASCII URIs
    try:
        req_url.encode('ascii')
    except UnicodeError:
        req_url = encode_uri(req_url)

    try:
        if anchor and self.app.config.linkcheck_anchors and \
           not anchor.startswith('!'):
            # Read the whole document and see if #anchor exists
            # (Anchors starting with ! are ignored since they are
            # commonly used for dynamic pages)
            req = Request(req_url)
            f = opener.open(req, **kwargs)
            try:
                encoding = 'utf-8'
                if hasattr(f.headers, 'get_content_charset'):
                    encoding = f.headers.get_content_charset() or encoding
                else:
                    encoding = get_content_charset(f) or encoding
                found = check_anchor(TextIOWrapper(f, encoding),
                                     unquote(anchor))
            finally:
                # Close the response even if decoding or parsing raises.
                f.close()

            if not found:
                raise Exception("Anchor '%s' not found" % anchor)
        else:
            try:
                # try a HEAD request, which should be easier on
                # the server and the network
                req = HeadRequest(req_url)
                f = opener.open(req, **kwargs)
                f.close()
            except HTTPError as err:
                if err.code != 405:
                    raise
                # retry with GET if that fails, some servers
                # don't like HEAD requests and reply with 405
                req = Request(req_url)
                f = opener.open(req, **kwargs)
                f.close()
    except HTTPError as err:
        if err.code == 401:
            # We'll take "Unauthorized" as working.
            return 'working', ' - unauthorized', 0
        else:
            return 'broken', str(err), 0
    except Exception as err:
        return 'broken', str(err), 0

    if f.url.rstrip('/') == req_url.rstrip('/'):
        return 'working', '', 0

    new_url = f.url
    if anchor:
        new_url += '#' + anchor
    # redirect_code is read from the request object; 0 when it is absent
    code = getattr(req, 'redirect_code', 0)
    return 'redirected', new_url, code