def check_uri() -> Tuple[str, str, int]: # split off anchor if '#' in uri: req_url, anchor = uri.split('#', 1) for rex in self.anchors_ignore: if rex.match(anchor): anchor = None break else: req_url = uri anchor = None # handle non-ASCII URIs try: req_url.encode('ascii') except UnicodeError: req_url = encode_uri(req_url) # Get auth info, if any for pattern, auth_info in self.auth: if pattern.match(uri): break else: auth_info = None # update request headers for the URL kwargs['headers'] = get_request_headers() try: if anchor and self.app.config.linkcheck_anchors: # Read the whole document and see if #anchor exists response = requests.get(req_url, stream=True, config=self.app.config, auth=auth_info, **kwargs) response.raise_for_status() found = check_anchor(response, unquote(anchor)) if not found: raise Exception(__("Anchor '%s' not found") % anchor) else: try: # try a HEAD request first, which should be easier on # the server and the network response = requests.head(req_url, allow_redirects=True, config=self.app.config, auth=auth_info, **kwargs) response.raise_for_status() except HTTPError: # retry with GET request if that fails, some servers # don't like HEAD requests. response = requests.get(req_url, stream=True, config=self.app.config, auth=auth_info, **kwargs) response.raise_for_status() except HTTPError as err: if err.response.status_code == 401: # We'll take "Unauthorized" as working. return 'working', ' - unauthorized', 0 elif err.response.status_code == 503: # We'll take "Service Unavailable" as ignored. return 'ignored', str(err), 0 else: return 'broken', str(err), 0 except Exception as err: return 'broken', str(err), 0 if response.url.rstrip('/') == req_url.rstrip('/'): return 'working', '', 0 else: new_url = response.url if anchor: new_url += '#' + anchor # history contains any redirects, get last if response.history: code = response.history[-1].status_code return 'redirected', new_url, code else: return 'redirected', new_url, 0
def check_uri() -> Tuple[str, str, int]: # split off anchor if '#' in uri: req_url, anchor = uri.split('#', 1) for rex in self.anchors_ignore: if rex.match(anchor): anchor = None break else: req_url = uri anchor = None # handle non-ASCII URIs try: req_url.encode('ascii') except UnicodeError: req_url = encode_uri(req_url) # Get auth info, if any for pattern, auth_info in self.auth: if pattern.match(uri): break else: auth_info = None # update request headers for the URL kwargs['headers'] = get_request_headers() try: if anchor and self.config.linkcheck_anchors: # Read the whole document and see if #anchor exists response = requests.get(req_url, stream=True, config=self.config, auth=auth_info, **kwargs) response.raise_for_status() found = check_anchor(response, unquote(anchor)) if not found: raise Exception(__("Anchor '%s' not found") % anchor) else: try: # try a HEAD request first, which should be easier on # the server and the network response = requests.head(req_url, allow_redirects=True, config=self.config, auth=auth_info, **kwargs) response.raise_for_status() # Servers drop the connection on HEAD requests, causing # ConnectionError. except (ConnectionError, HTTPError, TooManyRedirects) as err: if isinstance( err, HTTPError) and err.response.status_code == 429: raise # retry with GET request if that fails, some servers # don't like HEAD requests. response = requests.get(req_url, stream=True, config=self.config, auth=auth_info, **kwargs) response.raise_for_status() except HTTPError as err: if err.response.status_code == 401: # We'll take "Unauthorized" as working. return 'working', ' - unauthorized', 0 elif err.response.status_code == 429: next_check = self.limit_rate(err.response) if next_check is not None: self.wqueue.put(CheckRequest(next_check, hyperlink), False) return 'rate-limited', '', 0 return 'broken', str(err), 0 elif err.response.status_code == 503: # We'll take "Service Unavailable" as ignored. return 'ignored', str(err), 0 else: return 'broken', str(err), 0 except Exception as err: return 'broken', str(err), 0 else: netloc = urlparse(req_url).netloc try: del self.rate_limits[netloc] except KeyError: pass if response.url.rstrip('/') == req_url.rstrip('/'): return 'working', '', 0 else: new_url = response.url if anchor: new_url += '#' + anchor if allowed_redirect(req_url, new_url): return 'working', '', 0 elif response.history: # history contains any redirects, get last code = response.history[-1].status_code return 'redirected', new_url, code else: return 'redirected', new_url, 0
def check_uri(): # type: () -> Tuple[unicode, unicode, int] # split off anchor if '#' in uri: req_url, anchor = uri.split('#', 1) for rex in self.anchors_ignore: if rex.match(anchor): anchor = None break else: req_url = uri anchor = None # handle non-ASCII URIs try: req_url.encode('ascii') except UnicodeError: req_url = encode_uri(req_url) try: if anchor and self.app.config.linkcheck_anchors: # Read the whole document and see if #anchor exists response = requests.get(req_url, stream=True, config=self.app.config, **kwargs) found = check_anchor(response, unquote(anchor)) if not found: raise Exception("Anchor '%s' not found" % anchor) else: try: # try a HEAD request first, which should be easier on # the server and the network response = requests.head(req_url, config=self.app.config, **kwargs) response.raise_for_status() except HTTPError as err: # retry with GET request if that fails, some servers # don't like HEAD requests. response = requests.get(req_url, stream=True, config=self.app.config, **kwargs) response.raise_for_status() except HTTPError as err: if err.response.status_code == 401: # We'll take "Unauthorized" as working. return 'working', ' - unauthorized', 0 else: return 'broken', str(err), 0 except Exception as err: if is_ssl_error(err): return 'ignored', str(err), 0 else: return 'broken', str(err), 0 if response.url.rstrip('/') == req_url.rstrip('/'): return 'working', '', 0 else: new_url = response.url if anchor: new_url += '#' + anchor # history contains any redirects, get last if response.history: code = response.history[-1].status_code return 'redirected', new_url, code else: return 'redirected', new_url, 0
def check_uri(): # type: () -> Tuple[unicode, unicode, int] # split off anchor if '#' in uri: req_url, anchor = uri.split('#', 1) for rex in self.anchors_ignore: if rex.match(anchor): anchor = None break else: req_url = uri anchor = None # handle non-ASCII URIs try: req_url.encode('ascii') except UnicodeError: req_url = encode_uri(req_url) try: if anchor and self.app.config.linkcheck_anchors: # Read the whole document and see if #anchor exists response = requests.get(req_url, stream=True, config=self.app.config, **kwargs) found = check_anchor(response, unquote(anchor)) if not found: raise Exception("Anchor '%s' not found" % anchor) else: try: # try a HEAD request first, which should be easier on # the server and the network response = requests.head(req_url, config=self.app.config, **kwargs) response.raise_for_status() except HTTPError as err: # retry with GET request if that fails, some servers # don't like HEAD requests. response = requests.get(req_url, stream=True, config=self.app.config, **kwargs) response.raise_for_status() except HTTPError as err: if err.response.status_code == 401: # We'll take "Unauthorized" as working. return 'working', ' - unauthorized', 0 else: return 'broken', str(err), 0 except Exception as err: if is_ssl_error(err): return 'ignored', str(err), 0 else: return 'broken', str(err), 0 if response.url.rstrip('/') == req_url.rstrip('/'): return 'working', '', 0 else: new_url = response.url if anchor: new_url += '#' + anchor # history contains any redirects, get last if response.history: code = response.history[-1].status_code return 'redirected', new_url, code
def check_uri() -> Tuple[str, str, int]: # split off anchor if '#' in uri: req_url, anchor = uri.split('#', 1) for rex in self.anchors_ignore: if rex.match(anchor): anchor = None break else: req_url = uri anchor = None # handle non-ASCII URIs try: req_url.encode('ascii') except UnicodeError: req_url = encode_uri(req_url) # Get auth info, if any for pattern, auth_info in self.auth: if pattern.match(uri): break else: auth_info = None # update request headers for the URL kwargs['headers'] = get_request_headers() try: if anchor and self.config.linkcheck_anchors: # Read the whole document and see if #anchor exists response = requests.get(req_url, stream=True, config=self.config, auth=auth_info, **kwargs) response.raise_for_status() anchor_str = unquote(anchor) # Hack (?): https://github.com/container-storage-interface/spec/blob/master/spec.md#getplugininfo # is a valid anchor, but the actual id of the anchor is user-content-getplugininfo, which causes # the anchor check to fail: # <a id="user-content-getplugininfo" class="anchor" aria-hidden="true" href="#getplugininfo"> # # Might have to be fixed in AnchorCheckParser instead? if req_url.startswith('https://github.com/'): anchor_str = "user-content-" + anchor_str found = check_anchor(response, anchor_str) if not found: raise Exception(__("Anchor '%s' not found") % anchor) else: try: # try a HEAD request first, which should be easier on # the server and the network response = requests.head(req_url, allow_redirects=True, config=self.config, auth=auth_info, **kwargs) response.raise_for_status() except (HTTPError, TooManyRedirects) as err: if isinstance( err, HTTPError) and err.response.status_code == 429: raise # retry with GET request if that fails, some servers # don't like HEAD requests. response = requests.get(req_url, stream=True, config=self.config, auth=auth_info, **kwargs) response.raise_for_status() except HTTPError as err: if err.response.status_code == 401: # We'll take "Unauthorized" as working. return 'working', ' - unauthorized', 0 elif err.response.status_code == 429: next_check = self.limit_rate(err.response) if next_check is not None: self.wqueue.put(CheckRequest(next_check, hyperlink), False) return 'rate-limited', '', 0 return 'broken', str(err), 0 elif err.response.status_code == 503: # We'll take "Service Unavailable" as ignored. return 'ignored', str(err), 0 else: return 'broken', str(err), 0 except Exception as err: return 'broken', str(err), 0 else: netloc = urlparse(req_url).netloc try: del self.rate_limits[netloc] except KeyError: pass if response.url.rstrip('/') == req_url.rstrip('/'): return 'working', '', 0 else: new_url = response.url if anchor: new_url += '#' + anchor # history contains any redirects, get last if response.history: code = response.history[-1].status_code return 'redirected', new_url, code else: return 'redirected', new_url, 0