def __init__(self, source, modname, srcname, decoded=False):
    """Store the module identity and cache the complete source text.

    :param source: seekable file-like object yielding source lines; it is
        rewound to its starting position after every read done here.
    :param modname: name of the module.
    :param srcname: name of the source file.
    :param decoded: when true, *source* is read as-is and no encoding is
        recorded; otherwise the encoding is detected from the stream and
        the content is decoded with it.
    """
    self.modname = modname
    self.srcname = srcname
    self.source = source

    # Remember where the caller left the stream so that each read below
    # can put it back.
    stream = self.source
    start = stream.tell()

    if decoded:
        self.encoding = None
        self.code = stream.read()
        stream.seek(start)
    else:
        self.encoding = detect_encoding(stream.readline)
        stream.seek(start)
        self.code = stream.read().decode(self.encoding)
        stream.seek(start)
        # from now on expose the stream through a decoding wrapper
        self.source = TextIOWrapper(stream, self.encoding)

    # Results of later passes; each one stays None until it is computed.
    self.tokens = None       # filled by tokenize()
    self.parsetree = None    # filled by parse()
    self.attr_docs = None    # filled by find_attr_docs()
    self.tagorder = None     # filled by find_attr_docs()
    self.tags = None         # filled by find_tags()
def check():
    """Classify the link ``uri`` and return ``(status, info, code)``.

    ``uri``, ``self``, ``opener`` and ``kwargs`` are not defined here and are
    taken from the surrounding scope.

    Possible results produced by the visible code:

    * ``('unchecked', '', 0)`` -- empty, pure-anchor, ``mailto:`` or ``ftp:``
    * ``('local', '', 0)`` -- anything that is not ``http:``/``https:``
    * ``('working', '', 0)`` -- already recorded in ``self.good``
    * ``('broken', message, 0)`` -- recorded in ``self.broken``, or the
      request failed just now (the failure is then stored in ``self.broken``)
    * ``('redirected', new_url, code)`` -- recorded in ``self.redirected``
    * ``('ignored', '', 0)`` -- matched one of ``self.to_ignore``
    """
    # check for various conditions without bothering the network
    if not uri or uri[0] == '#' or uri.startswith(('mailto:', 'ftp:')):
        return 'unchecked', '', 0
    elif not uri.startswith(('http:', 'https:')):
        return 'local', '', 0
    elif uri in self.good:
        return 'working', '', 0
    elif uri in self.broken:
        return 'broken', self.broken[uri], 0
    elif uri in self.redirected:
        known = self.redirected[uri]
        return 'redirected', known[0], known[1]
    for rex in self.to_ignore:
        if rex.match(uri):
            return 'ignored', '', 0

    # split off the anchor ("anchor" rather than "hash" so the builtin is
    # not shadowed)
    if '#' in uri:
        req_url, anchor = uri.split('#', 1)
    else:
        req_url = uri
        anchor = None

    # need to actually check the URI
    try:
        if anchor and self.app.config.linkcheck_anchors:
            # Read the whole document and see if #anchor exists
            req = Request(req_url)
            f = opener.open(req, **kwargs)
            try:
                encoding = 'utf-8'
                if hasattr(f.headers, 'get_content_charset'):
                    encoding = f.headers.get_content_charset() or encoding
                found = check_anchor(TextIOWrapper(f, encoding),
                                     unquote(anchor))
            finally:
                # release the response even when reading/decoding fails
                f.close()
            if not found:
                raise Exception("Anchor '%s' not found" % anchor)
        else:
            try:
                # try a HEAD request, which should be easier on
                # the server and the network
                req = HeadRequest(req_url)
                f = opener.open(req, **kwargs)
                f.close()
            except HTTPError as err:
                if err.code != 405:
                    raise
                # retry with GET if that fails, some servers
                # don't like HEAD requests and reply with 405
                req = Request(req_url)
                f = opener.open(req, **kwargs)
                f.close()
    except Exception as err:
        message = str(err)
        self.broken[uri] = message
        return 'broken', message, 0
    # NOTE(review): the visible code has no return after a successful
    # request, so this path yields None while every other path yields a
    # 3-tuple -- confirm whether the tail of the function is missing.
def check():
    """Classify the link ``uri`` and return ``(status, info, code)``.

    ``uri``, ``self``, ``opener`` and ``kwargs`` are not defined here and are
    taken from the surrounding scope.

    Possible results:

    * ``('unchecked', '', 0)`` -- empty, pure-anchor, ``mailto:`` or ``ftp:``
    * ``('local', '', 0)`` -- anything that is not ``http:``/``https:``
    * ``('working', 'old', 0)`` -- already recorded in ``self.good``
    * ``('working', '', 0)`` -- request succeeded without redirect
    * ``('working', ' - unauthorized', 0)`` -- server answered 401
    * ``('broken', message, 0)`` -- recorded earlier or failed just now
    * ``('redirected', new_url, code)`` -- final URL differs from the
      requested one (ignoring trailing slashes)
    * ``('ignored', '', 0)`` -- matched one of ``self.to_ignore``

    Fresh outcomes are stored in ``self.good``, ``self.broken`` or
    ``self.redirected``.
    """
    # check for various conditions without bothering the network
    if not uri or uri[0] == '#' or uri.startswith(('mailto:', 'ftp:')):
        return 'unchecked', '', 0
    elif not uri.startswith(('http:', 'https:')):
        return 'local', '', 0
    elif uri in self.good:
        return 'working', 'old', 0
    elif uri in self.broken:
        return 'broken', self.broken[uri], 0
    elif uri in self.redirected:
        known = self.redirected[uri]
        return 'redirected', known[0], known[1]
    for rex in self.to_ignore:
        if rex.match(uri):
            return 'ignored', '', 0

    # split off the anchor ("anchor" rather than "hash" so the builtin is
    # not shadowed)
    if '#' in uri:
        req_url, anchor = uri.split('#', 1)
    else:
        req_url = uri
        anchor = None

    # handle non-ASCII URIs
    try:
        req_url.encode('ascii')
    except UnicodeError:
        split = urlsplit(req_url)
        # Each part is converted to ASCII-only *text* before joining.
        # Joining the encoded byte strings directly with the '://' and '?'
        # literals raises TypeError on Python 3.  The scheme is already
        # ASCII: only http:/https: URIs get this far.
        req_url = (
            split[0] + '://' +                           # scheme
            split[1].encode('idna').decode('ascii') +    # netloc
            quote(split[2].encode('utf-8')))             # path
        if split[3]:                                     # query
            # NOTE(review): quote() also escapes '=' and '&' in the query
            # -- kept as in the original, confirm this is intended.
            req_url += '?' + quote(split[3].encode('utf-8'))

    # need to actually check the URI
    try:
        if anchor and self.app.config.linkcheck_anchors:
            # Read the whole document and see if #anchor exists
            req = Request(req_url)
            f = opener.open(req, **kwargs)
            try:
                encoding = 'utf-8'
                if hasattr(f.headers, 'get_content_charset'):
                    encoding = f.headers.get_content_charset() or encoding
                found = check_anchor(TextIOWrapper(f, encoding),
                                     unquote(anchor))
            finally:
                # release the response even when reading/decoding fails
                f.close()
            if not found:
                raise Exception("Anchor '%s' not found" % anchor)
        else:
            try:
                # try a HEAD request, which should be easier on
                # the server and the network
                req = HeadRequest(req_url)
                f = opener.open(req, **kwargs)
                f.close()
            except HTTPError as err:
                if err.code != 405:
                    raise
                # retry with GET if that fails, some servers
                # don't like HEAD requests and reply with 405
                req = Request(req_url)
                f = opener.open(req, **kwargs)
                f.close()
    except HTTPError as err:
        if err.code == 401:
            # We'll take "Unauthorized" as working.
            self.good.add(uri)
            return 'working', ' - unauthorized', 0
        message = str(err)
        self.broken[uri] = message
        return 'broken', message, 0
    except Exception as err:
        message = str(err)
        self.broken[uri] = message
        return 'broken', message, 0

    # f.url is the address the opener finally ended up at
    if f.url.rstrip('/') == req_url.rstrip('/'):
        self.good.add(uri)
        return 'working', '', 0

    new_url = f.url
    if anchor:
        new_url += '#' + anchor
    code = getattr(req, 'redirect_code', 0)
    self.redirected[uri] = (new_url, code)
    return 'redirected', new_url, code
def check_uri():
    """Request the link ``uri`` and return ``(status, info, code)``.

    ``uri``, ``self``, ``opener`` and ``kwargs`` are not defined here and are
    taken from the surrounding scope.

    Possible results:

    * ``('working', '', 0)`` -- request succeeded without redirect
    * ``('working', ' - unauthorized', 0)`` -- server answered 401
    * ``('broken', message, 0)`` -- any other failure, including a missing
      anchor
    * ``('redirected', new_url, code)`` -- final URL differs from the
      requested one (ignoring trailing slashes)
    """
    # split off anchor
    if '#' in uri:
        req_url, anchor = uri.split('#', 1)
    else:
        req_url = uri
        anchor = None

    # handle non-ASCII URIs
    try:
        req_url.encode('ascii')
    except UnicodeError:
        req_url = encode_uri(req_url)

    try:
        if anchor and self.app.config.linkcheck_anchors and \
           not anchor.startswith('!'):
            # Read the whole document and see if #anchor exists
            # (Anchors starting with ! are ignored since they are
            # commonly used for dynamic pages)
            req = Request(req_url)
            f = opener.open(req, **kwargs)
            try:
                encoding = 'utf-8'
                if hasattr(f.headers, 'get_content_charset'):
                    encoding = f.headers.get_content_charset() or encoding
                else:
                    encoding = get_content_charset(f) or encoding
                found = check_anchor(TextIOWrapper(f, encoding),
                                     unquote(anchor))
            finally:
                # release the response even when the charset is unknown
                # or the body cannot be decoded
                f.close()
            if not found:
                raise Exception("Anchor '%s' not found" % anchor)
        else:
            try:
                # try a HEAD request, which should be easier on
                # the server and the network
                req = HeadRequest(req_url)
                f = opener.open(req, **kwargs)
                f.close()
            except HTTPError as err:
                if err.code not in (403, 405):
                    raise
                # retry with GET if that fails, some servers
                # don't like HEAD requests and reply with 403 or 405
                req = Request(req_url)
                f = opener.open(req, **kwargs)
                f.close()
    except HTTPError as err:
        if err.code == 401:
            # We'll take "Unauthorized" as working.
            return 'working', ' - unauthorized', 0
        return 'broken', str(err), 0
    except Exception as err:
        return 'broken', str(err), 0

    # f.url is the address the opener finally ended up at
    if f.url.rstrip('/') == req_url.rstrip('/'):
        return 'working', '', 0

    new_url = f.url
    if anchor:
        new_url += '#' + anchor
    code = getattr(req, 'redirect_code', 0)
    return 'redirected', new_url, code