예제 #1
0
    def __init__(self, source, modname, srcname, decoded=False):
        """Record the module identity and cache its source text.

        ``source`` is a file-like object yielding source lines, ``modname``
        the name of the module and ``srcname`` the name of the source file.

        When ``decoded`` is false, the encoding is obtained by passing
        ``source.readline`` to ``detect_encoding``, the content is decoded
        with it, and ``self.source`` is replaced by a ``TextIOWrapper``
        around the stream.  When ``decoded`` is true, the content is cached
        exactly as read and ``self.encoding`` is ``None``.  Either way the
        stream is moved back to the position it had on entry.
        """
        self.modname = modname
        self.srcname = srcname
        self.source = source

        # remember the entry position so the stream can be rewound after
        # its content has been cached
        start = source.tell()
        if decoded:
            # content is taken as-is; there is no encoding to report
            self.encoding = None
            self.code = source.read()
            source.seek(start)
        else:
            encoding = detect_encoding(source.readline)
            self.encoding = encoding
            # detection consumed part of the stream: rewind before reading
            source.seek(start)
            self.code = source.read().decode(encoding)
            source.seek(start)
            self.source = TextIOWrapper(source, encoding)

        # results of later analysis steps, unset until the step has run
        self.tokens = None      # filled by tokenize()
        self.parsetree = None   # filled by parse()
        self.attr_docs = None   # filled by find_attr_docs()
        self.tagorder = None    # filled by find_attr_docs()
        self.tags = None        # filled by find_tags()
예제 #2
0
        def check():
            """Classify ``uri`` and, if necessary, probe it over HTTP.

            Every ``return`` yields a three-item tuple: a status string
            (``'unchecked'``, ``'local'``, ``'working'``, ``'broken'``,
            ``'redirected'`` or ``'ignored'``), a detail value and a numeric
            code.  A failed request is recorded in ``self.broken`` so that a
            later call for the same URI is answered without a new request.

            NOTE(review): as visible here nothing is returned after a
            successful request; the handling of that case is presumably
            outside this excerpt -- confirm against the full file.
            """
            # check for various conditions without bothering the network
            if not uri or uri.startswith(('#', 'mailto:', 'ftp:')):
                return 'unchecked', '', 0
            elif not uri.startswith(('http:', 'https:')):
                return 'local', '', 0
            elif uri in self.good:
                return 'working', '', 0
            elif uri in self.broken:
                return 'broken', self.broken[uri], 0
            elif uri in self.redirected:
                return 'redirected', self.redirected[uri][0], self.redirected[
                    uri][1]
            for rex in self.to_ignore:
                if rex.match(uri):
                    return 'ignored', '', 0

            # split off the anchor; only the part before '#' is requested
            if '#' in uri:
                req_url, anchor = uri.split('#', 1)
            else:
                req_url = uri
                anchor = None

            # need to actually check the URI
            try:
                if anchor and self.app.config.linkcheck_anchors:
                    # Read the whole document and see if #anchor exists
                    req = Request(req_url)
                    f = opener.open(req, **kwargs)
                    try:
                        encoding = 'utf-8'
                        if hasattr(f.headers, 'get_content_charset'):
                            encoding = (f.headers.get_content_charset() or
                                        encoding)
                        found = check_anchor(TextIOWrapper(f, encoding),
                                             unquote(anchor))
                    finally:
                        # close the response even if reading it fails
                        f.close()

                    if not found:
                        raise Exception("Anchor '%s' not found" % anchor)
                else:
                    try:
                        # try a HEAD request, which should be easier on
                        # the server and the network
                        req = HeadRequest(req_url)
                        f = opener.open(req, **kwargs)
                        f.close()
                    except HTTPError as err:
                        if err.code != 405:
                            raise
                        # retry with GET if that fails, some servers
                        # don't like HEAD requests and reply with 405
                        req = Request(req_url)
                        f = opener.open(req, **kwargs)
                        f.close()

            except Exception as err:
                # remember the failure so the URI is not requested again
                self.broken[uri] = str(err)
                return 'broken', str(err), 0
예제 #3
0
파일: linkcheck.py 프로젝트: th0/test2
        def check():
            """Classify ``uri``, contacting the server only when needed.

            Returns a ``(status, info, code)`` tuple:

            * ``'unchecked'`` for empty, ``#``-only, ``mailto:`` and
              ``ftp:`` URIs, ``'local'`` for anything else that is not
              ``http:``/``https:``;
            * the remembered answer when ``uri`` is already in
              ``self.good``, ``self.broken`` or ``self.redirected``;
            * ``'ignored'`` when a pattern in ``self.to_ignore`` matches;
            * otherwise the outcome of a request: ``'working'``,
              ``'broken'`` (``info`` is the error text) or ``'redirected'``
              (``info`` is the final URL, ``code`` the request's
              ``redirect_code`` if it has one, else 0).

            Request outcomes are stored in the corresponding ``self``
            container.  A 401 response is counted as working.
            """
            # check for various conditions without bothering the network
            if not uri or uri.startswith(('#', 'mailto:', 'ftp:')):
                return 'unchecked', '', 0
            elif not uri.startswith(('http:', 'https:')):
                return 'local', '', 0
            elif uri in self.good:
                return 'working', 'old', 0
            elif uri in self.broken:
                return 'broken', self.broken[uri], 0
            elif uri in self.redirected:
                return 'redirected', self.redirected[uri][0], self.redirected[
                    uri][1]
            for rex in self.to_ignore:
                if rex.match(uri):
                    return 'ignored', '', 0

            # split off anchor
            if '#' in uri:
                req_url, anchor = uri.split('#', 1)
            else:
                req_url = uri
                anchor = None

            # handle non-ASCII URIs
            try:
                req_url.encode('ascii')
            except UnicodeError:
                split = urlsplit(req_url)
                # Build the result as text throughout: the IDNA-encoded host
                # is decoded back right away and quote() returns text, so no
                # bytes/str mixing happens on Python 3.
                # NOTE(review): quote() also escapes '=' and '&' in the
                # query (only '/' is safe by default) -- confirm intended.
                req_url = (
                    split[0] + '://' +  # scheme
                    split[1].encode('idna').decode('ascii') +  # netloc
                    quote(split[2].encode('utf-8')))  # path
                if split[3]:  # query
                    req_url += '?' + quote(split[3].encode('utf-8'))

            # need to actually check the URI
            try:
                if anchor and self.app.config.linkcheck_anchors:
                    # Read the whole document and see if #anchor exists
                    req = Request(req_url)
                    f = opener.open(req, **kwargs)
                    try:
                        encoding = 'utf-8'
                        if hasattr(f.headers, 'get_content_charset'):
                            encoding = (f.headers.get_content_charset() or
                                        encoding)
                        found = check_anchor(TextIOWrapper(f, encoding),
                                             unquote(anchor))
                    finally:
                        # close the response even if reading it fails
                        f.close()

                    if not found:
                        raise Exception("Anchor '%s' not found" % anchor)
                else:
                    try:
                        # try a HEAD request, which should be easier on
                        # the server and the network
                        req = HeadRequest(req_url)
                        f = opener.open(req, **kwargs)
                        f.close()
                    except HTTPError as err:
                        if err.code != 405:
                            raise
                        # retry with GET if that fails, some servers
                        # don't like HEAD requests and reply with 405
                        req = Request(req_url)
                        f = opener.open(req, **kwargs)
                        f.close()
            except HTTPError as err:
                if err.code == 401:
                    # We'll take "Unauthorized" as working.
                    self.good.add(uri)
                    return 'working', ' - unauthorized', 0
                else:
                    self.broken[uri] = str(err)
                    return 'broken', str(err), 0
            except Exception as err:
                self.broken[uri] = str(err)
                return 'broken', str(err), 0

            # the final URL differing from the requested one (ignoring a
            # trailing slash) means the server redirected us
            if f.url.rstrip('/') == req_url.rstrip('/'):
                self.good.add(uri)
                return 'working', '', 0
            else:
                new_url = f.url
                if anchor:
                    new_url += '#' + anchor
                code = getattr(req, 'redirect_code', 0)
                self.redirected[uri] = (new_url, code)
                return 'redirected', new_url, code
예제 #4
0
        def check_uri():
            """Request ``uri`` and report the outcome.

            Returns a ``(status, info, code)`` tuple: ``'working'`` when the
            request succeeds without redirection (or fails with 401),
            ``'broken'`` with the error text when it fails, and
            ``'redirected'`` with the final URL and the request's
            ``redirect_code`` (0 if it has none) when the final URL differs
            from the requested one.

            When ``uri`` has an anchor, ``linkcheck_anchors`` is enabled and
            the anchor does not start with ``!``, the document is fetched
            and passed to ``check_anchor``; otherwise a HEAD request is
            tried first, with a GET retry on 403/405.
            """
            # split off anchor
            if '#' in uri:
                req_url, anchor = uri.split('#', 1)
            else:
                req_url = uri
                anchor = None

            # handle non-ASCII URIs
            try:
                req_url.encode('ascii')
            except UnicodeError:
                req_url = encode_uri(req_url)

            try:
                if anchor and self.app.config.linkcheck_anchors and \
                   not anchor.startswith('!'):
                    # Read the whole document and see if #anchor exists
                    # (Anchors starting with ! are ignored since they are
                    # commonly used for dynamic pages)
                    req = Request(req_url)
                    f = opener.open(req, **kwargs)
                    try:
                        encoding = 'utf-8'
                        if hasattr(f.headers, 'get_content_charset'):
                            encoding = (f.headers.get_content_charset() or
                                        encoding)
                        else:
                            encoding = get_content_charset(f) or encoding
                        found = check_anchor(TextIOWrapper(f, encoding),
                                             unquote(anchor))
                    finally:
                        # close the response even if the charset lookup or
                        # the anchor search raises
                        f.close()

                    if not found:
                        raise Exception("Anchor '%s' not found" % anchor)
                else:
                    try:
                        # try a HEAD request, which should be easier on
                        # the server and the network
                        req = HeadRequest(req_url)
                        f = opener.open(req, **kwargs)
                        f.close()
                    except HTTPError as err:
                        if err.code not in (403, 405):
                            raise
                        # retry with GET if that fails, some servers
                        # don't like HEAD requests and reply with 403 or 405
                        req = Request(req_url)
                        f = opener.open(req, **kwargs)
                        f.close()
            except HTTPError as err:
                if err.code == 401:
                    # We'll take "Unauthorized" as working.
                    return 'working', ' - unauthorized', 0
                else:
                    return 'broken', str(err), 0
            except Exception as err:
                return 'broken', str(err), 0

            # the final URL differing from the requested one (ignoring a
            # trailing slash) means the server redirected us
            if f.url.rstrip('/') == req_url.rstrip('/'):
                return 'working', '', 0
            else:
                new_url = f.url
                if anchor:
                    new_url += '#' + anchor
                code = getattr(req, 'redirect_code', 0)
                return 'redirected', new_url, code