Example No. 1
    def test_typed_dict(self):
        self.uut = Setting("key", "1, 2: t, 3")
        self.assertEqual(typed_dict(int, str, None)(self.uut),
                         {1: None, 2: "t", 3: None})

        with self.assertRaises(ValueError):
            self.uut = Setting("key", "1, a, 3")
            typed_dict(int, str, "")(self.uut)
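The converter under test takes a comma separated setting such as '1, 2: t, 3', applies the key and value types to each entry, and falls back to the given default for entries without an explicit value. Below is a minimal sketch of that parsing behaviour, assuming a plain string input rather than coala's Setting object (parse_typed_dict is a made-up name, not coala's implementation):

def parse_typed_dict(value, key_type, value_type, default):
    # Split "1, 2: t, 3" into entries, then each entry into key[: value].
    result = {}
    for entry in value.split(','):
        key, _, val = entry.partition(':')
        key, val = key.strip(), val.strip()
        # Entries without an explicit value fall back to the default.
        result[key_type(key)] = value_type(val) if val else default
    return result

print(parse_typed_dict('1, 2: t, 3', int, str, None))
# {1: None, 2: 't', 3: None}
# parse_typed_dict('1, a, 3', int, str, '') raises ValueError, like the test.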
Example No. 2
    def test_typed_dict(self):
        self.uut = Setting('key', '1, 2: t, 3')
        self.assertEqual(typed_dict(int, str, None)(self.uut),
                         {1: None, 2: 't', 3: None})

        with self.assertRaises(ValueError):
            self.uut = Setting('key', '1, a, 3')
            typed_dict(int, str, '')(self.uut)
Example No. 3
    def run(self, filename, file,
            network_timeout: typed_dict(str, int, DEFAULT_TIMEOUT) = dict(),
            link_ignore_regex: str = r'([.\/]example\.com|\{|\$)',
            link_ignore_list: typed_list(str) = ''):
        """
        Find links in any text file.

        Warning: This bear will make HEAD requests to all URLs mentioned in
        your codebase, which can potentially be destructive. As an example,
        this bear would naively just visit the URL from a line that goes like
        `do_not_ever_open = 'https://api.acme.inc/delete-all-data'` wiping out
        all your data.

        :param network_timeout:       A dict mapping URLs to the timeout used
                                      for them. Any URL sharing a host with a
                                      URL in the dict gets that timeout. A
                                      wildcard entry with the key '*' sets the
                                      timeout for every host not in the dict.
        :param link_ignore_regex:     A regex for URLs to ignore.
        :param link_ignore_list:      Comma separated URL globs to ignore.
        """
        network_timeout = {urlparse(url).netloc
                           if not url == '*' else '*': timeout
                           for url, timeout in network_timeout.items()}

        for line_number, link, code, context in self.analyze_links_in_file(
                file, network_timeout, link_ignore_regex, link_ignore_list):
            affected_code = SourceRange.from_values(filename, line_number)

            yield URLResult(self, (affected_code,), link, code, context)
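The first statement of run() rewrites the user supplied timeout dict so that full URLs are keyed by their host, while the wildcard entry '*' is kept as-is. A small illustration of what that comprehension produces (the URL and timeouts are made up):

from urllib.parse import urlparse

network_timeout = {'https://example.org/docs/index.html': 5, '*': 20}
network_timeout = {urlparse(url).netloc if not url == '*' else '*': timeout
                   for url, timeout in network_timeout.items()}
print(network_timeout)
# {'example.org': 5, '*': 20}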
Example No. 4
    def test_typed_dict(self):
        self.uut = Setting('key', '1, 2: t, 3')
        self.assertEqual(
            typed_dict(int, str, None)(self.uut), {
                1: None,
                2: 't',
                3: None
            })

        with self.assertRaises(ValueError):
            self.uut = Setting('key', '1, a, 3')
            typed_dict(int, str, '')(self.uut)

        self.assertRegex(
            repr(typed_dict(int, str, None)),
            'typed_dict\\(int, str, default=None\\) at \\(0x[a-fA-F0-9]+\\)')
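The assertRegex at the end expects typed_dict objects to have a repr that lists the annotation arguments and the object id in hex. Here is a sketch of a __repr__ that would satisfy that pattern, assuming nothing about coala's actual class beyond the regex (typed_dict_sketch is a made-up name):

class typed_dict_sketch:
    def __init__(self, key_type, value_type, default):
        self.key_type = key_type
        self.value_type = value_type
        self.default = default

    def __repr__(self):
        # e.g. 'typed_dict(int, str, default=None) at (0x7f3a2c1b9e80)'
        return 'typed_dict({}, {}, default={}) at (0x{:x})'.format(
            self.key_type.__name__, self.value_type.__name__,
            self.default, id(self))

print(repr(typed_dict_sketch(int, str, None)))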
Example No. 5
    def run(self,
            filename,
            file,
            network_timeout: typed_dict(str, int, DEFAULT_TIMEOUT) = dict(),
            link_ignore_regex: str = r'([.\/]example\.com|\{|\$)',
            link_ignore_list: typed_list(str) = ''):
        """
        Find links in any text file.

        Warning: This bear will make HEAD requests to all URLs mentioned in
        your codebase, which can potentially be destructive. As an example,
        this bear would naively just visit the URL from a line that goes like
        `do_not_ever_open = 'https://api.acme.inc/delete-all-data'` wiping out
        all your data.

        :param network_timeout:       A dict mapping URLs to the timeout used
                                      for them. Any URL sharing a host with a
                                      URL in the dict gets that timeout. A
                                      wildcard entry with the key '*' sets the
                                      timeout for every host not in the dict.
        :param link_ignore_regex:     A regex for URLs to ignore.
        :param link_ignore_list:      Comma separated URL globs to ignore.
        """
        network_timeout = {
            urlparse(url).netloc if not url == '*' else '*': timeout
            for url, timeout in network_timeout.items()
        }

        for line_number, link, code, context in self.analyze_links_in_file(
                file, network_timeout, link_ignore_regex, link_ignore_list):
            yield HiddenResult(self, [line_number, link, code, context])
Example No. 6
    def run(
            self,
            filename,
            file,
            dependency_results=dict(),
            network_timeout: typed_dict(str, int, DEFAULT_TIMEOUT) = dict(),
    ):
        """
        Find links in any text file and report their HEAD response and
        status code.

        Warning: This bear will make HEAD requests to all URLs mentioned in
        your codebase, which can potentially be destructive. As an example,
        this bear would naively just visit the URL from a line that goes like
        `do_not_ever_open = 'https://api.acme.inc/delete-all-data'` wiping out
        all your data.

        :param network_timeout: A dict mapping URLs to the timeout used for
                                them. Any URL sharing a host with a URL in
                                the dict gets that timeout. A wildcard entry
                                with the key '*' sets the timeout for every
                                host not in the dict.
        """
        network_timeout = {
            urlparse(url).netloc if not url == '*' else '*': timeout
            for url, timeout in network_timeout.items()
        }

        for result in dependency_results.get(URLBear.name, []):
            host = urlparse(result.link).netloc
            head_resp = self.get_head_response(
                result.link,
                network_timeout.get(host)
                if host in network_timeout else network_timeout.get('*')
                if '*' in network_timeout else URLHeadBear.DEFAULT_TIMEOUT)

            yield URLHeadResult(self, result.affected_code, result.link,
                                head_resp, result.link_context)
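The nested conditional expression passed to get_head_response picks the most specific timeout available: the entry for the link's host if there is one, otherwise the wildcard '*' entry, otherwise the bear's DEFAULT_TIMEOUT. The same fallback written as a small helper (a sketch only; pick_timeout is not part of the bear):

def pick_timeout(network_timeout, host, default):
    # Most specific first: exact host, then the '*' wildcard, then the default.
    if host in network_timeout:
        return network_timeout[host]
    if '*' in network_timeout:
        return network_timeout['*']
    return default

print(pick_timeout({'example.org': 5, '*': 20}, 'example.org', 10))  # 5
print(pick_timeout({'*': 20}, 'example.org', 10))                    # 20
print(pick_timeout({}, 'example.org', 10))                           # 10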
Example No. 7
    def run(self, filename, file, dependency_results=dict(),
            network_timeout: typed_dict(str, int, DEFAULT_TIMEOUT) = dict(),
            ):
        """
        Find links in any text file and report their HEAD response and
        status code.

        Warning: This bear will make HEAD requests to all URLs mentioned in
        your codebase, which can potentially be destructive. As an example,
        this bear would naively just visit the URL from a line that goes like
        `do_not_ever_open = 'https://api.acme.inc/delete-all-data'` wiping out
        all your data.

        :param network_timeout: A dict mapping URLs to the timeout used for
                                them. Any URL sharing a host with a URL in
                                the dict gets that timeout. A wildcard entry
                                with the key '*' sets the timeout for every
                                host not in the dict.
        """
        network_timeout = {urlparse(url).netloc
                           if not url == '*' else '*': timeout
                           for url, timeout in network_timeout.items()}

        for result in dependency_results.get(URLBear.name, []):
            host = urlparse(result.link).netloc
            head_resp = self.get_head_response(
                result.link,
                network_timeout.get(host)
                if host in network_timeout
                else network_timeout.get('*')
                if '*' in network_timeout
                else URLHeadBear.DEFAULT_TIMEOUT)

            yield URLHeadResult(self, result.affected_code, result.link,
                                head_resp, result.link_context)
Example No. 8
    def run(self, filename, file,
            network_timeout: typed_dict(str, int, DEFAULT_TIMEOUT) = dict(),
            link_ignore_regex: str = r'([.\/]example\.com|\{|\$)',
            link_ignore_list: typed_list(str) = '',
            follow_redirects: bool = False):
        """
        Find links in any text file and check if they are valid.

        A link is considered valid if the server responds with a 2xx code.

        This bear can automatically fix redirects, but ignores redirect
        URLs that have a huge difference with the original URL.

        Warning: This bear will make HEAD requests to all URLs mentioned in
        your codebase, which can potentially be destructive. As an example,
        this bear would naively just visit the URL from a line that goes like
        `do_not_ever_open = 'https://api.acme.inc/delete-all-data'` wiping out
        all your data.

        :param network_timeout:       A dict mapping URLs to the timeout used
                                      for them. Any URL sharing a host with a
                                      URL in the dict gets that timeout. A
                                      wildcard entry with the key '*' sets the
                                      timeout for every host not in the dict.
        :param link_ignore_regex:     A regex for URLs to ignore.
        :param link_ignore_list:      Comma separated URL globs to ignore.
        :param follow_redirects:      Set to true to autocorrect redirects.
        """
        network_timeout = {urlparse(url).netloc
                           if not url == '*' else '*': timeout
                           for url, timeout in network_timeout.items()}

        for line_number, link, code, context in self.analyze_links_in_file(
                file, network_timeout, link_ignore_regex, link_ignore_list):
            if context.xml_namespace:
                if code and 200 <= code < 300:
                    pass
                else:
                    yield Result.from_values(
                        origin=self,
                        message=('XML Namespace - '
                                 '{url}').format(url=link),
                        file=filename,
                        line=line_number,
                        severity=RESULT_SEVERITY.INFO)
            elif code is None:
                yield Result.from_values(
                    origin=self,
                    message=('Broken link - unable to connect to '
                             '{url}').format(url=link),
                    file=filename,
                    line=line_number,
                    severity=RESULT_SEVERITY.MAJOR)
            elif not 200 <= code < 300:
                # HTTP status 404, 410 or 50x
                if code in (404, 410) or 500 <= code < 600:
                    yield Result.from_values(
                        origin=self,
                        message=('Broken link - unable to connect to {url} '
                                 '(HTTP Error: {code})'
                                 ).format(url=link, code=code),
                        file=filename,
                        line=line_number,
                        severity=RESULT_SEVERITY.NORMAL)
                if follow_redirects and 300 <= code < 400:  # HTTP status 30x
                    redirect_url = requests.head(link,
                                                 allow_redirects=True).url
                    matcher = SequenceMatcher(
                        None, redirect_url, link)
                    if (matcher.real_quick_ratio() > 0.7 and
                            matcher.ratio() > 0.7):
                        diff = Diff(file)
                        current_line = file[line_number - 1]
                        start = current_line.find(link)
                        end = start + len(link)
                        replacement = current_line[:start] + \
                            redirect_url + current_line[end:]
                        diff.change_line(line_number,
                                         current_line,
                                         replacement)

                        yield Result.from_values(
                            self,
                            'This link redirects to ' + redirect_url,
                            diffs={filename: diff},
                            file=filename,
                            line=line_number,
                            severity=RESULT_SEVERITY.NORMAL)
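A redirect fix is only offered when the redirect target is textually close to the original URL: SequenceMatcher.ratio() returns a similarity in [0, 1], real_quick_ratio() is a cheap upper bound checked first, and 0.7 is the threshold used here. A quick illustration with made-up URLs:

from difflib import SequenceMatcher

link = 'http://example.org/docs'
redirect_url = 'https://example.org/docs/'
matcher = SequenceMatcher(None, redirect_url, link)
# Cheap upper bound first, then the real similarity.
print(matcher.real_quick_ratio() > 0.7 and matcher.ratio() > 0.7)  # True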
Example No. 9
    def run(self,
            filename,
            file,
            network_timeout: typed_dict(str, int, DEFAULT_TIMEOUT) = dict(),
            link_ignore_regex: str = r'([.\/]example\.com|\{|\$)',
            link_ignore_list: typed_list(str) = '',
            follow_redirects: bool = False):
        """
        Find links in any text file and check if they are valid.

        A link is considered valid if the server responds with a 2xx code.

        This bear can automatically fix redirects, but ignores redirect
        URLs that have a huge difference with the original URL.

        Warning: This bear will make HEAD requests to all URLs mentioned in
        your codebase, which can potentially be destructive. As an example,
        this bear would naively just visit the URL from a line that goes like
        `do_not_ever_open = 'https://api.acme.inc/delete-all-data'` wiping out
        all your data.

        :param network_timeout:       A dict mapping URLs to the timeout used
                                      for them. Any URL sharing a host with a
                                      URL in the dict gets that timeout. A
                                      wildcard entry with the key '*' sets the
                                      timeout for every host not in the dict.
        :param link_ignore_regex:     A regex for URLs to ignore.
        :param link_ignore_list:      Comma separated URL globs to ignore.
        :param follow_redirects:      Set to true to autocorrect redirects.
        """
        network_timeout = {
            urlparse(url).netloc if not url == '*' else '*': timeout
            for url, timeout in network_timeout.items()
        }

        for line_number, link, code in InvalidLinkBear.find_links_in_file(
                file, network_timeout, link_ignore_regex, link_ignore_list):
            if code is None:
                yield Result.from_values(
                    origin=self,
                    message=('Broken link - unable to connect to '
                             '{url}').format(url=link),
                    file=filename,
                    line=line_number,
                    severity=RESULT_SEVERITY.MAJOR)
            elif not 200 <= code < 300:
                # HTTP status 404, 410 or 50x
                if code in (404, 410) or 500 <= code < 600:
                    yield Result.from_values(
                        origin=self,
                        message=('Broken link - unable to connect to {url} '
                                 '(HTTP Error: {code})').format(url=link,
                                                                code=code),
                        file=filename,
                        line=line_number,
                        severity=RESULT_SEVERITY.NORMAL)
                if follow_redirects and 300 <= code < 400:  # HTTP status 30x
                    redirect_url = requests.head(link,
                                                 allow_redirects=True).url
                    matcher = SequenceMatcher(None, redirect_url, link)
                    if (matcher.real_quick_ratio() > 0.7
                            and matcher.ratio() > 0.7):
                        diff = Diff(file)
                        current_line = file[line_number - 1]
                        start = current_line.find(link)
                        end = start + len(link)
                        replacement = current_line[:start] + \
                            redirect_url + current_line[end:]
                        diff.change_line(line_number, current_line,
                                         replacement)

                        yield Result.from_values(
                            self,
                            'This link redirects to ' + redirect_url,
                            diffs={filename: diff},
                            file=filename,
                            line=line_number,
                            severity=RESULT_SEVERITY.NORMAL)
Example No. 10
    def run(self, filename, file, dependency_results=dict(),
            network_timeout: typed_dict(str, int, DEFAULT_TIMEOUT) = dict()):
        """
        Find http links in any text file and check whether the https version
        of each link is valid. If so, a patch is offered that replaces the
        link with https.

        An https link is considered valid if the server responds with a 2xx
        code.

        Warning: This bear will make HEAD requests to all URLs mentioned in
        your codebase, which can potentially be destructive. As an example,
        this bear would naively just visit the URL from a line that goes like
        `do_not_ever_open = 'https://api.acme.inc/delete-all-data'` wiping out
        all your data.

        :param network_timeout:       A dict mapping URLs to the timeout used
                                      for them. Any URL sharing a host with a
                                      URL in the dict gets that timeout. A
                                      wildcard entry with the key '*' sets the
                                      timeout for every host not in the dict.
        """
        # Normalise the timeout keys to their hosts once, before the loop,
        # so the comprehension does not run again on already-rewritten keys.
        network_timeout = {
            urlparse(url).netloc if not url == '*' else '*': timeout
            for url, timeout in network_timeout.items()}

        for result in dependency_results.get(URLHeadBear.name, []):
            line_number, link, code, context = result.contents
            if link.startswith(self.HTTPS_PREFIX):
                continue

            https_link = self.HTTPS_PREFIX + link[len(self.HTTP_PREFIX):]
            host = urlparse(https_link).netloc
            https_response = URLHeadBear.get_head_response(
                https_link,
                network_timeout.get(host)
                if host in network_timeout
                else network_timeout.get('*')
                if '*' in network_timeout
                else HTTPSBear.DEFAULT_TIMEOUT)

            try:
                https_code = https_response.status_code
            except AttributeError:
                continue

            if not https_code or not 200 <= https_code < 300:
                continue

            diff = Diff(file)
            current_line = file[line_number - 1]
            start = current_line.find(link)
            end = start + len(link)
            replacement = (current_line[:start] + 'https' +
                           link[len(self.HTTP_PREFIX):] + current_line[end:])
            diff.change_line(line_number, current_line, replacement)

            yield Result.from_values(
                origin=self,
                message='https can be used instead of http',
                diffs={filename: diff},
                file=filename,
                line=line_number,
                severity=RESULT_SEVERITY.NORMAL)
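The suggested fix is a one line Diff: the http link in the offending line is replaced by its https counterpart. Here is the string surgery in isolation, assuming the bear's prefixes are the bare scheme names (the sample line and link are made up):

link = 'http://example.org/downloads'
current_line = "mirror = 'http://example.org/downloads'\n"
HTTP_PREFIX, HTTPS_PREFIX = 'http', 'https'  # assumed values

start = current_line.find(link)
end = start + len(link)
replacement = (current_line[:start] + 'https' +
               link[len(HTTP_PREFIX):] + current_line[end:])
print(replacement, end='')
# mirror = 'https://example.org/downloads'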
Example No. 11
    def run(self,
            filename,
            file,
            dependency_results=dict(),
            network_timeout: typed_dict(str, int, DEFAULT_TIMEOUT) = dict()):
        """
        Find http links in any text file and check whether the https version
        of each link is valid. If so, a patch is offered that replaces the
        link with https.

        An https link is considered valid if the server responds with a 2xx
        code.

        Warning: This bear will make HEAD requests to all URLs mentioned in
        your codebase, which can potentially be destructive. As an example,
        this bear would naively just visit the URL from a line that goes like
        `do_not_ever_open = 'https://api.acme.inc/delete-all-data'` wiping out
        all your data.

        :param network_timeout:       A dict mapping URLs to the timeout used
                                      for them. Any URL sharing a host with a
                                      URL in the dict gets that timeout. A
                                      wildcard entry with the key '*' sets the
                                      timeout for every host not in the dict.
        """
        # Normalise the timeout keys to their hosts once, before the loop,
        # so the comprehension does not run again on already-rewritten keys.
        network_timeout = {
            urlparse(url).netloc if not url == '*' else '*': timeout
            for url, timeout in network_timeout.items()
        }

        for result in dependency_results.get(URLHeadBear.name, []):
            line_number, link, code, context = result.contents
            if link.startswith(self.HTTPS_PREFIX):
                continue

            https_link = self.HTTPS_PREFIX + link[len(self.HTTP_PREFIX):]
            host = urlparse(https_link).netloc
            https_response = URLHeadBear.get_head_response(
                https_link,
                network_timeout.get(host)
                if host in network_timeout else network_timeout.get('*')
                if '*' in network_timeout else HTTPSBear.DEFAULT_TIMEOUT)

            try:
                https_code = https_response.status_code
            except AttributeError:
                continue

            if not https_code or not 200 <= https_code < 300:
                continue

            diff = Diff(file)
            current_line = file[line_number - 1]
            start = current_line.find(link)
            end = start + len(link)
            replacement = (current_line[:start] + 'https' +
                           link[len(self.HTTP_PREFIX):] + current_line[end:])
            diff.change_line(line_number, current_line, replacement)

            yield Result.from_values(
                origin=self,
                message='https can be used instead of http',
                diffs={filename: diff},
                file=filename,
                line=line_number,
                severity=RESULT_SEVERITY.NORMAL)
Example No. 12
    def run(self,
            filename,
            file,
            network_timeout: typed_dict(str, int, DEFAULT_TIMEOUT) = dict(),
            link_ignore_regex: str = r'([.\/]example\.com|\{|\$)',
            link_ignore_list: typed_list(str) = DEFAULT_IGNORE,
            follow_redirects: bool = True):
        """
        Find links in any text file and check if they are archived.

        A link is considered valid if it has been archived by any of the
        services known to memento_client.

        This bear can also check the URLs that links redirect to.

        Warning: This bear will make HEAD requests to all URLs mentioned in
        your codebase, which can potentially be destructive. As an example,
        this bear would naively just visit the URL from a line that goes like
        `do_not_ever_open = 'https://api.acme.inc/delete-all-data'` wiping out
        all your data.

        :param network_timeout:    A dict mapping URLs to the timeout used
                                   for them. Any URL sharing a host with a
                                   URL in the dict gets that timeout. A
                                   wildcard entry with the key '*' sets the
                                   timeout for every host not in the dict.
        :param link_ignore_regex:  A regex for URLs to ignore.
        :param link_ignore_list:   Comma separated URL globs to ignore.
        :param follow_redirects:   Set to true to also check redirect URLs.
        """
        self._mc = MementoClient()

        network_timeout = {
            urlparse(url).netloc if not url == '*' else '*': timeout
            for url, timeout in network_timeout.items()
        }

        if link_ignore_list != self.DEFAULT_IGNORE:
            link_ignore_list.extend(self.DEFAULT_IGNORE)

        for (line_number, link, code,
             context) in self.analyze_links_in_file(file, network_timeout,
                                                    link_ignore_regex,
                                                    link_ignore_list):
            status = MementoBear.check_archive(self._mc, link)
            if not status:
                yield Result.from_values(
                    self,
                    ('This link is not archived yet, visit '
                     'https://web.archive.org/save/%s to get it archived.' %
                     link),
                    file=filename,
                    line=line_number,
                    severity=RESULT_SEVERITY.INFO)

            if follow_redirects and 300 <= code < 400:  # HTTP status 30x
                redirect_urls = MementoBear.get_redirect_urls(link)

                for url in redirect_urls:
                    status = MementoBear.check_archive(self._mc, url)
                    if not status:
                        yield Result.from_values(
                            self,
                            ('This link redirects to %s and is not archived '
                             'yet, visit https://web.archive.org/save/%s to '
                             'get it archived.' % (url, url)),
                            file=filename,
                            line=line_number,
                            severity=RESULT_SEVERITY.INFO)
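Before scanning, the bear makes sure its built-in ignore globs always apply: unless the user left the setting at its default, the defaults are appended to the user supplied list. A toy illustration (the glob values below are placeholders, not the bear's real DEFAULT_IGNORE):

DEFAULT_IGNORE = ['*.example.org/*']        # placeholder, not the real value
link_ignore_list = ['ftp://internal/*']     # user supplied setting

if link_ignore_list != DEFAULT_IGNORE:
    link_ignore_list.extend(DEFAULT_IGNORE)

print(link_ignore_list)
# ['ftp://internal/*', '*.example.org/*']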