示例#1
0
    def _preview(self, code, affiliate_url):
        """Resolve a shortcode through the TinyURL preview page.

        Fetches ``preview.php`` for *code* and extracts the target of the
        "Proceed to this site." link.

        Returns:
            The result of ``self._scrub_url(code, affiliate_url)`` when the
            link target is empty; otherwise a ``(URLStatus.ok, url,
            response.encoding)`` tuple with HTML character references in
            the URL decoded.

        Raises:
            UnexpectedNoResult: the page did not answer with HTTP 200, or no
                redirect link was found in it.
        """
        response = self.fetch_url("https://tinyurl.com/preview.php?num=" +
                                  code,
                                  method='get')

        if response.status_code != 200:
            raise UnexpectedNoResult(
                "Unexpected HTTP status %i on preview page %s" %
                (response.status_code, response.url))

        match = re.search(
            "<a id=\"redirecturl\" href=\"(.*?)\">Proceed to this site.</a>",
            response.text, re.DOTALL)

        if not match:
            raise UnexpectedNoResult("No redirect on preview page {0}".format(
                response.url))

        url = match.group(1)

        if not url:
            # Empty href on the preview page: hand over to the scrub path.
            return self._scrub_url(code, affiliate_url)

        # HTMLParser.unescape() was removed in Python 3.9; html.unescape is
        # its replacement. Keep the parser method for old interpreters only.
        try:
            from html import unescape
        except ImportError:
            unescape = html_parser.HTMLParser().unescape

        return (URLStatus.ok, unescape(url), response.encoding)
示例#2
0
    def _parse_spam_blocklist(self, response):
        """Extract the destination from a "This TinyURL went to: ..." page.

        Returns:
            A ``(URLStatus.ok, url, response.encoding)`` tuple with HTML
            character references in the URL decoded.

        Raises:
            UnexpectedNoResult: the expected paragraph is not present in
                ``response.text``.
        """
        match = re.search("<p>This TinyURL went to: (.*?)</p>", response.text,
                          re.DOTALL)

        if not match:
            raise UnexpectedNoResult(
                "No redirect on \"spam redirect\" page on HTTP status 200 for {0}"
                .format(response.url))

        url = match.group(1)

        # HTMLParser.unescape() was removed in Python 3.9; html.unescape is
        # its replacement. Keep the parser method for old interpreters only.
        try:
            from html import unescape
        except ImportError:
            unescape = html_parser.HTMLParser().unescape

        return (URLStatus.ok, unescape(url), response.encoding)
示例#3
0
    def parse_preview(self, response):
        """Extract the destination URL from a 'Preview' page.

        Sets ``response.encoding`` to ``'utf-8'`` before reading
        ``response.text`` (presumably so the body is decoded as UTF-8;
        confirm against the response type in use).

        Returns:
            A ``(URLStatus.ok, url, response.encoding)`` tuple with HTML
            character references in the URL decoded.

        Raises:
            errors.UnexpectedNoResult: the "biglink" anchor was not found.
        """
        response.encoding = 'utf-8'

        match = re.search(
            "<b>Click the link</b> if you'd like to proceed to the destination shown: -<br /><a href=\"(.*)\" class=\"biglink\">",
            response.text)
        if not match:
            raise errors.UnexpectedNoResult(
                "Could not find target URL in 'Preview' page")

        url = match.group(1)

        # HTMLParser.unescape() was removed in Python 3.9; html.unescape is
        # its replacement. Keep the parser method for old interpreters only.
        try:
            from html import unescape
        except ImportError:
            unescape = html_parser.HTMLParser().unescape

        return (URLStatus.ok, unescape(url), response.encoding)
示例#4
0
    def _parse_tinyurl_redirect(self, response):
        """Extract the target from a "redirects back to a TinyURL" page.

        The pattern tolerates an optional ``<script>...</script>`` element
        between the intro text and the anchor.

        Returns:
            A ``(URLStatus.ok, url, response.encoding)`` tuple with HTML
            character references in the URL decoded.

        Raises:
            UnexpectedNoResult: the intro paragraph and anchor were not
                found in ``response.text``.
        """
        match = re.search(
            "<p class=\"intro\">The URL you followed redirects back to a TinyURL and therefore we can't directly send you to the site\\. The URL it redirects to is (?:<script>.*?</script>)?<a href=\"(.*?)\">",
            response.text, re.DOTALL)

        if not match:
            raise UnexpectedNoResult(
                "No redirect on \"tinyurl redirect\" page on HTTP status 200 for {0}"
                .format(response.url))

        url = match.group(1)

        # HTMLParser.unescape() was removed in Python 3.9; html.unescape is
        # its replacement. Keep the parser method for old interpreters only.
        try:
            from html import unescape
        except ImportError:
            unescape = html_parser.HTMLParser().unescape

        return (URLStatus.ok, unescape(url), response.encoding)
示例#5
0
    def parse_blocked(self, response):
        """Extract the original destination from a 'Link Disabled' page.

        Sets ``response.encoding`` to ``'utf-8'`` before reading
        ``response.text`` (presumably so the body is decoded as UTF-8;
        confirm against the response type in use).

        Returns:
            ``(URLStatus.unavailable, None, None)`` when the page lists an
            empty destination; otherwise ``(URLStatus.ok, url,
            response.encoding)`` with HTML character references decoded.

        Raises:
            errors.UnexpectedNoResult: the reference paragraph was not found.
        """
        response.encoding = 'utf-8'

        # Raw string: the pattern contains regex escapes (\( \) \.) that are
        # invalid escape sequences in an ordinary string literal.
        match = re.search(
            r"<p>For reference and to help those fighting spam the original destination of this URL is given below \(we strongly recommend you don't visit it since it may damage your PC\): -<br />(.*)</p><h2>is\.gd</h2><p>is\.gd is a free service used to shorten long URLs\.",
            response.text)
        if not match:
            raise errors.UnexpectedNoResult(
                "Could not find target URL in 'Link Disabled' page")

        # HTMLParser.unescape() was removed in Python 3.9; html.unescape is
        # its replacement. Keep the parser method for old interpreters only.
        try:
            from html import unescape
        except ImportError:
            unescape = html_parser.HTMLParser().unescape

        url = unescape(match.group(1))
        if not url:
            return (URLStatus.unavailable, None, None)
        return (URLStatus.ok, url, response.encoding)
示例#6
0
        )

    def process_connection_error(self, exception):
        """Convert a connection-level exception into a service error.

        This method always raises. The decision is made on the ``repr`` of
        ``exception.args``:

        Raises:
            MalformedResponse: the repr mentions ``ProtocolError`` or
                ``Invalid IPv6 URL``.
            PleaseRetry: any other connection error.
        """
        details = repr(exception.args)
        malformed_markers = ('ProtocolError', 'Invalid IPv6 URL')

        if any(marker in details for marker in malformed_markers):
            raise MalformedResponse('Malformed response: {0}'.format(details))

        raise PleaseRetry('Connection error: {0}'.format(details))

    def check_anti_regex(self, response, result_url, encoding):
        """Accept *result_url* unless it is empty or hits the anti-regex.

        Returns ``(URLStatus.ok, result_url, encoding)`` for a usable URL;
        otherwise whatever ``self.process_no_redirect(response)`` returns.
        """
        usable = result_url and not self.matches_anti_regex(result_url)
        if usable:
            return (URLStatus.ok, result_url, encoding)

        return self.process_no_redirect(response)

    def matches_anti_regex(self, result_url):
        """Search *result_url* with the ``location_anti_regex`` parameter.

        Returns the ``re.search`` result (a match object or ``None``) when
        the parameter is set. When it is missing or falsy, that falsy value
        itself is returned.
        """
        pattern = self.params.get('location_anti_regex')
        if not pattern:
            # No pattern configured: hand back the falsy value unchanged.
            return pattern

        return re.search(pattern, result_url)


class DefaultService(BaseService):
    """Service that uses the BaseService behavior without any overrides."""


# Single module-level HTMLParser instance, created once at import time and
# referenced by html_unescape() below.
_html_parser_unescaper = html_parser.HTMLParser()


def html_unescape(text):
    """Return *text* with HTML character references decoded.

    For example, ``"&amp;"`` becomes ``"&"``.

    Uses ``html.unescape`` where it exists. ``HTMLParser.unescape()`` was
    removed in Python 3.9, so the shared parser instance is consulted only
    on interpreters that lack ``html.unescape``.
    """
    try:
        from html import unescape
    except ImportError:
        # Legacy interpreters: fall back to the module-level parser.
        unescape = _html_parser_unescaper.unescape
    return unescape(text)