def _preview(self, code, affiliate_url):
    """Resolve a shortcode by scraping the TinyURL preview page.

    Args:
        code: Shortcode appended to the ``preview.php?num=`` query string.
        affiliate_url: Forwarded to ``self._scrub_url`` when the preview
            page contains an empty redirect target.

    Returns:
        Whatever ``self._scrub_url(code, affiliate_url)`` returns when the
        matched ``href`` is empty; otherwise the tuple
        ``(URLStatus.ok, unescaped_url, response.encoding)``.

    Raises:
        UnexpectedNoResult: If the HTTP status is not 200, or the
            "Proceed to this site" anchor is not found in the body.
    """
    response = self.fetch_url(
        "https://tinyurl.com/preview.php?num=" + code, method='get')

    if response.status_code != 200:
        raise UnexpectedNoResult(
            "Unexpected HTTP status %i on preview page %s"
            % (response.status_code, response.url))

    match = re.search(
        "<a id=\"redirecturl\" href=\"(.*?)\">Proceed to this site.</a>",
        response.text, re.DOTALL)
    if not match:
        raise UnexpectedNoResult("No redirect on preview page {0}".format(
            response.url))

    url = match.group(1)
    if url == "":
        return self._scrub_url(code, affiliate_url)

    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is
    # the supported replacement. Keep the parser method only as a fallback
    # for interpreters that do not ship html.unescape().
    try:
        from html import unescape
    except ImportError:
        unescape = html_parser.HTMLParser().unescape

    return (URLStatus.ok, unescape(url), response.encoding)
def _parse_spam_blocklist(self, response):
    """Extract the original target from a "spam redirect" page.

    Args:
        response: Fetched page; ``text``, ``url`` and ``encoding`` are read.

    Returns:
        The tuple ``(URLStatus.ok, unescaped_url, response.encoding)``.

    Raises:
        UnexpectedNoResult: If the "This TinyURL went to:" paragraph is
            not present in the body.
    """
    match = re.search("<p>This TinyURL went to: (.*?)</p>", response.text,
                      re.DOTALL)
    if not match:
        raise UnexpectedNoResult(
            "No redirect on \"spam redirect\" page on HTTP status 200 for {0}"
            .format(response.url))

    url = match.group(1)

    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is
    # the supported replacement. Keep the parser method only as a fallback
    # for interpreters that do not ship html.unescape().
    try:
        from html import unescape
    except ImportError:
        unescape = html_parser.HTMLParser().unescape

    return (URLStatus.ok, unescape(url), response.encoding)
def parse_preview(self, response):
    """Extract the destination URL from a 'Preview' page.

    The response encoding is forced to UTF-8 before the body is read.

    Args:
        response: Fetched page; its ``encoding`` attribute is overwritten
            and its ``text`` is searched.

    Returns:
        The tuple ``(URLStatus.ok, unescaped_url, response.encoding)``.

    Raises:
        errors.UnexpectedNoResult: If the "biglink" anchor is not found.
    """
    response.encoding = 'utf-8'

    match = re.search(
        "<b>Click the link</b> if you'd like to proceed to the "
        "destination shown: -<br /><a href=\"(.*)\" class=\"biglink\">",
        response.text)
    if not match:
        raise errors.UnexpectedNoResult(
            "Could not find target URL in 'Preview' page")

    url = match.group(1)

    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is
    # the supported replacement. Keep the parser method only as a fallback
    # for interpreters that do not ship html.unescape().
    try:
        from html import unescape
    except ImportError:
        unescape = html_parser.HTMLParser().unescape

    return (URLStatus.ok, unescape(url), response.encoding)
def _parse_tinyurl_redirect(self, response):
    """Extract the target from a "redirects back to a TinyURL" page.

    Args:
        response: Fetched page; ``text``, ``url`` and ``encoding`` are read.

    Returns:
        The tuple ``(URLStatus.ok, unescaped_url, response.encoding)``.

    Raises:
        UnexpectedNoResult: If the intro paragraph with the target anchor
            is not present in the body.
    """
    match = re.search(
        "<p class=\"intro\">The URL you followed redirects back to a "
        "TinyURL and therefore we can't directly send you to the site\\. "
        "The URL it redirects to is (?:<script>.*?</script>)?"
        "<a href=\"(.*?)\">",
        response.text, re.DOTALL)
    if not match:
        raise UnexpectedNoResult(
            "No redirect on \"tinyurl redirect\" page on HTTP status 200 for {0}"
            .format(response.url))

    url = match.group(1)

    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is
    # the supported replacement. Keep the parser method only as a fallback
    # for interpreters that do not ship html.unescape().
    try:
        from html import unescape
    except ImportError:
        unescape = html_parser.HTMLParser().unescape

    return (URLStatus.ok, unescape(url), response.encoding)
def parse_blocked(self, response):
    """Extract the original destination from a 'Link Disabled' page.

    The response encoding is forced to UTF-8 before the body is read.

    Args:
        response: Fetched page; its ``encoding`` attribute is overwritten
            and its ``text`` is searched.

    Returns:
        ``(URLStatus.unavailable, None, None)`` when the page lists an
        empty destination; otherwise
        ``(URLStatus.ok, unescaped_url, response.encoding)``.

    Raises:
        errors.UnexpectedNoResult: If the reference paragraph is not found.
    """
    response.encoding = 'utf-8'

    # Raw strings: "\(" and "\." are regex escapes, not string escapes, and
    # are invalid escape sequences in a normal string literal.
    match = re.search(
        r"<p>For reference and to help those fighting spam the original "
        r"destination of this URL is given below \(we strongly recommend "
        r"you don't visit it since it may damage your PC\): -<br />(.*)</p>"
        r"<h2>is\.gd</h2><p>is\.gd is a free service used to shorten long "
        r"URLs\.",
        response.text)
    if not match:
        raise errors.UnexpectedNoResult(
            "Could not find target URL in 'Link Disabled' page")

    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is
    # the supported replacement. Keep the parser method only as a fallback
    # for interpreters that do not ship html.unescape().
    try:
        from html import unescape
    except ImportError:
        unescape = html_parser.HTMLParser().unescape

    url = unescape(match.group(1))
    if url == "":
        return (URLStatus.unavailable, None, None)

    return (URLStatus.ok, url, response.encoding)
) def process_connection_error(self, exception): ex_args = repr(exception.args) if 'ProtocolError' in ex_args or 'Invalid IPv6 URL' in ex_args: raise MalformedResponse( 'Malformed response: {0}'.format(ex_args)) else: raise PleaseRetry('Connection error: {0}'.format(ex_args)) def check_anti_regex(self, response, result_url, encoding): if not result_url or self.matches_anti_regex(result_url): return self.process_no_redirect(response) else: return (URLStatus.ok, result_url, encoding) def matches_anti_regex(self, result_url): anti_regex = self.params.get('location_anti_regex') return (anti_regex and re.search(anti_regex, result_url)) class DefaultService(BaseService): pass _html_parser_unescaper = html_parser.HTMLParser() def html_unescape(text): return _html_parser_unescaper.unescape(text)