def process_unknown_code(self, response):
    """Resolve the current shortcode when the first response was HTTP 200.

    Any status other than 200 is handed to ``BaseService``. For a 200 the
    shortcode URL (built from ``self.params['url_template']``) is requested
    again with GET and the target is scraped from the anchor carrying
    ``class="btn ignore"`` in the HTML-unescaped body.

    Returns:
        ``(URLStatus.ok, target_url, response.encoding)`` for the second
        response.

    Raises:
        UnexpectedNoResult: the second request was not HTTP 200, or the
            expected anchor was not present in its body.
    """
    if response.status_code != 200:
        return BaseService.process_unknown_code(self, response)

    retry_url = self.params['url_template'].format(
        shortcode=self.current_shortcode)
    retry = self.fetch_url(retry_url, 'get')

    if retry.status_code != 200:
        message = "Didn't get OK on second try. Got {0} for {1}".format(
            retry.status_code, self.current_shortcode)
        raise UnexpectedNoResult(message)

    # Carried over from tinyback; this path is believed to be unreachable
    # nowadays.
    page = html_unescape(retry.text)
    found = re.search("<a class=\"btn ignore\" href=\"(.*?)\" title=", page)
    if found:
        return (URLStatus.ok, found.group(1), retry.encoding)

    raise UnexpectedNoResult(
        "Didn't get match on second try for {0}".format(
            self.current_shortcode))
def process_redirect(self, response):
    """Unpack the target URL from a bit.ly HTTP 302 warning redirect.

    Responses with any status other than 302 are delegated to
    ``BaseService.process_redirect``. A 302 must point at
    ``http://bit.ly/a/warning`` with exactly one ``url`` and one ``hash``
    query parameter, and ``hash`` must equal ``self.current_shortcode``.

    Returns:
        ``(URLStatus.ok, unshortened_url, None)``.

    Raises:
        UnexpectedNoResult: the Location header is missing, points
            somewhere unexpected, has malformed query parameters, or its
            hash does not match the current shortcode.
    """
    if response.status_code != 302:
        return BaseService.process_redirect(self, response)

    if 'location' not in response.headers:
        raise UnexpectedNoResult()

    location = urlparse.urlparse(response.headers['location'])
    seen = (location.scheme, location.netloc, location.path)
    if seen != ("http", "bit.ly", "/a/warning"):
        raise UnexpectedNoResult(
            "Unexpected Location header after HTTP status 302")

    running_py2 = sys.version_info[0] == 2

    # On Python 2 the query is parsed as latin-1 bytes and the result is
    # decoded back to text just before returning.
    if running_py2:
        query = urlparse.parse_qs(location.query.encode('latin-1'))
    else:
        query = urlparse.parse_qs(location.query)

    single_url = "url" in query and len(query["url"]) == 1
    single_hash = "hash" in query and len(query["hash"]) == 1
    if not (single_url and single_hash):
        raise UnexpectedNoResult(
            "Unexpected Location header after HTTP status 302")

    if query["hash"][0] != self.current_shortcode:
        raise UnexpectedNoResult("Hash mismatch for HTTP status 302")

    unshortened_url = query["url"][0]
    if running_py2:
        unshortened_url = unshortened_url.decode('latin-1')

    return (URLStatus.ok, unshortened_url, None)
def _preview(self, code, affiliate_url):
    """Resolve *code* by scraping TinyURL's preview page.

    Fetches ``https://tinyurl.com/preview.php?num=<code>`` and reads the
    ``href`` of the ``redirecturl`` anchor.

    Args:
        code: Shortcode appended to the preview URL.
        affiliate_url: Passed through to ``self._scrub_url`` when the
            preview page carries an empty redirect URL.

    Returns:
        ``(URLStatus.ok, unescaped_url, response.encoding)``, or whatever
        ``self._scrub_url(code, affiliate_url)`` returns when the anchor's
        ``href`` is empty.

    Raises:
        UnexpectedNoResult: the preview page was not HTTP 200 or did not
            contain the expected anchor.
    """
    response = self.fetch_url("https://tinyurl.com/preview.php?num=" + code,
                              method='get')

    if response.status_code != 200:
        raise UnexpectedNoResult(
            "Unexpected HTTP status %i on preview page %s" %
            (response.status_code, response.url))

    match = re.search(
        "<a id=\"redirecturl\" href=\"(.*?)\">Proceed to this site.</a>",
        response.text, re.DOTALL)
    if not match:
        raise UnexpectedNoResult("No redirect on preview page {0}".format(
            response.url))

    url = match.group(1)
    if url == "":
        return self._scrub_url(code, affiliate_url)

    try:
        # HTMLParser.unescape() was removed in Python 3.9; html.unescape is
        # the supported replacement on Python 3.4+.
        from html import unescape
    except ImportError:
        # Python 2 has no html.unescape, so keep using the parser method.
        unescape = html_parser.HTMLParser().unescape

    return (URLStatus.ok, unescape(url), response.encoding)
def process_redirect(self, response):
    """Extract the redirect target from a response.

    Resolution order:

    1. A ``Location`` header, if present, is passed to
       ``self.check_anti_regex``.
    2. Otherwise, if ``self.params`` has a truthy ``body_regex``, the body
       is handled by ``self.process_redirect_body``.
    3. Otherwise, if ``self.tolerate_missing_location_header`` is truthy,
       the response is handled by ``self.process_no_redirect``.

    Raises:
        UnexpectedNoResult: none of the above applies.
    """
    headers = response.headers

    if 'Location' in headers:
        target = headers['Location']

        # Headers are treated as latin-1. Decoding here means unit tests
        # do not have to rely on implicit unicode conversion on Python 2.
        is_py2_bytes = (
            sys.version_info[0] == 2
            and isinstance(target, terroroftinytown.six.binary_type)
        )
        if is_py2_bytes:
            target = target.decode('latin-1')

        response.content  # consume the body so the connection can be reused
        return self.check_anti_regex(response, target, None)

    if self.params.get('body_regex'):
        return self.process_redirect_body(response)

    if self.tolerate_missing_location_header:
        response.content  # consume the body so the connection can be reused
        return self.process_no_redirect(response)

    response.content  # consume the body so the connection can be reused
    message = 'Unexpectedly did not get a redirect result for {0}'.format(
        repr(response.url))
    raise UnexpectedNoResult(message)
def _parse_errorhelp(self, response):
    """Recover the target URL from a TinyURL "errorhelp" page.

    The page is expected to hold a meta refresh pointing at
    ``http://tinyurl.com/errorb.php`` with exactly one ``url`` and one
    ``path`` query parameter, where ``path`` equals ``"/"`` plus
    ``self.current_shortcode``.

    Returns:
        ``(URLStatus.ok, result_url, encoding)``. On Python 3 the encoding
        is ``response.encoding``. On Python 2 the URL bytes are decoded as
        UTF-8, then cp1252, then latin-1, and the encoding is replaced by
        ``'cp1252'`` or ``'latin-1'`` when one of those fallbacks was used.

    Raises:
        UnexpectedNoResult: the meta refresh is missing, points somewhere
            unexpected, has malformed parameters, or names another code.
    """
    refresh = re.search(
        '<meta http-equiv="refresh" content="0;url=(.*?)">', response.text)
    if not refresh:
        raise UnexpectedNoResult(
            "No redirect on \"errorhelp\" page on HTTP status 200 for {0}"
            .format(response.url))

    unexpected_redirect = (
        "Unexpected redirect on \"errorhelp\" page on HTTP status 200 for {0}"
    )

    target = urlparse.urlparse(refresh.group(1))
    seen = (target.scheme, target.netloc, target.path)
    if seen != ("http", "tinyurl.com", "/errorb.php"):
        raise UnexpectedNoResult(unexpected_redirect.format(response.url))

    running_py2 = sys.version_info[0] == 2
    if running_py2:
        query = urlparse.parse_qs(target.query.encode('utf-8'))
    else:
        query = urlparse.parse_qs(target.query)

    single_url = "url" in query and len(query["url"]) == 1
    single_path = "path" in query and len(query["path"]) == 1
    if not (single_url and single_path):
        raise UnexpectedNoResult(unexpected_redirect.format(response.url))

    if query["path"][0] != ("/" + self.current_shortcode):
        raise UnexpectedNoResult(
            "Code mismatch on \"errorhelp\" on HTTP status 200")

    encoding = response.encoding
    raw_url = query["url"][0]

    if not running_py2:
        return (URLStatus.ok, raw_url, encoding)

    # Python 2: try progressively more permissive codecs. Only the
    # fallbacks override the encoding reported to the caller.
    for codec, reported in (('utf-8', encoding), ('cp1252', 'cp1252')):
        try:
            result_url = raw_url.decode(codec)
        except UnicodeError:
            continue
        encoding = reported
        break
    else:
        result_url = raw_url.decode('latin-1')
        encoding = 'latin-1'

    return (URLStatus.ok, result_url, encoding)
def process_redirect_body(self, response):
    """Find the redirect target inside the response body.

    Searches the HTML-unescaped body with ``self.params['body_regex']`` and
    passes the first capture group to ``self.check_anti_regex`` together
    with ``response.encoding``.

    Raises:
        UnexpectedNoResult: the pattern does not match the body.
    """
    found = re.search(self.params['body_regex'],
                      html_unescape(response.text))

    if not found:
        message = 'Unexpectedly did not get a body result for {0}'.format(
            repr(response.url))
        raise UnexpectedNoResult(message)

    return self.check_anti_regex(response, found.group(1), response.encoding)
def _parse_spam_blocklist(self, response):
    """Recover the target URL from a "This TinyURL went to" page.

    Returns:
        ``(URLStatus.ok, unescaped_url, response.encoding)`` where the URL
        is the HTML-unescaped text of the ``<p>This TinyURL went to: ...</p>``
        paragraph.

    Raises:
        UnexpectedNoResult: the paragraph is not present in the body.
    """
    match = re.search("<p>This TinyURL went to: (.*?)</p>", response.text,
                      re.DOTALL)
    if not match:
        raise UnexpectedNoResult(
            "No redirect on \"spam redirect\" page on HTTP status 200 for {0}"
            .format(response.url))

    try:
        # HTMLParser.unescape() was removed in Python 3.9; html.unescape is
        # the supported replacement on Python 3.4+.
        from html import unescape
    except ImportError:
        # Python 2 has no html.unescape, so keep using the parser method.
        unescape = html_parser.HTMLParser().unescape

    url = match.group(1)
    return (URLStatus.ok, unescape(url), response.encoding)
def _parse_tinyurl_redirect(self, response):
    """Recover the target from a "redirects back to a TinyURL" page.

    The page's intro paragraph carries the target in an anchor, optionally
    preceded by a ``<script>`` element.

    Returns:
        ``(URLStatus.ok, unescaped_url, response.encoding)`` where the URL
        is the HTML-unescaped ``href`` of that anchor.

    Raises:
        UnexpectedNoResult: the intro paragraph is not present in the body.
    """
    match = re.search(
        "<p class=\"intro\">The URL you followed redirects back to a "
        "TinyURL and therefore we can't directly send you to the site\\. "
        "The URL it redirects to is "
        "(?:<script>.*?</script>)?<a href=\"(.*?)\">",
        response.text, re.DOTALL)
    if not match:
        raise UnexpectedNoResult(
            "No redirect on \"tinyurl redirect\" page on HTTP status 200 for {0}"
            .format(response.url))

    try:
        # HTMLParser.unescape() was removed in Python 3.9; html.unescape is
        # the supported replacement on Python 3.4+.
        from html import unescape
    except ImportError:
        # Python 2 has no html.unescape, so keep using the parser method.
        unescape = html_parser.HTMLParser().unescape

    url = match.group(1)
    return (URLStatus.ok, unescape(url), response.encoding)
def process_unavailable(self, response):
    """Handle an HTTP 410 page, recovering the former target if it is shown.

    Responses with any status other than 410 are delegated to
    ``BaseService.process_unavailable``.

    Returns:
        ``(URLStatus.ok, url, response.encoding)`` when the page states
        what the short URL "was forwarding to", or
        ``(URLStatus.unavailable, None, None)`` when the page only says the
        address was removed for spamming.

    Raises:
        UnexpectedNoResult: the page contains neither the forwarding target
            nor a removed-for-spamming notice.
    """
    if response.status_code != 410:
        return BaseService.process_unavailable(self, response)

    forwarding = re.search(
        r'was forwarding to: <BR> <font color=red>(.*)</font>',
        response.text)
    if forwarding:
        return (URLStatus.ok, html_unescape(forwarding.group(1)),
                response.encoding)

    # No target on the page: accept either spelling of the spam notice.
    removed_for_spam = (
        re.search(r'This shortURL address was REMOVED for SPAMMING',
                  response.text)
        or 'REMOVED FOR SPAMMING' in response.text
    )
    if removed_for_spam:
        return URLStatus.unavailable, None, None

    raise UnexpectedNoResult(
        "Could not find target URL on blocked page for {0}"
        .format(self.current_shortcode))
def process_unknown_code(self, response):
    """Resolve the current shortcode from the "snipped URL" preview page.

    Only HTTP 200 and 500 are handled here; other statuses are delegated to
    ``BaseService``. The shortcode URL is fetched again with GET and the
    long URL is scraped from the quote block on the returned page.

    Returns:
        ``(URLStatus.ok, url, encoding)`` where ``url`` is HTML-unescaped
        with CR/LF characters removed and ``encoding`` comes from the
        second response.

    Raises:
        UnhandledStatusCode: the second request returned a status other
            than 200 or 500.
        UnexpectedNoResult: the long URL could not be located on the page.
    """
    first_status_code = response.status_code
    if first_status_code not in (200, 500):
        return BaseService.process_unknown_code(self, response)

    retry_url = self.params['url_template'].format(
        shortcode=self.current_shortcode)
    retry = self.fetch_url(retry_url, 'get')

    second_status_code = retry.status_code
    if second_status_code not in (200, 500):
        raise UnhandledStatusCode(
            "HTTP status changed from %s to %i on second request for %s"
            % (first_status_code, second_status_code,
               self.current_shortcode))

    pattern = (
        "<p>You clicked on a snipped URL, which will take you to the "
        "following looong URL: </p> <div class=\"quote\">"
        "<span class=\"quotet\"></span><br/>(.*?)</div> <br />"
    )
    body = retry.text

    # Three attempts, stopping at the first hit: the body as-is, the body
    # with "<br />" line breaks stripped, then the body with DOTALL.
    found = (
        re.search(pattern, body)
        or re.search(pattern, body.replace("<br />\n", ""))
        or re.search(pattern, body, re.DOTALL)
    )
    if not found:
        raise UnexpectedNoResult(
            "Could not find target URL on preview page for {0}".format(
                self.current_shortcode))

    long_url = html_unescape(found.group(1))
    long_url = long_url.replace('\n', '').replace('\r', '')
    return URLStatus.ok, long_url, retry.encoding
def process_redirect(self, response):
    """Scrape the target URL from a meta-refresh / frame / canonical page.

    Returns:
        ``(URLStatus.unavailable, None, None)`` when the body contains any
        of the known abuse/removal markers;
        ``(URLStatus.not_found, None, None)`` when the body is blank;
        otherwise ``(URLStatus.ok, link, response.encoding)`` for the first
        extracted link that does not contain ``ad.adjix.com``, or for the
        first extracted link when every one of them does.

    Raises:
        UnexpectedNoResult: no link could be extracted from the body.
    """
    body = response.text

    removal_markers = (
        '<title>Spammer</title>',
        '<title>Phisher</title>',
        'It has automatically been terminated.',
        'This link was created by a spammer',
        'This link was created by an unknown spammer',
        'This link was abused by',
        '<title>Abuse</title>',
        '<title>Link Removed</title>',
        '<title>Phishing Link</title>',
        '<title>TOS</title>',
    )
    if any(marker in body for marker in removal_markers):
        return (URLStatus.unavailable, None, None)

    if not body.strip():
        return (URLStatus.not_found, None, None)

    captures = re.findall((r'CONTENT="\d+;URL=(.*)(?:\r\n|">)|'
                           '<frame src="(.*)(?:\r\n|">)|'
                           'rel="canonical" href="(.*)"/>'),
                          body)

    # Prefer a link that is not an ad.adjix.com one; remember the first
    # link as a fallback in case every candidate is.
    fallback = None
    for refresh_url, frame_url, canonical_url in captures:
        link = html_unescape(refresh_url or frame_url or canonical_url)
        if 'ad.adjix.com' not in link:
            return (URLStatus.ok, link, response.encoding)
        if fallback is None:
            fallback = link

    if fallback is not None:
        return (URLStatus.ok, fallback, response.encoding)

    raise UnexpectedNoResult("Didn't get anything for {0}".format(
        self.current_shortcode))