def test_split_header_words(self):
    from mechanize._headersutil import split_header_words

    tests = [
        ("foo", [[("foo", None)]]),
        ("foo=bar", [[("foo", "bar")]]),
        (" foo ", [[("foo", None)]]),
        (" foo= ", [[("foo", "")]]),
        (" foo=", [[("foo", "")]]),
        (" foo= ; ", [[("foo", "")]]),
        (" foo= ; bar= baz ", [[("foo", ""), ("bar", "baz")]]),
        ("foo=bar bar=baz", [[("foo", "bar"), ("bar", "baz")]]),
        # doesn't really matter if this next fails, but it works ATM
        ("foo= bar=baz", [[("foo", "bar=baz")]]),
        ("foo=bar;bar=baz", [[("foo", "bar"), ("bar", "baz")]]),
        ('foo bar baz', [[("foo", None), ("bar", None), ("baz", None)]]),
        ("a, b, c", [[("a", None)], [("b", None)], [("c", None)]]),
        (r'foo; bar=baz, spam=, foo="\,\;\"", bar= ',
         [[("foo", None), ("bar", "baz")],
          [("spam", "")],
          [("foo", ',;"')],
          [("bar", "")]]),
    ]

    for arg, expect in tests:
        try:
            result = split_header_words([arg])
        except Exception:
            import traceback
            from io import StringIO
            f = StringIO()
            traceback.print_exc(None, f)
            result = "(error -- traceback follows)\n\n%s" % f.getvalue()
        assert result == expect, """
When parsing: '%s'
Expected:     '%s'
Got:          '%s'
""" % (arg, expect, result)
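# A quick sketch of what split_header_words returns, shown doctest-style.
# The expected outputs mirror the test expectations above (assumes mechanize
# is importable): parameters separated by ';' stay in one inner list, while
# ',' starts a new one.
#
#     >>> from mechanize._headersutil import split_header_words
#     >>> split_header_words(['foo=bar;bar=baz'])
#     [[('foo', 'bar'), ('bar', 'baz')]]
#     >>> split_header_words(['a, b, c'])
#     [[('a', None)], [('b', None)], [('c', None)]]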
def test_roundtrip(self):
    from mechanize._headersutil import split_header_words, join_header_words

    tests = [
        ("foo", "foo"),
        ("foo=bar", "foo=bar"),
        (" foo ", "foo"),
        ("foo=", 'foo=""'),
        ("foo=bar bar=baz", "foo=bar; bar=baz"),
        ("foo=bar;bar=baz", "foo=bar; bar=baz"),
        ('foo bar baz', "foo; bar; baz"),
        (r'foo="\"" bar="\\"', r'foo="\""; bar="\\"'),
        ('foo,,,bar', 'foo, bar'),
        ('foo=bar,bar=baz', 'foo=bar, bar=baz'),
        ('text/html; charset=iso-8859-1', 'text/html; charset="iso-8859-1"'),
        ('foo="bar"; port="80,81"; discard, bar=baz',
         'foo=bar; port="80,81"; discard, bar=baz'),
        (r'Basic realm="\"foo\\\\bar\""', r'Basic; realm="\"foo\\\\bar\""'),
    ]

    for arg, expect in tests:
        input = split_header_words([arg])
        res = join_header_words(input)
        assert res == expect, """
When parsing: '%s'
Expected:     '%s'
Got:          '%s'
Input was:    '%s'
""" % (arg, expect, res, input)
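# A minimal round-trip sketch using the same pair of functions; the output
# mirrors one of the test expectations above (assumes mechanize is
# importable):
#
#     >>> from mechanize._headersutil import split_header_words, join_header_words
#     >>> parsed = split_header_words(['text/html; charset=iso-8859-1'])
#     >>> join_header_words(parsed)
#     'text/html; charset="iso-8859-1"'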
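# `HeadRequest` is used below but not defined in this excerpt. A minimal
# sketch (an assumption, not necessarily the definition used here) subclasses
# urllib2.Request so the opener sends an HTTP HEAD instead of a GET:
import urllib2


class HeadRequest(urllib2.Request):
    """Request subclass that makes the opener send an HTTP HEAD request."""

    def get_method(self):
        # urllib2 calls get_method() to decide which HTTP verb to send
        return 'HEAD'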
def urlinfo(self, url, maxback=2):
    if urlparse.urlsplit(url).netloc == 'mobile.twitter.com':
        url = url.replace('mobile.twitter.com', 'twitter.com', 1)
    try:
        r = self.openurl(HeadRequest(url), _tries=2, _delay=0.2)
        body = False
    except BrowserUnavailable as e:
        if u'HTTP Error 501' in unicode(e) or u'HTTP Error 405' in unicode(e):
            # server does not support HEAD; fall back to a full GET
            r = self.openurl(url, _tries=2, _delay=0.2)
            body = True
        elif u'HTTP Error 404' in unicode(e) \
                and maxback and not url[-1].isalnum():
            # retry without the trailing punctuation character
            return self.urlinfo(url[:-1], maxback - 1)
        else:
            raise
    headers = r.info()
    content_type = headers.get('Content-Type')
    try:
        size = int(headers.get('Content-Length'))
        hsize = self.human_size(size)
    except TypeError:
        size = None
        hsize = None
    is_html = headersutil.is_html([content_type], url, True)
    title = None
    if is_html:
        if not body:
            r = self.openurl(url, _tries=2, _delay=0.2)
        # update size as we might not have it from the headers
        size = len(r.read())
        hsize = self.human_size(size)
        r.seek(0)

        encoding = EncodingFinder('windows-1252').encoding(r).lower()
        try:
            h = self.get_document(r, parser='lxml', encoding=encoding)
            for meta in h.xpath('//head/meta'):
                # <meta http-equiv="content-type" content="...">
                if meta.attrib.get('http-equiv', '').lower() == 'content-type':
                    for words in headersutil.split_header_words(
                            [meta.attrib.get('content', '')]):
                        for k, v in words:
                            if k == 'charset':
                                encoding = v
                # <meta charset="...">
                encoding = meta.attrib.get('charset', encoding).lower()
        except Exception as e:
            print(e)
        finally:
            r.seek(0)
        if encoding == 'iso-8859-1' or not encoding:
            encoding = 'windows-1252'
        try:
            codecs.lookup(encoding)
        except LookupError:
            encoding = 'windows-1252'

        try:
            h = self.get_document(r, parser='lxml', encoding=encoding)
            for title in h.xpath('//head/title'):
                title = to_unicode(title.text_content()).strip()
                title = ' '.join(title.split())
            if urlparse.urlsplit(url).netloc.endswith('twitter.com'):
                for title in h.getroot().cssselect('.permalink-tweet .tweet-text'):
                    title = to_unicode(title.text_content()).strip()
                    title = ' '.join(title.splitlines())
        except AssertionError as e:
            # invalid HTML
            print(e)
    return content_type, hsize, title