def test_detwingle_ignores_multibyte_characters(self):
    # Each of these characters has a UTF-8 representation ending
    # in \x93. \x93 is a smart quote if interpreted as
    # Windows-1252. But our code knows to skip over multibyte
    # UTF-8 characters, so they'll survive the process unscathed.
    for tricky_unicode_char in (
        "\N{LATIN SMALL LIGATURE OE}",  # 2-byte char '\xc5\x93'
        "\N{LATIN SUBSCRIPT SMALL LETTER X}",  # 3-byte char '\xe2\x82\x93'
        "\xf0\x90\x90\x93",  # This is a CJK character, not sure which one.
    ):
        input = tricky_unicode_char.encode("utf8")
        self.assertTrue(input.endswith(b'\x93'))
        output = UnicodeDammit.detwingle(input)
        self.assertEqual(output, input)
def test_detwingle(self):
    # Here's a UTF8 document.
    utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")

    # Here's a Windows-1252 document.
    windows_1252 = (
        "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
        "\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")

    # Through some unholy alchemy, they've been stuck together.
    doc = utf8 + windows_1252 + utf8

    # The document can't be turned into UTF-8:
    self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")

    # Unicode, Dammit thinks the whole document is Windows-1252,
    # and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"

    # But if we run it through fix_embedded_windows_1252, it's fixed:
    fixed = UnicodeDammit.detwingle(doc)
    self.assertEqual("☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
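# Illustrative sketch (not part of the test suite): the same detwingle() call
# can be used as a pre-processing step on real data. The byte string below is
# a made-up example of valid UTF-8 with stray Windows-1252 bytes embedded.
from bs4 import UnicodeDammit

def detwingle_example():
    mixed = "\N{SNOWMAN}".encode("utf8") + b"\x93Windows quote\x94"
    # detwingle() rewrites the Windows-1252 smart quotes as UTF-8,
    # so the whole byte string can now be decoded as UTF-8.
    cleaned = UnicodeDammit.detwingle(mixed)
    return cleaned.decode("utf8")  # '☃“Windows quote”'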
def exportUrlFeeder(self, filename, urlList):
    '''
    Description: Exports the URLs in urlList into a flat file, one per line.
    Takes a filename and a list of URLs (as bytes); returns nothing.
    Status: In progress - should be moved to a separate package.
    Usage: Used within the harvest functions as a URL exporter.
    '''
    urlList = sorted(urlList)  # Sort the URLs so the output file is easier to read.
    fobj = open(filename, 'wb')  # Binary mode: detwingle() takes and returns bytes.
    for url in urlList:
        try:
            encodedUrl = UnicodeDammit.detwingle(url)
            encodedUrl.decode("utf8")  # Sanity check: the result must be valid UTF-8.
            fobj.write(encodedUrl)  # Export the URL to the file.
            fobj.write(b'\n')
        except Exception:
            exportFeedLogger.logError(
                "Unexpected error while writing URL in exportUrlFeeder")
    fobj.flush()  # Flush the IO buffer.
    fobj.close()  # Close the file.
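# Illustrative usage sketch: the harvester object and the URL values are
# hypothetical; the point is that exportUrlFeeder() expects raw bytes, since
# UnicodeDammit.detwingle() operates on byte strings.
def export_example(harvester):
    urls = [
        b"http://example.com/caf\xc3\xa9",      # already UTF-8
        b"http://example.com/\x93quoted\x94",   # stray Windows-1252 bytes
    ]
    harvester.exportUrlFeeder("urls.txt", urls)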
def test_unicode_input(self):
    markup = u"I'm already Unicode! \N{SNOWMAN}"
    dammit = UnicodeDammit(markup)
    self.assertEqual(dammit.unicode_markup, markup)
def test_byte_order_mark_removed(self):
    # A document written in UTF-16LE will have its byte order mark stripped.
    data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
    dammit = UnicodeDammit(data)
    self.assertEqual("<a>áé</a>", dammit.unicode_markup)
    self.assertEqual("utf-16le", dammit.original_encoding)
def test_ignore_invalid_codecs(self):
    utf8_data = "Räksmörgås".encode("utf-8")
    for bad_encoding in ['.utf8', '...', 'utF---16.!']:
        dammit = UnicodeDammit(utf8_data, [bad_encoding])
        self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_ignore_inappropriate_codecs(self):
    utf8_data = "Räksmörgås".encode("utf-8")
    dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
    self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_dont_see_smart_quotes_where_there_are_none(self):
    utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
    dammit = UnicodeDammit(utf_8)
    self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
    self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
def test_smart_quotes_to_ascii(self):
    markup = b"<foo>\x91\x92\x93\x94</foo>"
    dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
    self.assertEqual(dammit.unicode_markup, """<foo>''""</foo>""")
def test_smart_quotes_to_html_entities(self):
    markup = b"<foo>\x91\x92\x93\x94</foo>"
    dammit = UnicodeDammit(markup, smart_quotes_to="html")
    self.assertEqual(
        dammit.unicode_markup, "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>")
def create_soup(source) -> BeautifulSoup:
    # Decode the raw source with UnicodeDammit before handing it to lxml.
    return BeautifulSoup(UnicodeDammit(source).unicode_markup, "lxml")
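# Illustrative usage sketch: the markup here is hypothetical, but it shows
# that create_soup() accepts raw bytes and lets UnicodeDammit work out the
# encoding rather than requiring a pre-decoded str.
def create_soup_example():
    raw = "<p>Räksmörgås</p>".encode("utf-8")
    soup = create_soup(raw)
    return soup.p.get_text()  # 'Räksmörgås'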
def test_detect_utf8(self):
    utf8 = b"\xc3\xa9"
    dammit = UnicodeDammit(utf8)
    self.assertEqual(dammit.unicode_markup, '\xe9')
    self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_convert_hebrew(self):
    hebrew = b"\xed\xe5\xec\xf9"
    dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
    assert dammit.original_encoding.lower() == 'iso-8859-8'
    assert dammit.unicode_markup == '\u05dd\u05d5\u05dc\u05e9'
def test_detect_utf8(self):
    utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
    dammit = UnicodeDammit(utf8)
    assert dammit.original_encoding.lower() == 'utf-8'
    assert dammit.unicode_markup == 'Sacr\xe9 bleu! \N{SNOWMAN}'
def test_unicode_input(self):
    markup = "I'm already Unicode! \N{SNOWMAN}"
    dammit = UnicodeDammit(markup)
    assert dammit.unicode_markup == markup
def test_smart_quotes_to_unicode(self):
    markup = b"<foo>\x91\x92\x93\x94</foo>"
    dammit = UnicodeDammit(markup)
    self.assertEqual(
        dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
def get_tree(page_url):
    resp = requests.get(base_url + page_url.lower(), headers={'User-Agent': ''})
    # Decode the response body with UnicodeDammit, trying windows-1257 first.
    dammit = UnicodeDammit(resp.content, ['windows-1257'])
    # Parse the decoded markup with lxml.
    tree = html.fromstring(dammit.unicode_markup)
    return tree
def test_smart_quotes_to_xml_entities(self):
    markup = b"<foo>\x91\x92\x93\x94</foo>"
    dammit = UnicodeDammit(markup, smart_quotes_to="xml")
    self.assertEqual(
        dammit.unicode_markup, "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>")
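# Not part of the test suite: a side-by-side sketch of the smart-quote modes
# exercised above. The same Windows-1252 bytes come out as Unicode characters
# by default, or as ASCII quotes, HTML entities, or XML character references
# when smart_quotes_to is set.
from bs4 import UnicodeDammit

def smart_quote_modes_example():
    markup = b"<foo>\x91\x92\x93\x94</foo>"
    results = {}
    for mode in (None, "ascii", "html", "xml"):
        dammit = UnicodeDammit(markup, smart_quotes_to=mode)
        results[mode] = dammit.unicode_markup
    return results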
def test_convert_hebrew(self):
    hebrew = b"\xed\xe5\xec\xf9"
    dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
    self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
    self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
def test_detect_utf8(self):
    utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
    dammit = UnicodeDammit(utf8)
    self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
    self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
def prop_to_trait(prop_name, prop, add_tag=True, add_help=True):
    """translate a schema to a traitlets trait"""
    # pylint: disable=too-many-locals,too-many-branches
    trait = "Any"
    args = []
    kwargs = []
    tags = []
    ptype = prop.get("type")
    any_of = prop.get("anyOf")

    if any_of:
        trait = "Union"
        args += [
            "[{}]".format(", ".join(
                prop_to_trait("", any_of_prop, add_tag=False, add_help=False)
                for any_of_prop in any_of))
        ]
    elif ptype == "object":
        trait = "Dict"
    elif ptype == "boolean":
        trait = "Bool"
    elif ptype == "string":
        const = prop.get("const")
        if const is not None:
            trait = "Enum"
            args += [f"'{const}'"]
        else:
            trait = "Unicode"
    elif ptype == "number":
        fmt = prop.get("format")
        if fmt == "int":
            trait = "Int"
        elif fmt == "float":
            trait = "Float"
        else:
            trait = "Union"
            args += ["[T.Float(), T.Int()]"]
    elif ptype == "array":
        trait = "Union"
        args += ["[T.Tuple(), T.Enum([None])]"]
    else:
        print(
            prop_name,
            "\n-----------------------------\n",
            safe_dump(prop, default_flow_style=False),
            "\n-----------------------------\n",
        )

    if add_help and prop.get("description"):
        dammit = UnicodeDammit(prop["description"])
        kwargs += [f"""help='''{dammit.unicode_markup}'''"""]

    if add_tag:
        # might need more tags
        tags += ["sync=True"]
        kwargs += ["allow_none=True", "default_value=None"]

    arg_str = ", ".join(args)
    kwarg_str = ", ".join(kwargs)
    all_args = [a for a in [arg_str, kwarg_str] if a]
    all_arg_str = ", ".join(all_args)
    tag_str = f""".tag({", ".join(tags)})""" if tags else ""

    return f"""T.{trait}({all_arg_str}){tag_str}"""
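# Illustrative usage sketch: the schema fragment below is hypothetical, but it
# shows the shape of the generated traitlets code. A string property with a
# description becomes a T.Unicode(...) declaration carrying the description as
# its help text (run through UnicodeDammit first).
def prop_to_trait_example():
    prop = {"type": "string", "description": "Label shown next to the widget"}
    return prop_to_trait("label", prop)
    # -> "T.Unicode(help='''Label shown next to the widget''', "
    #    "allow_none=True, default_value=None).tag(sync=True)"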