示例#1
0
 def test_detwingle_ignores_multibyte_characters(self):
     """Check that detwingle() leaves valid multibyte UTF-8 untouched."""
     # Each of these characters has a UTF-8 representation ending in
     # \x93, which is a smart quote when read as Windows-1252.
     # detwingle() is expected to skip over whole multibyte UTF-8
     # characters, so the bytes must come back unchanged.
     for tricky_unicode_char in (
         "\N{LATIN SMALL LIGATURE OE}",  # 2-byte char '\xc5\x93'
         "\N{LATIN SUBSCRIPT SMALL LETTER X}",  # 3-byte char '\xe2\x82\x93'
         # 4-byte char '\xf0\x90\x90\x93' (code point U+10413). Writing
         # it as the text literal "\xf0\x90\x90\x93" would instead give
         # four separate Latin-1 code points, each of which is only two
         # bytes long in UTF-8, so the 4-byte case would go untested.
         "\U00010413",
     ):
         encoded = tricky_unicode_char.encode("utf8")
         self.assertTrue(encoded.endswith(b'\x93'))
         output = UnicodeDammit.detwingle(encoded)
         self.assertEqual(output, encoded)
示例#2
0
文件: test_soup.py 项目: vikatory/pyx
    def test_detwingle(self):
        """detwingle() repairs UTF-8 bytes that have Windows-1252 bytes embedded."""
        # Three snowmen, encoded as UTF-8.
        snowmen = ("\N{SNOWMAN}" * 3).encode("utf8")

        # A sentence wrapped in curly quotes, encoded as Windows-1252.
        quoted_text = "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!\N{RIGHT DOUBLE QUOTATION MARK}"
        quoted_bytes = quoted_text.encode("windows_1252")

        # Glue the two encodings together into one byte string.
        mixed = b"".join([snowmen, quoted_bytes, snowmen])

        # As it stands, the mixture cannot be decoded as UTF-8.
        self.assertRaises(UnicodeDecodeError, mixed.decode, "utf8")

        # Once detwingle() has processed it, the whole document decodes
        # as UTF-8 and the curly quotes are preserved.
        repaired = UnicodeDammit.detwingle(mixed)
        self.assertEqual("☃☃☃“Hi, I like Windows!”☃☃☃", repaired.decode("utf8"))
示例#3
0
    def exportUrlFeeder(self, filename, urlList):
        '''
        Description: Write the given urls, sorted, to a flat file, one url per line.
        Status: In progress - Should be moved to a separate package.
        Usage: Is used within the harvest functions as a url exporter.

        filename -- path of the output file; it is created or truncated.
        urlList  -- iterable of urls, either text or bytes.

        Returns nothing. A url that cannot be turned into valid UTF-8 (or
        cannot be written) is reported through exportFeedLogger and skipped;
        the remaining urls are still exported.
        '''
        # Sort before opening so a failure here does not leave an empty file.
        sortedUrls = sorted(urlList)

        # 'wb' replaces the invalid mode 'wa', which Python 3 rejects with a
        # ValueError. The urls are written as UTF-8 bytes. The with-block
        # closes (and thereby flushes) the file even on unexpected errors.
        with open(filename, 'wb') as fobj:
            for url in sortedUrls:
                try:
                    # detwingle() works on bytes, so encode text urls first.
                    if not isinstance(url, bytes):
                        url = url.encode("utf8")
                    encodedUrl = UnicodeDammit.detwingle(url)
                    # Validation only: raises if the result is not UTF-8.
                    encodedUrl.decode("utf8")
                    fobj.write(encodedUrl + b'\n')
                except Exception:
                    # Best effort: log the failure and continue with the next url.
                    exportFeedLogger.logError("Unexpected error while open output file in exportUrlFeeder")
示例#4
0
 def test_unicode_input(self):
     """Markup that is already text must come back unchanged."""
     original = u"I'm already Unicode! \N{SNOWMAN}"
     self.assertEqual(UnicodeDammit(original).unicode_markup, original)
 def test_byte_order_mark_removed(self):
     """The byte order mark of a UTF-16LE document is not part of the decoded text."""
     # The payload opens with the marker \xff\xfe, followed by the markup
     # in two-byte little-endian units.
     utf16le_with_bom = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
     result = UnicodeDammit(utf16le_with_bom)
     self.assertEqual("<a>áé</a>", result.unicode_markup)
     self.assertEqual("utf-16le", result.original_encoding)
 def test_ignore_invalid_codecs(self):
     """Malformed codec names are skipped and UTF-8 is still detected."""
     payload = "Räksmörgås".encode("utf-8")
     bogus_names = ('.utf8', '...', 'utF---16.!')
     for name in bogus_names:
         detected = UnicodeDammit(payload, [name]).original_encoding
         self.assertEqual(detected.lower(), 'utf-8')
 def test_ignore_inappropriate_codecs(self):
     """A suggested codec that does not fit the data is passed over in favour of UTF-8."""
     payload = "Räksmörgås".encode("utf-8")
     suggested = ["iso-8859-8"]
     detected = UnicodeDammit(payload, suggested).original_encoding
     self.assertEqual(detected.lower(), 'utf-8')
 def test_dont_see_smart_quotes_where_there_are_none(self):
     """UTF-8 input is reported as UTF-8 and round-trips byte for byte."""
     raw = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
     converted = UnicodeDammit(raw)
     self.assertEqual(converted.original_encoding.lower(), 'utf-8')
     # Re-encoding the decoded text must reproduce the input exactly.
     self.assertEqual(converted.unicode_markup.encode("utf-8"), raw)
 def test_smart_quotes_to_ascii(self):
     """With smart_quotes_to set to ascii, bytes 0x91-0x94 become plain quote characters."""
     smart = b"<foo>\x91\x92\x93\x94</foo>"
     converted = UnicodeDammit(smart, smart_quotes_to="ascii")
     expected = """<foo>''""</foo>"""
     self.assertEqual(converted.unicode_markup, expected)
示例#10
0
 def test_smart_quotes_to_html_entities(self):
     """With smart_quotes_to set to html, bytes 0x91-0x94 become named HTML entities."""
     smart = b"<foo>\x91\x92\x93\x94</foo>"
     converted = UnicodeDammit(smart, smart_quotes_to="html")
     expected = "<foo>&lsquo;&rsquo;&ldquo;&rdquo;</foo>"
     self.assertEqual(converted.unicode_markup, expected)
示例#11
0
def create_soup(source) -> BeautifulSoup:
    """Decode *source* with UnicodeDammit and parse the text using the "lxml" parser."""
    decoded_markup = UnicodeDammit(source).unicode_markup
    return BeautifulSoup(decoded_markup, "lxml")
示例#12
0
 def test_detect_utf8(self):
     """A lone two-byte UTF-8 sequence is decoded and reported as UTF-8."""
     e_acute_bytes = b"\xc3\xa9"
     result = UnicodeDammit(e_acute_bytes)
     self.assertEqual(result.unicode_markup, '\xe9')
     self.assertEqual(result.original_encoding.lower(), 'utf-8')
示例#13
0
 def test_convert_hebrew(self):
     """Bytes are decoded with the suggested iso-8859-8 codec."""
     raw = b"\xed\xe5\xec\xf9"
     result = UnicodeDammit(raw, ["iso-8859-8"])
     detected = result.original_encoding.lower()
     assert detected == 'iso-8859-8'
     assert result.unicode_markup == '\u05dd\u05d5\u05dc\u05e9'
示例#14
0
 def test_detect_utf8(self):
     """Mixed ASCII and multibyte UTF-8 input is detected and decoded as UTF-8."""
     raw = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
     result = UnicodeDammit(raw)
     detected = result.original_encoding.lower()
     assert detected == 'utf-8'
     assert result.unicode_markup == 'Sacr\xe9 bleu! \N{SNOWMAN}'
示例#15
0
 def test_unicode_input(self):
     """Markup that is already text must come back unchanged."""
     original = "I'm already Unicode! \N{SNOWMAN}"
     decoded = UnicodeDammit(original).unicode_markup
     assert decoded == original
示例#16
0
 def test_smart_quotes_to_unicode(self):
     """By default, bytes 0x91-0x94 become the Unicode curly quote characters."""
     smart = b"<foo>\x91\x92\x93\x94</foo>"
     converted = UnicodeDammit(smart)
     expected = u"<foo>\u2018\u2019\u201c\u201d</foo>"
     self.assertEqual(converted.unicode_markup, expected)
示例#17
0
def get_tree(page_url):
    """Fetch a page and return the result of html.fromstring() on its decoded text.

    The lower-cased *page_url* is appended to the module-level ``base_url``
    and requested with an empty User-Agent header. The response body is
    decoded by UnicodeDammit, with windows-1257 offered as a candidate
    encoding.
    """
    response = requests.get(
        base_url + page_url.lower(),
        headers={'User-Agent': ''},
    )
    markup = UnicodeDammit(response.content, ['windows-1257']).unicode_markup
    return html.fromstring(markup)
示例#18
0
 def test_smart_quotes_to_xml_entities(self):
     """With smart_quotes_to set to xml, bytes 0x91-0x94 become numeric character references."""
     smart = b"<foo>\x91\x92\x93\x94</foo>"
     converted = UnicodeDammit(smart, smart_quotes_to="xml")
     expected = "<foo>&#x2018;&#x2019;&#x201C;&#x201D;</foo>"
     self.assertEqual(converted.unicode_markup, expected)
 def test_convert_hebrew(self):
     """Bytes are decoded with the suggested iso-8859-8 codec."""
     raw = b"\xed\xe5\xec\xf9"
     result = UnicodeDammit(raw, ["iso-8859-8"])
     detected = result.original_encoding.lower()
     self.assertEqual(detected, 'iso-8859-8')
     self.assertEqual(result.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
 def test_detect_utf8(self):
     """Mixed ASCII and multibyte UTF-8 input is detected and decoded as UTF-8."""
     raw = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
     result = UnicodeDammit(raw)
     detected = result.original_encoding.lower()
     self.assertEqual(detected, 'utf-8')
     self.assertEqual(result.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
示例#21
0
def prop_to_trait(prop_name, prop, add_tag=True, add_help=True):
    """Translate one schema property into traitlets source code.

    Args:
        prop_name: Name of the property. Only used in the diagnostic dump
            printed when the schema type is not recognised.
        prop: Mapping describing the property. The keys read here are
            ``type``, ``anyOf``, ``const``, ``format`` and ``description``.
        add_tag: When true, append ``.tag(sync=True)`` and add the
            ``allow_none=True, default_value=None`` keyword arguments.
        add_help: When true and the schema has a non-empty description,
            emit it as a ``help=`` keyword argument.

    Returns:
        A string of the form ``T.<Trait>(<arguments>)`` optionally followed
        by ``.tag(...)``. Unrecognised types fall back to ``T.Any``.
    """
    # pylint: disable=too-many-locals,too-many-branches
    trait = "Any"
    args = []
    kwargs = []
    tags = []

    ptype = prop.get("type")
    any_of = prop.get("anyOf")

    if any_of:
        # Each alternative becomes a bare trait (no tag, no help) inside
        # the list handed to Union.
        trait = "Union"
        members = ", ".join(
            prop_to_trait("", member, add_tag=False, add_help=False)
            for member in any_of
        )
        args.append(f"[{members}]")
    elif ptype == "object":
        trait = "Dict"
    elif ptype == "boolean":
        trait = "Bool"
    elif ptype == "string":
        const = prop.get("const")
        if const is not None:
            trait = "Enum"
            # Enum takes a *sequence* of allowed values. Emitting a bare
            # string would make every substring of it an allowed value,
            # so wrap the constant in a list, as the array branch below
            # already does for None. repr() also escapes embedded quotes.
            args.append(f"[{str(const)!r}]")
        else:
            trait = "Unicode"
    elif ptype == "number":
        fmt = prop.get("format")
        if fmt == "int":
            trait = "Int"
        elif fmt == "float":
            trait = "Float"
        else:
            trait = "Union"
            args.append("[T.Float(), T.Int()]")
    elif ptype == "array":
        trait = "Union"
        args.append("[T.Tuple(), T.Enum([None])]")
    else:
        # Unknown type: keep the "Any" fallback and dump the schema so the
        # gap is visible to whoever runs the generator.
        print(
            prop_name,
            "\n-----------------------------\n",
            safe_dump(prop, default_flow_style=False),
            "\n-----------------------------\n",
        )

    if add_help and prop.get("description"):
        dammit = UnicodeDammit(prop["description"])
        # Escape backslashes and single quotes so that no description can
        # terminate the triple-quoted literal early or introduce escape
        # sequences into the generated source.
        help_text = dammit.unicode_markup.replace("\\", "\\\\").replace("'", "\\'")
        kwargs.append(f"help='''{help_text}'''")

    if add_tag:
        # might need more tags
        tags.append("sync=True")
        kwargs += ["allow_none=True", "default_value=None"]

    # Positional arguments first, then keyword arguments; empty groups are
    # dropped so no stray ", " appears in the output.
    arg_groups = [", ".join(args), ", ".join(kwargs)]
    all_arg_str = ", ".join(group for group in arg_groups if group)
    tag_str = f""".tag({", ".join(tags)})""" if tags else ""

    return f"""T.{trait}({all_arg_str}){tag_str}"""