Python UnicodeDammit.encode示例

编程语言: Python

命名空间/包名称: bs4

类/类型: UnicodeDammit

方法/功能: encode

hotexamples.com的示例: 2

Python UnicodeDammit.encode - 共找到2个示例。以下是从开源项目中提取的、评价最高的 bs4.UnicodeDammit.encode 真实 Python 代码示例。您可以为这些示例评分，以帮助我们提高示例质量。

常用方法

显示隐藏

UnicodeDammit(30)

detwingle(21)

strip(10)

split(8)

replace(4)

lower(3)

splitlines(3)

encode(2)

startswith(2)

append(1)

decode(1)

endswith(1)

lstrip(1)

rstrip(1)

translate(1)

xpath(1)

示例#1

显示文件

def page_changed(fp: str, new_content: str) -> Tuple[bool, str, str]:
    """Report whether *new_content* differs from the HTML stored at *fp*.

    The existing file is read as bytes so ``chardet`` can guess its
    encoding, then re-read as text with that encoding. Both the stored text
    and *new_content* are passed through ``UnicodeDammit(...).unicode_markup``
    before being compared.

    Args:
        fp: Path of the existing HTML file.
        new_content: Freshly obtained HTML text.

    Returns:
        A ``(changed, formatted_new_content, encoding)`` tuple:
        ``changed`` is True when the stored text differs from the new text
        (or the file does not exist), ``formatted_new_content`` is the
        ``unicode_markup`` of *new_content*, and ``encoding`` is the
        detected encoding of the existing file ("utf8" when the file is
        missing or no encoding could be detected).
    """
    formatted_new_content = UnicodeDammit(new_content).unicode_markup

    try:
        # Read the raw bytes first; they are only used to guess the encoding.
        with open(fp, "rb") as file:
            file_bytes = file.read()

        # chardet reports None when it cannot guess (e.g. an empty file).
        # Fall back to the same default used for a missing file instead of
        # passing None on to the codec machinery.
        orig_encoding = chardet.detect(file_bytes)["encoding"] or "utf8"

        # Re-read in text mode so the same newline handling as before applies.
        with open(fp, encoding=orig_encoding) as file:
            file_content = UnicodeDammit(file.read()).unicode_markup
    except FileNotFoundError:
        return True, formatted_new_content, "utf8"

    # Compare the decoded text directly. Hashing the text after encoding it
    # with the detected codec gives the same answer when encoding succeeds,
    # but raises UnicodeEncodeError when the new content contains characters
    # that the old file's codec cannot represent.
    changed = file_content != formatted_new_content
    return changed, formatted_new_content, orig_encoding

示例#2

显示文件

 def getURL(self, url, uid):
     """Fetch *url* once, cache it as ``<uid>.html`` and parse the cached copy.

     If ``<uid>.html`` does not exist in the current directory, the page is
     downloaded through ``self.opener``, gunzipped when possible, converted
     to text with ``UnicodeDammit`` (trying windows-1252), cleaned with a
     fixed list of string replacements and written to the cache file as
     UTF-8. The cache file is then fed to an ``etree.HTMLParser`` whose
     target is a new ``Parser()``.

     Args:
         url: Address to download when no cached copy exists.
         uid: Identifier used to build the cache file name.

     Returns:
         ``None`` when the page cannot be opened; otherwise whatever
         ``parser.close()`` returns for the cached document.
     """
     cache_path = str(uid) + ".html"

     if not os.path.isfile(cache_path):
         # Download before creating the cache file. Opening the file first
         # left an empty <uid>.html behind whenever the request failed, and
         # every later call then parsed that empty file instead of retrying.
         try:
             response = self.opener.open(url)
         except Exception:
             # Best effort: any failure to open the URL means "no result".
             return None
         data = response.read()

         try:
             payload = gzip.GzipFile(fileobj=io.BytesIO(data)).read()
         except Exception:
             # Not gzip-compressed (or unreadable as gzip): use raw bytes.
             payload = data

         decoded = UnicodeDammit(
             payload, ["windows-1252"],
             smart_quotes_to="html").unicode_markup

         # Applied strictly in this order; later patterns are prefixes or
         # leftovers of earlier ones. NOTE(review): these look like
         # mis-decoded punctuation artifacts - confirm against real pages.
         replacements = (
             (u"%20", u" "),
             (u"\u00c2", u" "),
             (u"\xe2&euro;&trade;", u"\'"),
             (u"\xe2&euro;&oelig;", u"\""),
             (u"\xe2&euro;", u"\""),
             (u"\"&ldquo;", u"-"),
         )
         for broken, fixed in replacements:
             decoded = decoded.replace(broken, fixed)

         # io.open with an explicit encoding writes UTF-8 on both Python 2
         # and Python 3; the trailing newline matches what the former
         # ``print >> fptr`` statement appended.
         with io.open(cache_path, "w", encoding="utf8", newline="") as fptr:
             fptr.write(decoded + u"\n")

     parser = etree.HTMLParser(target=Parser())
     with io.open(cache_path, "r", encoding="utf8", newline="") as fptr:
         parser.feed(fptr.read())
     return parser.close()