示例#1
0
def page_changed(fp: str, new_content: str) -> Tuple[bool, str, str]:
    """
    Report whether new html content differs from the html file stored at fp.

    The stored file's encoding is guessed with chardet, the file is read with
    that encoding, and SHA-256 digests of the stored markup and the new markup
    (both passed through UnicodeDammit and encoded with the guessed encoding)
    are compared.

    Args:
        fp: path of the existing html file.
        new_content: new html to compare against the stored file.

    Returns:
        bool: whether change
        str: new content (formatted)
        str: encoding code ("utf8" when the file is missing, when the
            encoding cannot be guessed, or when the new content cannot be
            represented in the stored file's encoding)
    """
    fallback_encoding = "utf8"
    formatted_new_content = UnicodeDammit(new_content).unicode_markup

    try:
        # read binary of existing html and use it to get probable encoding
        with open(fp, "rb") as file:
            file_bytes = file.read()
        # chardet reports None when it cannot guess (e.g. an empty file);
        # str.encode(None) would raise TypeError further down.
        orig_encoding = (chardet.detect(file_bytes)["encoding"]
                         or fallback_encoding)

        # read existing html using the above encoding
        with open(fp, encoding=orig_encoding) as file:
            file_content = UnicodeDammit(file.read()).unicode_markup
    except FileNotFoundError:
        # nothing stored yet, so any content counts as a change
        return True, formatted_new_content, fallback_encoding

    old_hash = hashlib.sha256(
        file_content.encode(orig_encoding)).hexdigest()

    try:
        new_bytes = formatted_new_content.encode(orig_encoding)
    except UnicodeEncodeError:
        # The stored markup was encodable with orig_encoding and the new
        # markup is not, so the two cannot be equal.
        return True, formatted_new_content, fallback_encoding
    new_hash = hashlib.sha256(new_bytes).hexdigest()

    return old_hash != new_hash, formatted_new_content, orig_encoding
示例#2
0
 def getURL(self, url, uid):
     """Fetch url, cache its markup as "<uid>.html", and parse the cached file.

     When "<uid>.html" does not exist yet, the page is downloaded through
     self.opener, gunzipped if possible, converted to unicode with
     UnicodeDammit (trying windows-1252 first), cleaned of a few known
     mis-decoded character sequences, and written to the cache file as
     UTF-8. The cache file is then fed to an lxml HTMLParser whose target
     is a Parser instance.

     Args:
         url: address passed to self.opener.open().
         uid: identifier used to build the cache file name.

     Returns:
         The value returned by the parser's close(), or None when the url
         could not be opened.
     """
     cache_path = str(uid) + ".html"
     if not os.path.isfile(cache_path):
         # Download and decode before creating the cache file: opening the
         # file first left an empty "<uid>.html" behind on a failed fetch,
         # which later calls mistook for a valid cached page.
         try:
             response = self.opener.open(url)
         except Exception:
             return None
         data = response.read()
         try:
             # the body may be gzip-compressed; fall back to the raw bytes
             payload = gzip.GzipFile(fileobj=io.BytesIO(data)).read()
         except Exception:
             payload = data
         decoded = UnicodeDammit(
             payload, ["windows-1252"],
             smart_quotes_to="html").unicode_markup
         # Applied in order; later patterns are prefixes/remainders of
         # earlier ones, so the sequence must not be rearranged.
         replacements = (
             (u"%20", u" "),
             (u"\u00c2", u" "),
             (u"\xe2€™", u"\'"),
             (u"\xe2€œ", u"\""),
             (u"\xe2€", "\""),
             (u"\"“", "-"),
         )
         for broken, fixed in replacements:
             decoded = decoded.replace(broken, fixed)
         # Binary mode with explicit encoding works on Python 2 and 3; the
         # trailing newline matches what the former print statement wrote.
         with open(cache_path, "wb") as fptr:
             fptr.write(decoded.encode('utf8') + b"\n")
     parser = etree.HTMLParser(target=Parser())
     with open(cache_path, "rb") as fptr:
         parser.feed(fptr.read().decode('utf8'))
     return parser.close()