def page_changed(fp: str, new_content: str) -> Tuple[bool, str, str]:
    """Report whether new HTML content differs from the HTML file at ``fp``.

    The encoding of the existing file is guessed with ``chardet``. The stored
    text and ``new_content`` are both passed through ``UnicodeDammit``,
    encoded with that encoding, and compared by SHA-256 digest.

    Args:
        fp: Path of the existing HTML file.
        new_content: The newly obtained HTML text.

    Returns:
        bool: whether the content changed (always True when ``fp`` does not
            exist or the new content cannot be encoded with the file's
            encoding)
        str: new content (formatted)
        str: encoding code -- the encoding detected for the existing file,
            or "utf8" when the file is missing, no encoding could be
            detected, or the new content is not representable in the
            detected encoding
    """
    # Local import so this function does not rely on a module-level import
    # that may not exist in the file.
    import logging

    formatted_new_content = UnicodeDammit(new_content).unicode_markup

    try:
        # read binary of existing html and use it to get probable encoding
        with open(fp, "rb") as file:
            file_bytes = file.read()
        # chardet reports None when it cannot make a guess (e.g. an empty
        # file); fall back to utf8 instead of crashing in encode() below.
        orig_encoding = chardet.detect(file_bytes)["encoding"] or "utf8"

        # Re-read in text mode (rather than decoding file_bytes) so that
        # newline translation is applied to the stored text before hashing.
        with open(fp, encoding=orig_encoding) as file:
            file_content = UnicodeDammit(file.read()).unicode_markup
    except FileNotFoundError:
        return True, formatted_new_content, "utf8"

    old_hash = hashlib.sha256(file_content.encode(orig_encoding)).hexdigest()

    # generate hash from new html
    try:
        new_bytes = formatted_new_content.encode(orig_encoding)
    except UnicodeEncodeError:
        # The new text holds characters the old encoding cannot express, so
        # it cannot be equal to what is stored; hand back utf8 as the
        # encoding because the detected one cannot represent the new text.
        return True, formatted_new_content, "utf8"
    new_hash = hashlib.sha256(new_bytes).hexdigest()

    logging.getLogger(__name__).debug(
        "page_changed(%s): old=%s new=%s", fp, old_hash, new_hash)
    return old_hash != new_hash, formatted_new_content, orig_encoding
def getURL(self, url, uid):
    """Return the parsed page for ``url``, caching its HTML on disk.

    The HTML is cached in the file ``str(uid) + ".html"``. When that file is
    missing (or empty), the page is fetched through ``self.opener``,
    gunzipped if possible, converted to unicode with ``UnicodeDammit``,
    cleaned of a few mojibake sequences and stored as UTF-8. The cached file
    is then fed to an ``etree.HTMLParser`` whose target is ``Parser()``.

    Args:
        url: Address passed to ``self.opener.open``.
        uid: Identifier used to build the cache file name.

    Returns:
        The value returned by ``parser.close()``, or None when opening
        ``url`` fails (in which case no cache file is created).
    """
    cache_path = str(uid) + ".html"

    # A successful download always writes at least a newline, so an empty
    # file can only be the remains of an earlier failed fetch; refetch it.
    cached = os.path.isfile(cache_path) and os.path.getsize(cache_path) > 0

    if not cached:
        # Fetch BEFORE creating the cache file: otherwise a failed request
        # leaves an empty file behind that later calls mistake for a cache.
        try:
            response = self.opener.open(url)
        except Exception:
            return None
        data = response.read()

        # Best effort: treat the payload as gzip first, fall back to the raw
        # bytes when it is not compressed.
        try:
            decoded = UnicodeDammit(
                gzip.GzipFile(fileobj=io.BytesIO(data)).read(),
                ["windows-1252"], smart_quotes_to="html").unicode_markup
        except Exception:
            decoded = UnicodeDammit(
                data, ["windows-1252"], smart_quotes_to="html").unicode_markup

        # Undo common mis-decoded sequences. Order matters: the generic
        # u"\xe2€" rule must run after the more specific ones, and the final
        # rule relies on the quote that the generic rule just produced.
        decoded = decoded.replace(u"%20", u" ").replace(
            u"\u00c2", u" ").replace(u"\xe2€™", u"\'").replace(
            u"\xe2€œ", u"\"").replace(u"\xe2€", "\"").replace(u"\"“", "-")

        # io.open handles the UTF-8 encoding identically on Python 2 and 3;
        # newline="" keeps line endings exactly as they are in the page.
        with io.open(cache_path, "w", encoding="utf8", newline="") as fptr:
            fptr.write(decoded + u"\n")

    with io.open(cache_path, "r", encoding="utf8", newline="") as fptr:
        text = fptr.read()

    parser = etree.HTMLParser(target=Parser())
    parser.feed(text)
    return parser.close()