import chardet
import logging
import traceback


def to_unicode(self, file_content):
    encoding_info = chardet.detect(file_content)
    logging.info("[Sender] file encoding detection result: %s", encoding_info)
    if not encoding_info or not encoding_info.get("encoding"):
        # chardet returns {"encoding": None, ...} when detection fails, so
        # check the value rather than the key's presence.
        logging.info(
            "[Sender] failed to detect file encoding, fall back to default encoding")
        return file_content.decode(_DEFAULT_ENCODING_ASSUMPTION).encode("utf-8")
    encoding = encoding_info.get("encoding").lower()
    if encoding.startswith("utf"):
        logging.info("[Sender] file in utf already, no need to convert")
        return file_content
    for parent_encoding, child_encodings in _ENCODING_MAP.iteritems():
        if encoding in child_encodings:
            logging.info("[Sender] switch to parent encoding %s", parent_encoding)
            return file_content.decode(parent_encoding).encode("utf-8")
    try:
        return file_content.decode(encoding).encode("utf-8")
    except Exception:
        logging.error(traceback.format_exc())
        logging.info(
            "[Sender] failed to convert from %s to utf-8, fall back to default coding",
            encoding)
        return file_content.decode(_DEFAULT_ENCODING_ASSUMPTION).encode("utf-8")
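# The function above assumes two module-level constants that are not shown in
# the snippet. A minimal sketch of what they might look like: the idea behind
# _ENCODING_MAP is to decode with a superset ("parent") encoding whenever
# chardet reports one of its subsets, since e.g. GB18030 is a superset of GBK
# and GB2312, and Big5-HKSCS extends Big5. The exact values here are an
# assumption, not the original constants.
_DEFAULT_ENCODING_ASSUMPTION = "gb18030"  # hypothetical default
_ENCODING_MAP = {
    # parent encoding -> child encodings it can safely decode
    "gb18030": ("gb2312", "gbk"),
    "big5hkscs": ("big5",),
}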
import chardet


def makeUnicode(string, encoding='utf-8'):
    if isinstance(string, unicode):
        return string
    try:
        # The original called unicode.decode(string, encoding), which raises
        # TypeError on a byte string; decode the byte string directly instead.
        ret = string.decode(encoding)
        if DEBUG:
            LOG('util.makeUnicode(): Successfully used: %s' % encoding)
        return ret
    except Exception:
        detected_encoding = chardet.detect(string)
        if DEBUG:
            LOG('util.makeUnicode(): Detected encoding: %s' % detected_encoding)
        try:
            return unicode(string, detected_encoding['encoding'])
        except Exception:
            return unicode(string, encoding, 'replace')
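# A quick usage sketch for makeUnicode, assuming DEBUG and LOG are defined at
# module level as in the surrounding util module (Python 2):
gbk_bytes = u'\u4f60\u597d'.encode('gbk')  # "ni hao" encoded as GBK
text = makeUnicode(gbk_bytes, encoding='utf-8')
# The utf-8 decode fails, so chardet is consulted; whether detection succeeds
# or the 'replace' fallback kicks in, a unicode object comes back.
assert isinstance(text, unicode)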
import chardet


def unicode(self, string, encoding=None):
    # The original overwrote a caller-supplied encoding with "utf-8" in its
    # else branch, which looks unintended; fall back to the instance default,
    # then to utf-8, only when no encoding was passed in.
    if not encoding:
        encoding = self._encoding or "utf-8"
    try:
        return string.decode(encoding).encode("utf-8")
    except Exception:
        detected_encoding = chardet.detect(string)
        try:
            string = string.decode(detected_encoding["encoding"])
            self.updateEncoding(detected_encoding["encoding"],
                                detected_encoding["confidence"])
            return string.encode("utf-8")
        except Exception:
            pass
    # Last resort: decode permissively before re-encoding, since calling
    # encode() on a byte string triggers an implicit ascii decode.
    return string.decode(encoding, "replace").encode("utf-8")
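# updateEncoding() is not part of the snippet; a plausible sketch, assuming
# the object caches the last detected encoding and only replaces it when
# chardet is reasonably confident (the 0.5 threshold is an arbitrary choice):
def updateEncoding(self, encoding, confidence):
    if encoding and confidence > 0.5:
        self._encoding = encoding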
import chardet
from chardet import universaldetector


def detect_with_chardet(self):
    len_limit = 500 * 1024
    dct = {}
    if self.txt_byte:
        dct = chardet.detect(self.txt_byte[:len_limit])
    else:
        # Open in binary mode: chardet expects raw bytes, not decoded text.
        f = open(self.filename, 'rb')
        try:
            detector = universaldetector.UniversalDetector()
            numbytes = 0
            for line_byte in f:
                detector.feed(line_byte)
                numbytes += len(line_byte)
                if detector.done:
                    break
                if numbytes > len_limit:
                    break  # don't read forever
            detector.close()
            dct = detector.result
        finally:
            f.close()
    return dct.get('encoding')
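# A small self-check for detect_with_chardet, assuming a minimal holder class
# with the txt_byte/filename attributes the method expects (hypothetical,
# Python 2):
class _Doc(object):
    txt_byte = None
    filename = None
    detect_with_chardet = detect_with_chardet

doc = _Doc()
doc.txt_byte = (u'\u3053\u3093\u306b\u3061\u306f' * 100).encode('shift_jis')
print doc.detect_with_chardet()  # typically reports SHIFT_JIS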
import chardet
from django import template
from django.template.loader import get_template


def txt_to_html(self, input_txt, output):
    """Convert txt string to file with name "output"."""
    charset = chardet.detect(input_txt)
    # charset['encoding'] can be None when detection fails; fall back to utf-8.
    temp = input_txt.decode(charset['encoding'] or 'utf-8')
    # temp = temp.replace('&', '&amp;')
    # temp = temp.replace('<', '&lt;')
    # temp = temp.replace('>', '&gt;')
    # temp = temp.replace('"', '&quot;')
    # temp = temp.replace("'", '&raquo;')
    paragraphs = temp.split('\n')
    html_template = get_template("epuber-templates/html-template.xhtml")
    html_context = template.Context({'paragraphs': paragraphs})
    output_file = open(output, 'w')
    output_file.write(html_template.render(html_context).encode("utf-8"))
    output_file.close()
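# For reference, chardet.detect() returns a dict rather than a plain string,
# e.g. {'encoding': 'GB2312', 'confidence': 0.99}; when it cannot decide it
# reports {'encoding': None, 'confidence': 0.0}, which is why the guard above
# falls back to utf-8 before decoding:
result = chardet.detect(u'\u4f60\u597d\u4e16\u754c'.encode('gbk') * 50)
print result['encoding'], result['confidence']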
import re

import chardet
import util


def convertHTMLCodes(html, FB=None, encoding=None):
    if not encoding:
        if FB:
            encoding = FB.getEncoding()
        else:
            encoding = 'utf-8'
    try:
        html = html.decode(encoding).encode('utf-8')
        return html
    except Exception:
        detected_encoding = chardet.detect(html)
        if util.DEBUG:
            util.LOG(detected_encoding)
        try:
            html = html.decode(detected_encoding['encoding']).encode('utf-8')
            if FB:
                FB.updateEncoding(detected_encoding['encoding'],
                                  detected_encoding['confidence'])
        except Exception:
            # Permissive last resort; decode first so encode() does not trip
            # over an implicit ascii decode of the byte string.
            html = html.decode(encoding, 'replace').encode('utf-8')
    try:
        html = re.sub(r'&#(\d{1,5});', cUConvert, html)
        html = re.sub(r'&(\w+?);', cTConvert, html)
    except Exception:
        pass
    return html
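# cUConvert and cTConvert are not shown in the snippet; a plausible sketch of
# the two re.sub callbacks, assuming they expand numeric character references
# and named entities to UTF-8 text (Python 2's htmlentitydefs provides the
# name-to-codepoint table):
import htmlentitydefs

def cUConvert(match):
    # &#NNNN; -> the corresponding character
    return unichr(int(match.group(1))).encode('utf-8')

def cTConvert(match):
    # &name; -> its character, or the original text if the name is unknown
    codepoint = htmlentitydefs.name2codepoint.get(match.group(1))
    return unichr(codepoint).encode('utf-8') if codepoint else match.group(0)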
import re
import urlparse

import chardet


def getTitleLink(link):
    title = u''
    des = u''
    try:
        response = webfetch(link, follow_redirects=True, deadline=30,
                            validate_certificate=False)
    except Exception:
        return None, None
    if response is None:
        return None, None
    content = response.content
    uri = urlparse.urlparse(link)
    loc = uri[1]
    if short(loc):
        link = response.final_url or link
    if not link.startswith('http'):
        return None, None
    if linkitem.all().filter('url =', link).filter('valid =', 0).get() is not None:
        return None, None
    # filter url like "aaa.com", no site url, focus article
    uri = urlparse.urlparse(link)
    if uri[1] in filtersite:
        return None, None
    if uri[4] == '':
        if uri[2] in ('', '/', '#'):
            return None, None
    title = re.search(r'<title>([\S\s]*?)</title>', content, re.IGNORECASE)
    if title is None:
        return None, None
    title = title.group(1)
    if title == '':
        title = link
    if is_snake(title):  # or is_snake(content):
        url = linkitem()
        url.valid = 0
        url.url = link
        url.put()
        return None, None
    else:
        url = linkitem()
        url.valid = 1
        url.url = link
        url.put()
    charset = 'utf-8'
    if 'charset' in response.headers:
        charset1 = response.headers['charset']
    else:
        charset1 = 'utf-8'
    if len(content) >= (1 * 1024 * 1024):
        return title, link
    try:
        content = content.decode(charset1, 'ignore')
        charset = charset1
    except Exception:
        try:
            m_charset = re.search(
                r'<meta\s*http-equiv="?Content-Type"? content="text/html;\s*charset=([\w\d-]+?)"',
                content, re.IGNORECASE)
            charset2 = m_charset.group(1)
            # The original never decoded with the meta charset, leaving
            # content as bytes on this path; decode it here.
            content = content.decode(charset2, 'ignore')
            charset = charset2
        except Exception:
            try:
                charset3 = chardet.detect(content)['encoding']
                content = content.decode(charset3, 'ignore')
                charset = charset3
            except Exception:
                try:
                    content = content.decode("utf-8", 'ignore')
                    charset = 'utf-8'
                except Exception:
                    return None, None
    title = title.decode(charset)
    art = article()
    art.title = title
    art.txt = content
    art.url = link
    art.put()
    return title, link
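# A quick check of the meta-charset regex used above, against a sample header
# fragment (hypothetical input, Python 2):
sample = '<meta http-equiv="Content-Type" content="text/html; charset=gb2312">'
m = re.search(
    r'<meta\s*http-equiv="?Content-Type"? content="text/html;\s*charset=([\w\d-]+?)"',
    sample, re.IGNORECASE)
print m.group(1) if m else 'no match'  # gb2312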
def getTitleLink(link): title = u"" des = u"" try: response = webfetch(link, follow_redirects=True, deadline=30, validate_certificate=False) except: return None, None if response == None: return None, None content = response.content uri = urlparse.urlparse(link) loc = uri[1] if short(loc): link = response.final_url or link if not link.startswith("http"): return None, None if linkitem.all().filter("url =", link).filter("valid =", 0).get() != None: return None, None # filter url like "aaa.com", no site url, focus article uri = urlparse.urlparse(link) if uri[1] in filtersite: return None, None if uri[4] == "": if uri[2] == "": return None, None if uri[2] == "/": return None, None if uri[2] == "#": return None, None title = re.search("<title>([\S\s]*?)</title>", content, re.IGNORECASE) if title == None: return None, None title = title.group(1) if title == "": return None, None if title.startswith("Page Not Found"): return None, None if title.startswith("Oops! Something Bad Happened"): return None, None if is_snake(title): # or is_snake(content): url = linkitem() url.valid = 0 url.url = link url.put() return None, None else: url = linkitem() url.valid = 1 url.url = link url.put() charset = "utf-8" if "charset" in response.headers: charset1 = response.headers["charset"] else: charset1 = "utf-8" if len(content) >= (1 * 1024 * 1024): return title, link try: content = content.decode(charset1, "ignore") charset = charset1 except: try: m_charset = re.search( '<meta\s*http-equiv="?Content-Type"? content="text/html;\s*charset=([\w\d-]+?)"', content, re.IGNORECASE ) charset2 = m_charset.group(1) charset = charset2 except: try: charset3 = chardet.detect(content)["encoding"] content = content.decode(charset3, "ignore") charset = charset3 except: try: content = content.decode("utf-8", "ignore") charset = "utf-8" except: return None, None title = title.decode(charset) art = article() art.title = title art.txt = content art.url = link art.put() return title, link