Пример #1
0
    def to_unicode(self, file_content):
        """Re-encode raw file bytes as UTF-8, guessing the source encoding.

        Despite the name, the result is a UTF-8 encoded byte string, not a
        unicode object.

        Args:
            file_content: raw bytes read from a file.

        Returns:
            ``file_content`` unchanged when chardet reports an encoding whose
            name starts with ``utf``; otherwise the bytes decoded with the
            mapped parent encoding, the detected encoding, or the default
            encoding (in that order of preference) and re-encoded as UTF-8.

        Raises:
            UnicodeDecodeError: if the bytes cannot be decoded with the
                mapped parent encoding or with the default encoding.
        """
        encoding_info = chardet.detect(file_content)
        logging.info("[Sender] file encoding detection result: %s",
                     encoding_info)

        # chardet reports {"encoding": None, ...} when it cannot decide, so
        # the value itself must be checked, not just the presence of the key.
        encoding = (encoding_info or {}).get("encoding")
        if not encoding:
            logging.info(
                "[Sender] failed to detect file encoding, fall back to default encoding"
            )
            return file_content.decode(_DEFAULT_ENCODING_ASSUMPTION).encode(
                "utf-8")

        encoding = encoding.lower()

        # NOTE(review): this also short-circuits for utf-16/utf-32, whose
        # bytes are returned as-is rather than converted to UTF-8 -- confirm
        # that callers expect that.
        if encoding.startswith("utf"):
            logging.info("[Sender] file in utf already, no need to convert")
            return file_content

        # Prefer the parent encoding when the detected one is listed as one
        # of its children in _ENCODING_MAP.
        for parent_encoding, child_encodings in _ENCODING_MAP.items():
            if encoding in child_encodings:
                logging.info("[Sender] switch to parent encoding %s",
                             parent_encoding)
                return file_content.decode(parent_encoding).encode("utf-8")

        try:
            return file_content.decode(encoding).encode("utf-8")
        except Exception:
            # Best effort: the detector's guess was wrong or names a codec
            # Python does not know; retry with the default assumption.
            logging.error(traceback.format_exc())
            logging.info(
                "[Sender] failed to convert from %s to utf-8, fall back to default coding",
                encoding)
            return file_content.decode(_DEFAULT_ENCODING_ASSUMPTION).encode(
                "utf-8")
Пример #2
0
def makeUnicode(string,encoding='utf-8'):
	"""Return ``string`` as a unicode object.

	A value that is already unicode is returned untouched. A byte string is
	decoded with ``encoding`` first; if that fails, with the encoding guessed
	by chardet; and as a last resort with ``encoding`` again, replacing
	undecodable bytes.
	"""
	if isinstance(string,unicode): return string
	try:
		# unicode.decode(string, ...) is an unbound-method call that rejects
		# byte strings with TypeError, so the requested encoding was never
		# actually tried; decode through the constructor instead.
		ret = unicode(string,encoding)
	except (UnicodeError, LookupError, TypeError):
		pass
	else:
		if DEBUG: LOG('util.makeUnicode(): Successfully used: %s' % encoding)
		return ret

	detected_encoding = chardet.detect(string)
	if DEBUG: LOG('util.makeUnicode(): Detected encoding: %s' % detected_encoding)
	try:
		# The detected encoding may be None (TypeError) or a codec name that
		# Python does not know (LookupError).
		return unicode(string,detected_encoding['encoding'])
	except (UnicodeError, LookupError, TypeError):
		return unicode(string,encoding,'replace')
Пример #3
0
    def unicode(self, string, encoding=None):
        """Return the byte string ``string`` re-encoded as UTF-8.

        Args:
            string: encoded bytes.
            encoding: encoding to try first; when omitted or empty,
                ``self._encoding`` is used.

        Returns:
            UTF-8 encoded bytes. If the first decode fails, the encoding
            guessed by chardet is tried and reported through
            ``self.updateEncoding``; if that fails too, the bytes are read as
            UTF-8 with undecodable sequences replaced.
        """
        # Honour an explicitly supplied encoding; previously any supplied
        # value was overwritten with "utf-8".
        encoding = encoding or self._encoding

        try:
            return string.decode(encoding).encode("utf-8")
        except (UnicodeError, LookupError, TypeError):
            pass

        detected_encoding = chardet.detect(string)
        try:
            decoded = string.decode(detected_encoding["encoding"])
        except (UnicodeError, LookupError, TypeError):
            # Last resort. Decoding with "replace" cannot fail, whereas
            # calling .encode() directly on bytes triggers an implicit ASCII
            # decode that raises on any non-ASCII byte.
            return string.decode("utf-8", "replace").encode("utf-8")

        try:
            self.updateEncoding(detected_encoding["encoding"], detected_encoding["confidence"])
        except Exception:
            # Best effort: a bookkeeping failure must not lose the text that
            # was decoded successfully.
            pass
        return decoded.encode("utf-8")
Пример #4
0
 def detect_with_chardet(self):
     """Guess the text encoding with chardet.

     When ``self.txt_byte`` is non-empty, its first 500 KiB are analysed.
     Otherwise ``self.filename`` is streamed line by line into a
     ``UniversalDetector`` until the detector is done or more than 500 KiB
     have been fed.

     Returns:
         The value of the ``'encoding'`` entry in chardet's result.
     """
     len_limit = 500 * 1024
     if self.txt_byte:
         return chardet.detect(self.txt_byte[:len_limit]).get('encoding')

     detector = universaldetector.UniversalDetector()
     numbytes = 0
     # Binary mode: the detector must see the raw bytes. The with-block
     # closes the handle, which was previously leaked.
     with open(self.filename, 'rb') as f:
         for line_byte in f:
             detector.feed(line_byte)
             numbytes += len(line_byte)
             # Stop once the detector is done, or after the size cap so a
             # huge file is not read in full.
             if detector.done or numbytes > len_limit:
                 break
     detector.close()
     return detector.result.get('encoding')
Пример #5
0
 def detect_with_chardet(self):
     """Return chardet's encoding guess for this object's text.

     ``self.txt_byte`` is used when it is non-empty (only its first 500 KiB
     are analysed). Otherwise the file ``self.filename`` is fed to a
     ``UniversalDetector`` one line at a time, stopping when the detector is
     done or once more than 500 KiB have been read.

     Returns:
         The ``'encoding'`` entry of the chardet result.
     """
     len_limit = 500*1024
     if self.txt_byte:
         dct = chardet.detect(self.txt_byte[:len_limit])
         return dct.get('encoding')

     detector = universaldetector.UniversalDetector()
     numbytes = 0
     # Open in binary mode (chardet works on raw bytes) and let the
     # with-block close the file; the handle used to stay open.
     with open(self.filename, 'rb') as f:
         for line_byte in f:
             detector.feed(line_byte)
             numbytes += len(line_byte)
             if detector.done:
                 break
             if numbytes > len_limit:
                 break # don't read forever
     detector.close()
     dct = detector.result
     return dct.get('encoding')
Пример #6
0
    def txt_to_html(self, input_txt, output):
        """
        Convert txt string to file with name "output".

        ``input_txt`` is a byte string whose encoding is guessed with
        chardet. The decoded text is split on newlines into paragraphs,
        rendered through the "epuber-templates/html-template.xhtml" template
        and written to ``output`` encoded as UTF-8.
        """
        charset = chardet.detect(input_txt)
        if charset['encoding']:
            temp = input_txt.decode(charset['encoding'])
        else:
            # chardet reports encoding None when it cannot decide (e.g. for
            # empty input); decode(None) would raise TypeError, so fall back
            # to a lossy UTF-8 read instead.
            temp = input_txt.decode('utf-8', 'replace')

        # No manual HTML escaping is done here.
        # NOTE(review): presumably the template engine escapes the
        # paragraph text -- confirm.
        paragraphs = temp.split('\n')
        html_template = get_template("epuber-templates/html-template.xhtml")
        html_context = template.Context({'paragraphs': paragraphs})

        # Render before opening the file so a rendering error does not leave
        # a truncated output file behind; the with-block closes the handle
        # even if the write fails.
        rendered = html_template.render(html_context).encode("utf-8")
        with open(output, 'w') as output_file:
            output_file.write(rendered)
Пример #7
0
def convertHTMLCodes(html,FB=None,encoding=None):
	"""Re-encode the byte string ``html`` as UTF-8.

	The source encoding is ``encoding`` if given, else ``FB.getEncoding()``
	when ``FB`` is supplied, else 'utf-8'. If decoding with it fails, the
	encoding guessed by chardet is tried (and reported to
	``FB.updateEncoding``), then a lossy UTF-8 read; on this fallback path
	numeric and named HTML entities are additionally substituted through
	``cUConvert`` / ``cTConvert``.
	"""
	if not encoding:
		encoding = FB.getEncoding() if FB else 'utf-8'
	try:
		# NOTE(review): a clean decode returns here, so the entity
		# substitution at the end only runs on the fallback path -- confirm
		# this is intended.
		return html.decode(encoding).encode('utf-8')
	except (UnicodeError, LookupError, TypeError):
		pass

	detected_encoding = chardet.detect(html)
	if util.DEBUG: util.LOG(detected_encoding)
	try:
		html = html.decode(detected_encoding['encoding']).encode('utf-8')
		if FB: FB.updateEncoding(detected_encoding['encoding'],detected_encoding['confidence'])
	except Exception:
		# Last resort. Decoding with 'replace' cannot fail, whereas calling
		# .encode() directly on bytes performs an implicit ASCII decode that
		# raises on any non-ASCII byte.
		html = html.decode('utf-8','replace').encode('utf-8')

	try:
		html = re.sub(r'&#(\d{1,5});',cUConvert,html)
		html = re.sub(r'&(\w+?);',cTConvert,html)
	except Exception:
		# Best effort: keep whatever was converted so far if a callback fails.
		pass
	return html
Пример #8
0
def getTitleLink(link):
    """Fetch ``link``, extract its <title> and store the page as an article.

    Every link that reaches the title check is recorded as a ``linkitem``
    with ``valid`` 0 (title rejected by ``is_snake``) or 1. For accepted
    pages under 1 MiB the content is decoded and saved as an ``article``.

    Returns:
        ``(title, link)`` on success, where ``link`` may have been replaced
        by the response's final URL; ``(None, None)`` when the fetch fails,
        the URL is filtered out, no <title> is found, the title is rejected,
        or the content cannot be decoded.
    """
    try:
        response = webfetch(link, follow_redirects=True, deadline=30, validate_certificate=False)
    except Exception:
        return None,None

    if response is None:
        return None,None

    content = response.content

    # For hosts that short() accepts, continue with the final URL of the
    # fetch and skip targets already recorded as invalid.
    if short(urlparse.urlparse(link)[1]):
        link = response.final_url or link
        if not link.startswith('http'):
            return None,None
        if linkitem.all().filter('url =',link).filter('valid =',0).get() is not None:
            return None,None

    # filter url like "aaa.com", no site url, focus article
    uri = urlparse.urlparse(link)
    if uri[1] in filtersite:
        return None,None
    # uri[4] is the query string, uri[2] the path.
    if uri[4] == '' and uri[2] in ('', '/', '#'):
        return None,None

    title_match = re.search(r'<title>([\S\s]*?)</title>', content, re.IGNORECASE)
    if title_match is None:
        return None,None
    title = title_match.group(1)
    if title == '':
        title = link

    # Record the verdict for this URL before doing anything else with it.
    rejected = is_snake(title)  #or is_snake(content)
    url = linkitem()
    url.valid = 0 if rejected else 1
    url.url = link
    url.put()
    if rejected:
        return None,None

    # NOTE(review): HTTP normally carries the charset inside Content-Type,
    # not in a header named 'charset' -- confirm what webfetch exposes here.
    if 'charset' in response.headers:
        header_charset = response.headers['charset']
    else:
        header_charset = 'utf-8'

    # Oversized pages are not stored; the title is returned undecoded.
    if len(content) >= (1 * 1024 * 1024):
        return title, link

    def _meta_charset():
        # Raises AttributeError when the page has no matching <meta> tag;
        # the loop below treats any failure as "try the next source".
        found = re.search(r'<meta\s*http-equiv="?Content-Type"? content="text/html;\s*charset=([\w\d-]+?)"', content, re.IGNORECASE)
        return found.group(1)

    def _sniffed_charset():
        return chardet.detect(content)['encoding']

    # Try each charset source in turn: header, <meta> tag, chardet, utf-8.
    # Previously a <meta> match set the charset but left the content
    # undecoded, so raw bytes were stored in the article.
    text = None
    charset = None
    for guess in (lambda: header_charset, _meta_charset, _sniffed_charset, lambda: 'utf-8'):
        try:
            candidate = guess()
            text = content.decode(candidate, 'ignore')
        except Exception:
            continue
        charset = candidate
        break
    if text is None:
        return None,None

    # Same lenient error handling as for the content, so a stray byte in
    # the title cannot abort the whole call.
    title = title.decode(charset, 'ignore')

    art = article()
    art.title = title
    art.txt = text
    art.url = link
    art.put()

    return title, link
Пример #9
0
def getTitleLink(link):
    """Fetch ``link``, extract its <title> and store the page as an article.

    Pages with an empty title or a known error-page title are dropped.
    Every other link that reaches the ``is_snake`` check is recorded as a
    ``linkitem`` with ``valid`` 0 (title rejected) or 1. For accepted pages
    under 1 MiB the content is decoded and saved as an ``article``.

    Returns:
        ``(title, link)`` on success, where ``link`` may have been replaced
        by the response's final URL; ``(None, None)`` when the fetch fails,
        the URL is filtered out, the title is missing or rejected, or the
        content cannot be decoded.
    """
    try:
        response = webfetch(link, follow_redirects=True, deadline=30, validate_certificate=False)
    except Exception:
        return None, None

    if response is None:
        return None, None

    content = response.content

    # For hosts that short() accepts, continue with the final URL of the
    # fetch and skip targets already recorded as invalid.
    if short(urlparse.urlparse(link)[1]):
        link = response.final_url or link
        if not link.startswith("http"):
            return None, None
        if linkitem.all().filter("url =", link).filter("valid =", 0).get() is not None:
            return None, None

    # filter url like "aaa.com", no site url, focus article
    uri = urlparse.urlparse(link)
    if uri[1] in filtersite:
        return None, None
    # uri[4] is the query string, uri[2] the path.
    if uri[4] == "" and uri[2] in ("", "/", "#"):
        return None, None

    title_match = re.search(r"<title>([\S\s]*?)</title>", content, re.IGNORECASE)
    if title_match is None:
        return None, None
    title = title_match.group(1)

    if title == "":
        return None, None
    # str.startswith accepts a tuple, so both error-page titles are checked
    # in one call.
    if title.startswith(("Page Not Found", "Oops! Something Bad Happened")):
        return None, None

    # Record the verdict for this URL before doing anything else with it.
    rejected = is_snake(title)  # or is_snake(content)
    url = linkitem()
    url.valid = 0 if rejected else 1
    url.url = link
    url.put()
    if rejected:
        return None, None

    # NOTE(review): HTTP normally carries the charset inside Content-Type,
    # not in a header named "charset" -- confirm what webfetch exposes here.
    if "charset" in response.headers:
        header_charset = response.headers["charset"]
    else:
        header_charset = "utf-8"

    # Oversized pages are not stored; the title is returned undecoded.
    if len(content) >= (1 * 1024 * 1024):
        return title, link

    def _meta_charset():
        # Raises AttributeError when the page has no matching <meta> tag;
        # the loop below treats any failure as "try the next source".
        found = re.search(
            r'<meta\s*http-equiv="?Content-Type"? content="text/html;\s*charset=([\w\d-]+?)"', content, re.IGNORECASE
        )
        return found.group(1)

    def _sniffed_charset():
        return chardet.detect(content)["encoding"]

    # Try each charset source in turn: header, <meta> tag, chardet, utf-8.
    # Previously a <meta> match set the charset but left the content
    # undecoded, so raw bytes were stored in the article.
    text = None
    charset = None
    for guess in (lambda: header_charset, _meta_charset, _sniffed_charset, lambda: "utf-8"):
        try:
            candidate = guess()
            text = content.decode(candidate, "ignore")
        except Exception:
            continue
        charset = candidate
        break
    if text is None:
        return None, None

    # Same lenient error handling as for the content, so a stray byte in
    # the title cannot abort the whole call.
    title = title.decode(charset, "ignore")

    art = article()
    art.title = title
    art.txt = text
    art.url = link
    art.put()

    return title, link