def __init__(self):
    """Initialise the group prober and install its sub-probers.

    Runs the ``CharSetGroupProber`` initialiser, creates one instance of
    each prober class listed below (in this order), stores them in
    ``self._mProbers`` and finally calls ``self.reset()``.
    """
    CharSetGroupProber.__init__(self)
    # Order is preserved: probers are instantiated left to right.
    prober_classes = (
        UTF8Prober,
        SJISProber,
        EUCJPProber,
        GB2312Prober,
        EUCKRProber,
        Big5Prober,
        EUCTWProber,
    )
    self._mProbers = [prober_class() for prober_class in prober_classes]
    self.reset()
def __init__(self):
    """Initialise the group prober and install its sub-probers.

    Runs the ``CharSetGroupProber`` initialiser, creates one instance of
    each prober class listed below (in this order), stores them in
    ``self._mProbers`` and finally calls ``self.reset()``.
    """
    CharSetGroupProber.__init__(self)
    # Order is preserved: probers are instantiated left to right.
    prober_classes = (
        UTF8Prober,
        SJISProber,
        EUCJPProber,
        GB18030Prober,
        CP949Prober,
        Big5Prober,
        EUCTWProber,
    )
    self._mProbers = [prober_class() for prober_class in prober_classes]
    self.reset()
def feed(self, aBuf): if isinstance(aBuf, unicode): self.result = {'encoding': "unicode", 'confidence': 1.0} self.done = constants. True return if self.done: return aLen = len(aBuf) if not aLen: return if not self._mGotData: # If the data starts with BOM, we know it is UTF if aBuf[:3] == '\xEF\xBB\xBF': # EF BB BF UTF-8 with BOM self.result = {'encoding': "utf_8", 'confidence': 1.0} elif aBuf[:4] in ('\xFF\xFE\x00\x00', '\x00\x00\xFE\xFF', '\xFE\xFF\x00\x00', '\x00\x00\xFF\xFE') or \ aBuf[:2] in ('\xFF\xFE', '\xFE\xFF'): self.result = {'encoding': "utf_n", 'confidence': 1.0} self._mGotData = constants. True if self.result['encoding'] and (self.result['confidence'] > 0.0): self.done = constants. True return if self._mInputState == ePureAscii: if self._highBitDetector.search(aBuf): self._mInputState = eHighbyte elif self._escDetector.search(self._mLastChar + aBuf): self._mInputState = eEscAscii self._mLastChar = aBuf[-1] if self._mInputState == eEscAscii: self.result = {'encoding': "escaped", 'confidence': 1.0} self.done = constants. True elif self._mInputState == eHighbyte: if not self._mCharSetProbers: self._mCharSetProbers = [UTF8Prober(), Latin1Prober()] for prober in self._mCharSetProbers: if prober.feed(aBuf) == constants.eFoundIt: self.result = { 'encoding': prober.get_charset_name(), 'confidence': prober.get_confidence() } self.done = constants. True break