def get_order(self, aStr):
    """Return the frequency-table order of a two-byte EUC-JP character.

    Characters whose lead byte is at least 0xA0 are mapped onto a
    94-column grid (rows counted from lead byte 0xA1, columns from
    trail byte 0xA1).  Any other lead byte yields -1.

    No validation is performed here; per the original note, the state
    machine has already done that.
    """
    lead = aStr[0]
    if lead < _bytechar(0xA0):
        return -1
    row = _byteord(lead) - 0xA1
    col = _byteord(aStr[1]) - 0xA1
    return 94 * row + col
def get_order(self, aStr):
    """Return the frequency-table order of a two-byte EUC-TW character.

    Only lead bytes from 0xC4 upwards are of interest; they are mapped
    onto a 94-column grid with trail bytes counted from 0xA1.  Any
    lower lead byte yields -1.

    No validation is performed here; per the original note, the state
    machine has already done that.
    """
    lead = aStr[0]
    if lead < _bytechar(0xC4):
        return -1
    row = _byteord(lead) - 0xC4
    col = _byteord(aStr[1]) - 0xA1
    return 94 * row + col
def get_order(self, aStr):
    """Return the frequency-table order of a two-byte GB2312 character.

    The character is of interest only when its lead byte is at least
    0xB0 and its trail byte is at least 0xA1; it is then mapped onto a
    94-column grid.  Anything else yields -1.

    No validation is performed here; per the original note, the state
    machine has already done that.
    """
    # The trail byte is only inspected once the lead byte qualifies.
    if aStr[0] < _bytechar(0xB0) or aStr[1] < _bytechar(0xA1):
        return -1
    row = _byteord(aStr[0]) - 0xB0
    col = _byteord(aStr[1]) - 0xA1
    return 94 * row + col
def get_order(self, aStr):
    """Return the frequency-table order of a two-byte Big5 character.

    Lead bytes below 0xA4 yield -1.  Otherwise each lead byte owns a
    row of 157 slots: trail bytes counted from 0x40 fill the start of
    the row, and trail bytes from 0xA1 upwards are placed 63 slots in.

    No validation is performed here; per the original note, the state
    machine has already done that.
    """
    lead = aStr[0]
    if lead < _bytechar(0xA4):
        return -1
    trail = aStr[1]
    if trail >= _bytechar(0xA1):
        # High trail range sits after the 63 slots of the low range.
        return 157 * (_byteord(lead) - 0xA4) + _byteord(trail) - 0xA1 + 63
    return 157 * (_byteord(lead) - 0xA4) + _byteord(trail) - 0x40
def get_order(self, aStr):
    """Map a two-byte Big5 character to its frequency-table order.

    Returns -1 when the lead byte is below 0xA4.  For other lead bytes
    the result is ``157 * (lead - 0xA4)`` plus a column offset: the
    trail byte minus 0x40 for the low trail range, or the trail byte
    minus 0xA1 plus 63 when the trail byte is 0xA1 or higher.

    No validation is performed here; per the original note, the state
    machine has already done that.
    """
    first, high_trail_start = aStr[0], _bytechar(0xA1)
    if not first >= _bytechar(0xA4):
        return -1
    second = aStr[1]
    in_high_range = second >= high_trail_start
    row_base = 157 * (_byteord(first) - 0xA4)
    if in_high_range:
        return row_base + _byteord(second) - 0xA1 + 63
    return row_base + _byteord(second) - 0x40
def get_order(self, aStr):
    """Return the frequency-table order of a two-byte Shift_JIS character.

    Lead bytes 0x81-0x9F map to rows 0-30 and lead bytes 0xE0-0xEF to
    rows 31 onwards, each row being 188 slots wide; any other lead byte
    yields -1.  The column is the trail byte counted from 0x40, but a
    trail byte above 0x7F makes the whole result -1.

    No validation is performed here; per the original note, the state
    machine has already done that.
    """
    lead = aStr[0]
    if _bytechar(0x81) <= lead <= _bytechar(0x9F):
        row = _byteord(lead) - 0x81
    elif _bytechar(0xE0) <= lead <= _bytechar(0xEF):
        row = _byteord(lead) - 0xE0 + 31
    else:
        return -1
    trail = aStr[1]
    order = 188 * row + _byteord(trail) - 0x40
    # Characters with a trail byte above 0x7F are not counted.
    return -1 if trail > _bytechar(0x7F) else order
def get_order(self, aStr):
    """Map a two-byte Shift_JIS character to its frequency-table order.

    The lead byte selects a 188-slot row: 0x81-0x9F give rows counted
    from zero, 0xE0-0xEF continue from row 31, and every other lead
    byte returns -1 immediately.  The trail byte (counted from 0x40)
    selects the column; if it is greater than 0x7F the result is -1.

    No validation is performed here; per the original note, the state
    machine has already done that.
    """
    first = aStr[0]
    in_low_block = _bytechar(0x81) <= first <= _bytechar(0x9F)
    if in_low_block:
        base = 188 * (_byteord(first) - 0x81)
    else:
        in_high_block = _bytechar(0xE0) <= first <= _bytechar(0xEF)
        if not in_high_block:
            return -1
        base = 188 * (_byteord(first) - 0xE0 + 31)
    result = base + _byteord(aStr[1]) - 0x40
    if aStr[1] > _bytechar(0x7F):
        result = -1
    return result
def next_state(self, c):
    """Advance the state machine by one byte and return the new state.

    The byte is first translated to its class via the model's
    ``classTable``; a byte value outside that table returns ``eError``
    without touching the machine.  When the machine is at ``eStart``
    the byte position is reset and the expected character length is
    read from ``charLenTable``.  The next state is then looked up in
    ``stateTable`` and the byte position is incremented.
    """
    model = self._mModel
    try:
        byte_class = model['classTable'][_byteord(c)]
    except IndexError:
        return eError

    if self._mCurrentState == eStart:
        # First byte of a character: restart counting and fetch length.
        self._mCurrentBytePos = 0
        self._mCurrentCharLen = model['charLenTable'][byte_class]

    # The state table is flattened: one row of classFactor entries per state.
    table_index = self._mCurrentState * model['classFactor'] + byte_class
    self._mCurrentState = model['stateTable'][table_index]
    self._mCurrentBytePos += 1
    return self._mCurrentState
def next_state(self, c):
    """Feed one byte to the state machine and return the resulting state.

    Looks up the byte's class in ``classTable`` (returning ``eError``
    if the byte value is not covered by the table), initialises the
    per-character bookkeeping when starting from ``eStart``, and then
    moves to the state given by ``stateTable`` for the current state
    and byte class.
    """
    try:
        cls_of_byte = self._mModel['classTable'][_byteord(c)]
    except IndexError:
        return eError

    at_char_start = self._mCurrentState == eStart
    if at_char_start:
        self._mCurrentBytePos = 0
        self._mCurrentCharLen = self._mModel['charLenTable'][cls_of_byte]

    # Combine current state and byte class into a flat table offset.
    row_offset = self._mCurrentState * self._mModel['classFactor']
    new_state = self._mModel['stateTable'][row_offset + cls_of_byte]
    self._mCurrentState = new_state
    self._mCurrentBytePos += 1
    return self._mCurrentState
def feed(self, aBuf):
    """Update the Latin-1 class-pair statistics from *aBuf*.

    The buffer is first passed through ``filter_with_english_letters``.
    Each remaining byte is mapped to a character class, and the pair
    (previous class, current class) is looked up in
    ``Latin1ClassModel``.  A pair rated 0 sets the state to
    ``constants.eNotMe`` and stops processing; otherwise the counter
    for that rating is incremented.

    Returns ``constants.eError`` if a byte value falls outside
    ``Latin1_CharToClass``, else the prober state from ``get_state()``.
    """
    filtered = self.filter_with_english_letters(aBuf)
    for byte in filtered:
        try:
            current_class = Latin1_CharToClass[_byteord(byte)]
        except IndexError:
            return constants.eError

        pair_index = (self._mLastCharClass * CLASS_NUM) + current_class
        likelihood = Latin1ClassModel[pair_index]
        if likelihood == 0:
            # This class pair rules the charset out.
            self._mState = constants.eNotMe
            break

        self._mFreqCounter[likelihood] += 1
        self._mLastCharClass = current_class

    return self.get_state()
def get_order(self, aStr):
    """Return ``(order, charLen)`` for the character at the start of *aStr*.

    ``charLen`` is 2 when the first byte is a Shift_JIS double-byte
    lead byte (0x81-0x9F or 0xE0-0xFC) and 1 otherwise.  ``order`` is
    the hiragana index (trail byte minus 0x9F) when the character is in
    the hiragana row, i.e. lead byte 0x82 with trail byte 0x9F-0xF1;
    for every other character it is -1.

    An empty *aStr* yields ``(-1, 1)``.
    """
    if not aStr:
        return -1, 1

    # Find out the current character's byte length.
    try:
        if (_bytechar(0x81) <= aStr[0] <= _bytechar(0x9F)) or \
           (_bytechar(0xE0) <= aStr[0] <= _bytechar(0xFC)):
            charLen = 2
        else:
            charLen = 1
    except UnicodeDecodeError:
        return -1, 1

    # Return its order if it is hiragana.
    if len(aStr) > 1:
        # The hiragana lead byte is 0x82 (octal 202).  The previous code
        # compared against decimal 202 (0xCA), which is not a double-byte
        # lead byte, so this branch could never match real hiragana.
        if (aStr[0] == _bytechar(0x82)) and \
           (_bytechar(0x9F) <= aStr[1] <= _bytechar(0xF1)):
            return _byteord(aStr[1]) - 0x9F, charLen

    return -1, charLen
def feed(self, aBuf):
    """Update the single-byte sequence statistics from *aBuf*.

    Unless the model sets ``keepEnglishLetter``, the buffer is first
    passed through ``filter_without_english_letters``.  Each byte is
    mapped to an order via ``charToOrderMap``; orders below
    ``SYMBOL_CAT_ORDER`` count towards the total, and orders below
    ``SAMPLE_SIZE`` count as frequent characters.  When both the
    previous and current orders are below ``SAMPLE_SIZE`` the pair is
    rated through ``precedenceMatrix`` (with the pair swapped when the
    prober is reversed) and the matching sequence counter is bumped.

    While still detecting and once enough sequences have been seen, the
    confidence is compared with the positive and negative shortcut
    thresholds to settle the state early.

    Returns ``constants.eError`` if a byte value falls outside
    ``charToOrderMap``, else the prober state from ``get_state()``.
    """
    model = self._mModel
    if not model['keepEnglishLetter']:
        aBuf = self.filter_without_english_letters(aBuf)
    if len(aBuf) == 0:
        return self.get_state()

    for byte in aBuf:
        try:
            order = model['charToOrderMap'][_byteord(byte)]
        except IndexError:
            return constants.eError

        if order < SYMBOL_CAT_ORDER:
            self._mTotalChar += 1
        if order < SAMPLE_SIZE:
            self._mFreqChar += 1
            previous = self._mLastOrder
            if previous < SAMPLE_SIZE:
                self._mTotalSeqs += 1
                if self._mReversed:
                    # reverse the order of the letters in the lookup
                    pair_index = (order * SAMPLE_SIZE) + previous
                else:
                    pair_index = (previous * SAMPLE_SIZE) + order
                category = model['precedenceMatrix'][pair_index]
                self._mSeqCounters[category] += 1
        self._mLastOrder = order

    still_detecting = self.get_state() == constants.eDetecting
    if still_detecting and self._mTotalSeqs > SB_ENOUGH_REL_THRESHOLD:
        cf = self.get_confidence()
        if cf > POSITIVE_SHORTCUT_THRESHOLD:
            if constants._debug:
                sys.stderr.write(
                    '%s confidence = %s, we have a winner\n'
                    % (model['charsetName'], cf))
            self._mState = constants.eFoundIt
        elif cf < NEGATIVE_SHORTCUT_THRESHOLD:
            if constants._debug:
                sys.stderr.write(
                    '%s confidence = %s, below negative shortcut threshhold %s\n'
                    % (model['charsetName'], cf,
                       NEGATIVE_SHORTCUT_THRESHOLD))
            self._mState = constants.eNotMe

    return self.get_state()
def get_order(self, aStr):
    """Return ``(order, charLen)`` for the character at the start of *aStr*.

    ``charLen`` is 2 when the first byte is 0x8E or lies in 0xA1-0xFE,
    3 when it is 0x8F, and 1 otherwise.  ``order`` is the hiragana
    index (second byte minus 0xA1) when the first byte is 0xA4 and the
    second byte lies in 0xA1-0xF3; for every other character it is -1.

    An empty *aStr* yields ``(-1, 1)``.
    """
    if not aStr:
        return -1, 1

    # Determine how many bytes the current character occupies.
    try:
        lead = aStr[0]
        if lead == _bytechar(0x8E) or \
                _bytechar(0xA1) <= lead <= _bytechar(0xFE):
            charLen = 2
        elif lead == _bytechar(0x8F):
            charLen = 3
        else:
            charLen = 1
    except UnicodeDecodeError:
        return -1, 1

    # Only hiragana characters get a non-negative order.
    is_hiragana = (
        len(aStr) > 1
        and lead == _bytechar(0xA4)
        and _bytechar(0xA1) <= aStr[1] <= _bytechar(0xF3)
    )
    if is_hiragana:
        return _byteord(aStr[1]) - 0xA1, charLen
    return -1, charLen
def feed(self, aBuf):
    """Feed *aBuf* to the single-byte prober and return its state.

    English letters are filtered out first unless the model's
    ``keepEnglishLetter`` flag is set.  For every byte the order from
    ``charToOrderMap`` drives three tallies: total characters (order
    below ``SYMBOL_CAT_ORDER``), frequent characters (order below
    ``SAMPLE_SIZE``), and sequence counters keyed by the
    ``precedenceMatrix`` rating of the (previous, current) order pair;
    the pair is looked up swapped when the prober is reversed.

    After the loop, if the prober is still detecting and has seen more
    than ``SB_ENOUGH_REL_THRESHOLD`` sequences, a confidence above the
    positive shortcut threshold marks the charset as found and one
    below the negative threshold rules it out.

    Returns ``constants.eError`` if a byte value is not covered by
    ``charToOrderMap``.
    """
    if not self._mModel['keepEnglishLetter']:
        aBuf = self.filter_without_english_letters(aBuf)
    if not len(aBuf):
        return self.get_state()

    for ch in aBuf:
        try:
            cur = self._mModel['charToOrderMap'][_byteord(ch)]
        except IndexError:
            return constants.eError

        if cur < SYMBOL_CAT_ORDER:
            self._mTotalChar += 1

        if cur < SAMPLE_SIZE:
            self._mFreqChar += 1
            if self._mLastOrder < SAMPLE_SIZE:
                self._mTotalSeqs += 1
                # When reversed, the letters swap places in the lookup.
                if not self._mReversed:
                    first, second = self._mLastOrder, cur
                else:
                    first, second = cur, self._mLastOrder
                matrix = self._mModel['precedenceMatrix']
                self._mSeqCounters[matrix[(first * SAMPLE_SIZE) + second]] += 1

        self._mLastOrder = cur

    if self.get_state() != constants.eDetecting:
        return self.get_state()

    if self._mTotalSeqs > SB_ENOUGH_REL_THRESHOLD:
        cf = self.get_confidence()
        if cf > POSITIVE_SHORTCUT_THRESHOLD:
            if constants._debug:
                winner_args = (self._mModel['charsetName'], cf)
                sys.stderr.write(
                    '%s confidence = %s, we have a winner\n' % winner_args)
            self._mState = constants.eFoundIt
        elif cf < NEGATIVE_SHORTCUT_THRESHOLD:
            if constants._debug:
                loser_args = (self._mModel['charsetName'], cf,
                              NEGATIVE_SHORTCUT_THRESHOLD)
                sys.stderr.write(
                    '%s confidence = %s, below negative shortcut threshhold %s\n'
                    % loser_args)
            self._mState = constants.eNotMe

    return self.get_state()