Пример #1
0
 def get_order(self, aStr):
     """Return the frequency-table order of an EUC-JP character.

     Only the first two items of ``aStr`` are read.  The bytes of interest
     are 0xa0 -- 0xfe for the first byte and 0xa1 -- 0xfe for the second.
     Nothing is validated here; the state machine has done that already.

     Returns -1 when the first byte is below 0xA0.
     """
     lead = aStr[0]
     if not (lead >= _bytechar(0xA0)):
         return -1
     # Each first byte spans 94 consecutive orders, counted from 0xA1.
     row = _byteord(lead) - 0xA1
     cell = _byteord(aStr[1]) - 0xa1
     return 94 * row + cell
Пример #2
0
 def get_order(self, aStr):
     """Return the frequency-table order of an EUC-TW character.

     Bytes of interest: first byte 0xc4 -- 0xfe, second byte 0xa1 -- 0xfe.
     No validation is performed; the state machine has done that already.

     Returns -1 when the first byte is below 0xC4.
     """
     if aStr[0] >= _bytechar(0xC4):
         first, second = _byteord(aStr[0]), _byteord(aStr[1])
         # 94 orders per first byte, rows counted from 0xC4.
         return 94 * (first - 0xC4) + second - 0xA1
     return -1
Пример #3
0
 def get_order(self, aStr):
     """Return the frequency-table order of a GB2312 character.

     Bytes of interest: first byte 0xb0 -- 0xfe, second byte 0xa1 -- 0xfe.
     No validation is performed; the state machine has done that already.

     Returns -1 unless both bytes reach the lower bound of their range.
     """
     if aStr[0] >= _bytechar(0xB0):
         # The second byte is only looked at once the first one qualifies.
         if aStr[1] >= _bytechar(0xA1):
             row = _byteord(aStr[0]) - 0xB0
             return 94 * row + _byteord(aStr[1]) - 0xA1
     return -1
Пример #4
0
 def get_order(self, aStr):
     """Return the frequency-table order of a Big5 character.

     Bytes of interest: first byte 0xa4 -- 0xfe; second byte 0x40 -- 0x7e
     or 0xa1 -- 0xfe.  No validation is performed; the state machine has
     done that already.

     Returns -1 when the first byte is below 0xA4.
     """
     if not (aStr[0] >= _bytechar(0xA4)):
         return -1
     # Every first byte owns 157 consecutive orders.
     base = 157 * (_byteord(aStr[0]) - 0xA4)
     if aStr[1] >= _bytechar(0xA1):
         # The upper second-byte range sits after the 63 lower slots.
         return base + _byteord(aStr[1]) - 0xA1 + 63
     return base + _byteord(aStr[1]) - 0x40
Пример #5
0
 def get_order(self, aStr):
     """Map a Big5 character to its position in the frequency table.

     Interesting bytes are 0xa4 -- 0xfe for the first byte and
     0x40 -- 0x7e / 0xa1 -- 0xfe for the second.  The state machine has
     already validated the input, so no checks are repeated here.

     Returns -1 for a first byte below 0xA4.
     """
     lead = aStr[0]
     if lead >= _bytechar(0xA4):
         trail = aStr[1]
         if trail >= _bytechar(0xA1):
             # upper second-byte range follows the 63 lower slots
             column = _byteord(trail) - 0xA1 + 63
         else:
             column = _byteord(trail) - 0x40
         return 157 * (_byteord(lead) - 0xA4) + column
     return -1
Пример #6
0
 def get_order(self, aStr):
     """Return the frequency-table order of a Shift_JIS character.

     Bytes of interest as handled below:
       first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xef
       second byte range: 0x40 -- 0x7e , and the bytes above 0x7f
     No validation is needed here; the state machine has done that.

     Returns -1 when the first byte is outside both first-byte ranges.
     """
     if (_bytechar(0x81) <= aStr[0] <= _bytechar(0x9F)):
         order = 188 * (_byteord(aStr[0]) - 0x81)
     elif (_bytechar(0xE0) <= aStr[0] <= _bytechar(0xEF)):
         # the 31 first bytes 0x81 -- 0x9f occupy the preceding rows
         order = 188 * (_byteord(aStr[0]) - 0xE0 + 31)
     else:
         return -1
     order = order + _byteord(aStr[1]) - 0x40
     if aStr[1] > _bytechar(0x7F):
         # 0x7f is never a second byte, so everything above it moves down
         # one slot to keep the 188 orders of a row contiguous.  Setting
         # the order to -1 here would discard every such character.
         order -= 1
     return order
Пример #7
0
 def get_order(self, aStr):
     """Map a Shift_JIS character to its position in the frequency table.

     Bytes of interest as handled below:
       first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xef
       second byte range: 0x40 -- 0x7e , and the bytes above 0x7f
     No validation is needed here; the state machine has done that.

     Returns -1 when the first byte is outside both first-byte ranges.
     """
     if (_bytechar(0x81) <= aStr[0] <= _bytechar(0x9F)):
         order = 188 * (_byteord(aStr[0]) - 0x81)
     elif (_bytechar(0xE0) <= aStr[0] <= _bytechar(0xEF)):
         # rows for 0xe0 and up follow the 31 rows of 0x81 -- 0x9f
         order = 188 * (_byteord(aStr[0]) - 0xE0 + 31)
     else:
         return -1
     order = order + _byteord(aStr[1]) - 0x40
     if aStr[1] > _bytechar(0x7F):
         # Close the gap left by 0x7f, which is never a second byte, so a
         # row stays 188 contiguous orders.  Assigning -1 here would throw
         # away every character whose second byte is in the upper range.
         order -= 1
     return order
Пример #8
0
 def next_state(self, c):
     """Feed one byte to the state machine and return the resulting state.

     The byte is first mapped to its class through the model's
     ``classTable``.  If that lookup raises IndexError, ``eError`` is
     returned and no attribute of the machine is modified.
     """
     model = self._mModel
     try:
         byte_class = model['classTable'][_byteord(c)]
     except IndexError:
         return eError
     if self._mCurrentState == eStart:
         # First byte of a character: restart the position counter and
         # record the length associated with this byte class.
         self._mCurrentBytePos = 0
         self._mCurrentCharLen = model['charLenTable'][byte_class]
     # The state table is flat: one row of classFactor entries per state.
     transitions = model['stateTable']
     slot = self._mCurrentState * model['classFactor'] + byte_class
     self._mCurrentState = transitions[slot]
     self._mCurrentBytePos += 1
     return self._mCurrentState
Пример #9
0
 def next_state(self, c):
     """Advance the coding state machine by the byte ``c``.

     Returns the new current state, or ``eError`` (leaving the machine
     untouched) when looking up the byte's class raises IndexError.
     """
     # Every byte is classified first.
     try:
         cls_index = self._mModel['classTable'][_byteord(c)]
     except IndexError:
         return eError
     at_start = self._mCurrentState == eStart
     if at_start:
         # For the first byte the character length is looked up as well.
         self._mCurrentBytePos = 0
         self._mCurrentCharLen = self._mModel['charLenTable'][cls_index]
     # The current state and the byte class together select the next state.
     state_table = self._mModel['stateTable']
     row_start = self._mCurrentState * self._mModel['classFactor']
     self._mCurrentState = state_table[row_start + cls_index]
     self._mCurrentBytePos += 1
     return self._mCurrentState
Пример #10
0
    def feed(self, aBuf):
        """Process a buffer and return the prober state.

        The buffer is first passed through ``filter_with_english_letters``.
        Each remaining item is mapped to a character class, and the pair
        (previous class, current class) selects a frequency category from
        ``Latin1ClassModel``.  Category 0 switches the prober to
        ``constants.eNotMe`` and stops the scan.

        Returns ``constants.eError`` if a class lookup raises IndexError,
        otherwise the value of ``get_state()``.
        """
        filtered = self.filter_with_english_letters(aBuf)
        for item in filtered:
            try:
                current_class = Latin1_CharToClass[_byteord(item)]
            except IndexError:
                return constants.eError
            pair_index = (self._mLastCharClass * CLASS_NUM) + current_class
            category = Latin1ClassModel[pair_index]
            if category != 0:
                self._mFreqCounter[category] += 1
                self._mLastCharClass = current_class
                continue
            # A category of zero rules this prober out.
            self._mState = constants.eNotMe
            break

        return self.get_state()
Пример #11
0
    def feed(self, aBuf):
        """Scan ``aBuf`` and update the class-pair frequency counters.

        The input goes through ``filter_with_english_letters`` before it
        is scanned.  A pair of consecutive character classes whose
        ``Latin1ClassModel`` entry is 0 sets the state to
        ``constants.eNotMe`` and ends the scan early.

        Returns ``constants.eError`` when a character-class lookup raises
        IndexError; otherwise returns ``get_state()``.
        """
        for ch in self.filter_with_english_letters(aBuf):
            try:
                klass = Latin1_CharToClass[_byteord(ch)]
            except IndexError:
                return constants.eError
            # Row = previous class, column = current class.
            likelihood = Latin1ClassModel[
                (self._mLastCharClass * CLASS_NUM) + klass]
            if likelihood == 0:
                self._mState = constants.eNotMe
                break
            self._mFreqCounter[likelihood] += 1
            self._mLastCharClass = klass

        return self.get_state()
Пример #12
0
    def get_order(self, aStr):
        """Return ``(order, charLen)`` for the Shift_JIS character at the
        start of ``aStr``.

        ``charLen`` is 2 when the first byte is in 0x81 -- 0x9f or
        0xe0 -- 0xfc and 1 otherwise.  ``order`` is the second byte minus
        0x9f when the character is hiragana (first byte 0x82, second byte
        0x9f -- 0xf1) and -1 for everything else.

        Returns ``(-1, 1)`` for empty input or when the byte comparison
        raises UnicodeDecodeError.
        """
        if not aStr:
            return -1, 1
        # find out current char's byte length
        try:
            if (_bytechar(0x81) <= aStr[0] <= _bytechar(0x9F)) or \
               (_bytechar(0xE0) <= aStr[0] <= _bytechar(0xFC)):
                charLen = 2
            else:
                charLen = 1
        except UnicodeDecodeError:
            return -1, 1

        # return its order if it is hiragana
        if len(aStr) > 1:
            # Hiragana live under first byte 0x82 (octal 202).  Comparing
            # against decimal 202 (0xCA) would never match hiragana and
            # would disagree with the two-byte ranges checked above.
            if (aStr[0] == _bytechar(0x82)) and \
               (_bytechar(0x9F) <= aStr[1] <= _bytechar(0xF1)):
                return _byteord(aStr[1]) - 0x9F, charLen

        return -1, charLen
Пример #13
0
    def feed(self, aBuf):
        """Update the character and sequence statistics from ``aBuf``.

        Unless the model sets ``keepEnglishLetter``, the buffer is first
        passed through ``filter_without_english_letters``.  Each item is
        mapped to an order via the model's ``charToOrderMap``; pairs of
        consecutive orders below ``SAMPLE_SIZE`` are counted by their
        ``precedenceMatrix`` category.  Once enough sequences have been
        seen, the confidence may move the state to ``eFoundIt`` or
        ``eNotMe``.

        Returns ``constants.eError`` if an order lookup raises IndexError,
        otherwise the value of ``get_state()``.
        """
        if not self._mModel['keepEnglishLetter']:
            aBuf = self.filter_without_english_letters(aBuf)
        if len(aBuf) == 0:
            return self.get_state()

        for item in aBuf:
            try:
                order = self._mModel['charToOrderMap'][_byteord(item)]
            except IndexError:
                return constants.eError
            if order < SYMBOL_CAT_ORDER:
                self._mTotalChar += 1
            if order < SAMPLE_SIZE:
                self._mFreqChar += 1
                if self._mLastOrder < SAMPLE_SIZE:
                    self._mTotalSeqs += 1
                    if self._mReversed:
                        # reverse the order of the letters in the lookup
                        cell = (order * SAMPLE_SIZE) + self._mLastOrder
                    else:
                        cell = (self._mLastOrder * SAMPLE_SIZE) + order
                    category = self._mModel['precedenceMatrix'][cell]
                    self._mSeqCounters[category] += 1
            self._mLastOrder = order

        if (self.get_state() == constants.eDetecting
                and self._mTotalSeqs > SB_ENOUGH_REL_THRESHOLD):
            confidence = self.get_confidence()
            if confidence > POSITIVE_SHORTCUT_THRESHOLD:
                if constants._debug:
                    message = '%s confidence = %s, we have a winner\n' % (
                        self._mModel['charsetName'], confidence)
                    sys.stderr.write(message)
                self._mState = constants.eFoundIt
            elif confidence < NEGATIVE_SHORTCUT_THRESHOLD:
                if constants._debug:
                    message = (
                        '%s confidence = %s, below negative shortcut threshhold %s\n'
                        % (self._mModel['charsetName'], confidence,
                           NEGATIVE_SHORTCUT_THRESHOLD))
                    sys.stderr.write(message)
                self._mState = constants.eNotMe

        return self.get_state()
Пример #14
0
    def get_order(self, aStr):
        """Return ``(order, charLen)`` for the EUC-JP character at the
        start of ``aStr``.

        ``charLen`` is 2 for a first byte of 0x8e or 0xa1 -- 0xfe, 3 for
        0x8f and 1 otherwise.  ``order`` is the second byte minus 0xa1 for
        hiragana (first byte 0xa4, second byte 0xa1 -- 0xf3) and -1 for
        anything else.

        Returns ``(-1, 1)`` for empty input or when the byte comparison
        raises UnicodeDecodeError.
        """
        if not aStr:
            return -1, 1

        # Work out how many bytes the current character occupies.
        try:
            lead = aStr[0]
            if lead == _bytechar(0x8E):
                charLen = 2
            elif _bytechar(0xA1) <= lead <= _bytechar(0xFE):
                charLen = 2
            elif lead == _bytechar(0x8F):
                charLen = 3
            else:
                charLen = 1
        except UnicodeDecodeError:
            return -1, 1

        # Hiragana get a real order; everything else is reported as -1.
        if (len(aStr) > 1
                and aStr[0] == _bytechar(0xA4)
                and _bytechar(0xA1) <= aStr[1] <= _bytechar(0xF3)):
            return _byteord(aStr[1]) - 0xA1, charLen

        return -1, charLen
Пример #15
0
    def feed(self, aBuf):
        """Feed a buffer to the prober and return its state.

        When the model does not set ``keepEnglishLetter`` the buffer is
        filtered with ``filter_without_english_letters`` first.  Items are
        translated to orders through ``charToOrderMap``; consecutive orders
        below ``SAMPLE_SIZE`` increment the sequence counter selected by
        ``precedenceMatrix``.  After the scan, while still detecting and
        with enough sequences collected, the confidence is compared to the
        positive and negative shortcut thresholds to settle the state.

        Returns ``constants.eError`` if an order lookup raises IndexError,
        otherwise the value of ``get_state()``.
        """
        if not self._mModel['keepEnglishLetter']:
            aBuf = self.filter_without_english_letters(aBuf)
        if len(aBuf) == 0:
            return self.get_state()

        for symbol in aBuf:
            try:
                order = self._mModel['charToOrderMap'][_byteord(symbol)]
            except IndexError:
                return constants.eError

            if order < SYMBOL_CAT_ORDER:
                self._mTotalChar += 1
            if order < SAMPLE_SIZE:
                self._mFreqChar += 1
                if self._mLastOrder < SAMPLE_SIZE:
                    self._mTotalSeqs += 1
                    # Reversed models swap the two letters in the lookup.
                    if not self._mReversed:
                        first, second = self._mLastOrder, order
                    else:
                        first, second = order, self._mLastOrder
                    matrix = self._mModel['precedenceMatrix']
                    self._mSeqCounters[
                        matrix[(first * SAMPLE_SIZE) + second]] += 1
            self._mLastOrder = order

        if self.get_state() == constants.eDetecting:
            enough_data = self._mTotalSeqs > SB_ENOUGH_REL_THRESHOLD
            if enough_data:
                cf = self.get_confidence()
                name = None
                if cf > POSITIVE_SHORTCUT_THRESHOLD:
                    if constants._debug:
                        name = self._mModel['charsetName']
                        sys.stderr.write(
                            '%s confidence = %s, we have a winner\n'
                            % (name, cf))
                    self._mState = constants.eFoundIt
                elif cf < NEGATIVE_SHORTCUT_THRESHOLD:
                    if constants._debug:
                        name = self._mModel['charsetName']
                        sys.stderr.write(
                            '%s confidence = %s, below negative shortcut threshhold %s\n'
                            % (name, cf, NEGATIVE_SHORTCUT_THRESHOLD))
                    self._mState = constants.eNotMe

        return self.get_state()