예제 #1
0
파일: encode.py 프로젝트: djstrong/morfeusz
 def _encodeTagNum(self, tagnum):
     """Encode a tag number as two bytes, most significant byte first.

     The number is first checked against ``limits.MAX_TAGS`` through
     ``exceptions.validate`` (defined elsewhere; presumably it raises when
     the condition is false). Only the low 16 bits of ``tagnum`` end up in
     the output.

     :param tagnum: integer tag number to encode.
     :return: a two-element ``bytearray``: high byte, then low byte.
     """
     exceptions.validate(
         tagnum <= limits.MAX_TAGS,
         u'Too many tags. The limit is %d' % limits.MAX_TAGS)
     highByte = (tagnum >> 8) & 0xFF
     lowByte = tagnum & 0xFF
     return bytearray([highByte, lowByte])
예제 #2
0
 def serialize(self):
     """Serialize the segmentation rules into a single bytearray.

     Layout of the result, in order:

     1. the separators list (``self._serializeSeparatorsList()``),
     2. one byte holding the number of entries in ``self.options2DFA``,
     3. for every entry: its options map followed by its DFA,
     4. the default options map.

     The number of entries must be between 1 and 255 inclusive so that
     it fits the single count byte; this is checked with
     ``exceptions.validate`` (defined elsewhere; presumably it raises
     when the condition is false).

     :return: ``bytearray`` with the serialized rules.
     """
     payload = bytearray()
     payload.extend(self._serializeSeparatorsList())

     variantsCount = len(self.options2DFA)
     exceptions.validate(0 < variantsCount < 256,
                         u'Too many segmentation rules variants')
     payload.append(variantsCount)

     for optionsKey, automaton in self.options2DFA.items():
         payload.extend(
             self._serializeOptionsMap(self._key2Options(optionsKey)))
         payload.extend(self._serializeDFA(automaton))

     payload.extend(self._serializeOptionsMap(self.defaultOptions))
     logging.info('segmentation rules size: %s bytes', len(payload))
     return payload
예제 #3
0
    def transitionsData2bytearray(self, state):
        """Serialize the outgoing transitions of ``state``.

        Transitions are taken from ``state.transitionsMap`` (keys are
        ``(segnum, shiftOrth)`` pairs, values are target states) and are
        emitted in sorted order. Each transition contributes:

        * one byte with ``segnum``,
        * one flag byte: 1 when ``shiftOrth`` is truthy, 0 otherwise,
        * the bytes returned by ``htons(offset)`` for the target state's
          offset (``htons`` is defined elsewhere).

        The target offset is checked against ``MAX_FSA_SIZE`` through
        ``exceptions.validate`` (defined elsewhere; presumably it raises
        when the condition is false).

        :param state: state object exposing ``transitionsMap``.
        :return: ``bytearray`` with the encoded transitions.
        """
        encoded = bytearray()
        for transitionKey, target in sorted(state.transitionsMap.items()):
            segnum, shiftOrth = transitionKey
            encoded.append(segnum)
            encoded.append(1 if shiftOrth else 0)
            targetOffset = target.offset
            exceptions.validate(targetOffset <= MAX_FSA_SIZE,
                                u'Segmentation rules are too big and complicated' \
                                + u'- the resulting automaton would exceed its max size which is %d' \
                                % MAX_FSA_SIZE)
            encoded.extend(htons(targetOffset))
        return encoded
예제 #4
0
def _readNamesAndQualifiers(inputFiles):
    """Collect every name and qualifier set used in the input files.

    Each line produced by ``_concatFiles(inputFiles)`` is stripped and,
    unless the line parser says to ignore it, parsed into five fields, of
    which the last two (name and qualifier) are used. The empty name and
    the empty qualifier set are always included.

    :param inputFiles: passed straight to ``_concatFiles``.
    :return: ``(namesMap, qualifiersMap)`` - dicts mapping each name /
        each qualifier set to its index in sorted order (qualifier sets
        are ordered by the tuple of their sorted members).
    """
    parser = convertinput.LineParser()
    seenNames = {''}
    seenQualifiers = {frozenset()}

    for rawLine in _concatFiles(inputFiles):
        entry = rawLine.strip()
        if parser.ignoreLine(entry):
            continue
        _, _, _, name, qualifier = parser.parseLine(entry)
        seenNames.add(name)
        seenQualifiers.add(convertinput.parseQualifiers(qualifier))

    def qualifiersOrder(quals):
        # Sets have no useful ordering of their own; compare them by
        # their sorted members instead.
        return tuple(sorted(quals))

    namesMap = {name: idx for idx, name in enumerate(sorted(seenNames))}
    qualifiersMap = {
        quals: idx
        for idx, quals in enumerate(
            sorted(seenQualifiers, key=qualifiersOrder))
    }

    exceptions.validate(
        len(qualifiersMap) <= limits.MAX_QUALIFIERS_COMBINATIONS,
        'Too many qualifiers combinations. The limit is %d' %
        limits.MAX_QUALIFIERS_COMBINATIONS)

    return namesMap, qualifiersMap
예제 #5
0
파일: encode.py 프로젝트: djstrong/morfeusz
 def _encodeNameNum(self, namenum):
     """Encode a named-entity type number as a single byte.

     The number is checked against ``limits.MAX_NAMES`` through
     ``exceptions.validate`` (defined elsewhere; presumably it raises when
     the condition is false).

     :param namenum: integer number of the named-entity type.
     :return: one-element ``bytearray`` holding ``namenum``.
     """
     maxNames = limits.MAX_NAMES
     exceptions.validate(
         namenum <= maxNames,
         u'Too many named entity types. The limit is %d' % maxNames)
     encoded = bytearray()
     encoded.append(namenum)
     return encoded
예제 #6
0
파일: encode.py 프로젝트: djstrong/morfeusz
 def _encodeTypeNum(self, typenum):
     """Encode a segment type number as a single byte.

     The number is checked against ``limits.MAX_SEGMENT_TYPES`` through
     ``exceptions.validate`` (defined elsewhere; presumably it raises when
     the condition is false).

     :param typenum: integer number of the segment type.
     :return: one-element ``bytearray`` holding ``typenum``.
     """
     maxTypes = limits.MAX_SEGMENT_TYPES
     exceptions.validate(
         typenum <= maxTypes,
         u'Too many segment types. The limit is %d' % maxTypes)
     encoded = bytearray()
     encoded.append(typenum)
     return encoded
예제 #7
0
    def _transitions2ListBytes(self, state, originalState=None):
        """Serialize the outgoing transitions of ``state`` into a bytearray.

        Every transition is encoded as:

        * a first byte: the label's short form from
          ``self.label2ShortLabel`` (0 when it has none) shifted left by
          two bits, with the two lowest bits holding the number of offset
          bytes that follow (0-3),
        * the label itself as one byte, only when it has no short form,
        * the offset in big-endian order, using 0 to 3 bytes.

        Transitions are processed from last to first and each encoding is
        prepended to the result. ``len(res)`` is therefore the number of
        bytes that will sit after the transition being encoded, and it is
        added to the distance between the reverse offsets of the state
        stored right after this one and of the target state.

        :param state: state whose transitions are serialized.
        :param originalState: when not ``None``, used instead of ``state``
            to look up the index in ``self.state2Index``.
        :return: ``bytearray`` with the encoded transitions; empty when
            the state has no transitions.
        :raises: whatever ``exceptions.validate`` (defined elsewhere)
            raises when an offset does not fit in three bytes.
        """
        transitions = self.getSortedTransitions(state)
        thisIdx = self.state2Index[
            originalState if originalState is not None else state]
        # Logging arguments are passed lazily so nothing is formatted when
        # debug output is disabled.
        logging.debug('state %s', state.offset)
        if len(transitions) == 0:
            assert state.isAccepting()
            return bytearray()

        res = bytearray()
        stateAfterThis = self.statesTable[thisIdx + 1]
        for reversedN, (label, nextState) in enumerate(reversed(transitions)):
            assert nextState.reverseOffset is not None
            assert stateAfterThis.reverseOffset is not None
            logging.debug('next state reverse: %s', nextState.reverseOffset)
            logging.debug('after state reverse: %s',
                          stateAfterThis.reverseOffset)

            hasShortLabel = label in self.label2ShortLabel
            firstByte = self.label2ShortLabel[label] if hasShortLabel else 0
            firstByte <<= 2

            # Iterating in reverse, index 0 is the last transition of the
            # state - the only one that can lead straight into the state
            # stored right after this one.
            last = reversedN == 0
            isNext = last and stateAfterThis == nextState
            offset = (stateAfterThis.reverseOffset -
                      nextState.reverseOffset) + len(res)
            assert offset > 0 or isNext

            # Number of bytes needed to store the offset (0 for offset 0).
            offsetSize = 0
            if offset > 0:
                offsetSize += 1
            if offset >= 256:
                offsetSize += 1
            if offset >= 256 * 256:
                offsetSize += 1
            exceptions.validate(
                offset < 256 * 256 * 256,
                u'Cannot build the automaton - it would exceed its max size which is %d'
                % (256 * 256 * 256))
            assert offsetSize <= 3
            firstByte |= offsetSize

            transitionBytes = bytearray()
            transitionBytes.append(firstByte)
            if not hasShortLabel:
                transitionBytes.append(label)
            # serialize offset in big-endian order
            if offsetSize == 3:
                transitionBytes.append((offset & 0xFF0000) >> 16)
            if offsetSize >= 2:
                transitionBytes.append((offset & 0x00FF00) >> 8)
            if offsetSize >= 1:
                transitionBytes.append(offset & 0x0000FF)

            # Prepend the whole encoding in one step instead of inserting
            # it byte by byte at index 0.
            res[0:0] = transitionBytes
            logging.debug('inserted transition at beginning %s -> %s',
                          chr(label), offset)

        return res
예제 #8
0
def _readDictIdAndCopyright(inputFiles):
    """Read the dictionary ID and the copyright notice from the input files.

    Every truthy entry of ``inputFiles`` is opened as UTF-8 text and scanned
    line by line for:

    * the first line starting with ``#!DICT-ID``; the ID is the text after
      the single space that follows the tag (surrounding whitespace of the
      line is ignored) and must be non-empty and free of spaces,
    * the first ``#<COPYRIGHT>`` ... ``#</COPYRIGHT>`` block; the lines
      between the two tags are kept verbatim as the copyright text.

    Malformed tags are reported through ``exceptions.validate`` (defined
    elsewhere; presumably it raises when the condition is false).

    :param inputFiles: iterable of file paths; falsy entries are skipped.
    :return: ``(dictId, copyright)``; each one is an empty string, and a
        warning is logged, when it was not found in any file.
    """
    dictId = None
    copyrightLines = None
    for inputFile in inputFiles:
        if not inputFile:
            continue
        with codecs.open(inputFile, 'r', 'utf8') as f:
            # An unterminated copyright block does not leak into the
            # next file.
            inCopyright = False
            for linenum, line in enumerate(f, start=1):
                if dictId is None and line.startswith(u'#!DICT-ID'):
                    # Validate the same stripped text that is parsed, so
                    # trailing whitespace neither hides a missing ID nor
                    # counts as a space inside the ID.
                    dictIdTag, _, dictIdValue = line.strip().partition(u' ')
                    exceptions.validate(
                        dictIdTag == u'#!DICT-ID',
                        'Dictionary ID tag must be followed by a space character and dictionary ID string'
                    )
                    exceptions.validate(
                        dictIdValue != u'',
                        '%s:%d: Must provide DICT-ID' %
                        (inputFile, linenum))
                    exceptions.validate(
                        u' ' not in dictIdValue,
                        '%s:%d: DICT-ID must not contain spaces' %
                        (inputFile, linenum))
                    dictId = dictIdValue
                elif copyrightLines is None and line.startswith(
                        u'#<COPYRIGHT>'):
                    exceptions.validate(
                        line.strip() == u'#<COPYRIGHT>',
                        '%s:%d: Copyright start tag must be the only one in the line'
                        % (inputFile, linenum))
                    inCopyright = True
                    copyrightLines = []
                elif line.startswith('#</COPYRIGHT>'):
                    exceptions.validate(
                        inCopyright,
                        '%s:%d: Copyright end tag must be preceded by copyright start tag'
                        % (inputFile, linenum))
                    exceptions.validate(
                        line.strip() == u'#</COPYRIGHT>',
                        '%s:%d: Copyright end tag must be the only one in the line'
                        % (inputFile, linenum))
                    inCopyright = False
                elif inCopyright:
                    copyrightLines.append(line)

    if dictId is None:
        logging.warning('No dictionary ID tag found')
        dictId = ''

    if copyrightLines is None:
        logging.warning('No copyright info found')
        copyrightText = ''
    else:
        copyrightText = ''.join(copyrightLines)

    return (dictId, copyrightText)