def _encodeTagNum(self, tagnum):
    res = bytearray()
    exceptions.validate(
        tagnum <= limits.MAX_TAGS,
        u'Too many tags. The limit is %d' % limits.MAX_TAGS)
    res.append((tagnum & 0xFF00) >> 8)
    res.append(tagnum & 0x00FF)
    return res
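# A minimal illustrative sketch (not used by the builder itself): the two-byte,
# big-endian layout produced by _encodeTagNum can also be expressed with the
# standard struct module; the tag number 777 below is an arbitrary example value.
import struct

def _exampleEncodeTagNum(tagnum):
    # '>H' packs an unsigned 16-bit integer big-endian: high byte first, then low byte
    return bytearray(struct.pack('>H', tagnum))

assert _exampleEncodeTagNum(777) == bytearray([(777 & 0xFF00) >> 8, 777 & 0x00FF])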
def serialize(self):
    # layout: separators list, number of DFA variants (one byte), then each
    # variant's options map followed by its DFA, and finally the default options map
    res = bytearray()
    res.extend(self._serializeSeparatorsList())
    dfasNum = len(self.options2DFA)
    exceptions.validate(
        dfasNum > 0 and dfasNum < 256,
        u'Too many segmentation rules variants')
    res.append(dfasNum)
    for key, dfa in self.options2DFA.items():
        optionsMap = self._key2Options(key)
        res.extend(self._serializeOptionsMap(optionsMap))
        res.extend(self._serializeDFA(dfa))
    res.extend(self._serializeOptionsMap(self.defaultOptions))
    logging.info('segmentation rules size: %s bytes', len(res))
    # logging.info([int(x) for x in res])
    return res
def transitionsData2bytearray(self, state):
    res = bytearray()
    # logging.debug('next')
    for (segnum, shiftOrth), nextState in sorted(state.transitionsMap.items()):
        res.append(segnum)
        if shiftOrth:
            res.append(1)
        else:
            res.append(0)
        offset = nextState.offset
        exceptions.validate(
            offset <= MAX_FSA_SIZE,
            u'Segmentation rules are too big and complicated'
            + u' - the resulting automaton would exceed its max size which is %d'
            % MAX_FSA_SIZE)
        res.extend(htons(offset))
    return res
def _readNamesAndQualifiers(inputFiles):
    names = set([''])
    qualifiers = set([frozenset()])
    lineParser = convertinput.LineParser()
    for line in _concatFiles(inputFiles):
        line = line.strip()
        if not lineParser.ignoreLine(line):
            _, _, _, name, qualifier = lineParser.parseLine(line)
            names.add(name)
            qualifiers.add(convertinput.parseQualifiers(qualifier))
    namesMap = dict([(name, idx) for idx, name in enumerate(sorted(list(names)))])
    qualifiersMap = dict([(quals, idx) for idx, quals in enumerate(
        sorted(qualifiers, key=lambda q: tuple(sorted(q))))])
    exceptions.validate(
        len(qualifiersMap) <= limits.MAX_QUALIFIERS_COMBINATIONS,
        'Too many qualifiers combinations. The limit is %d' % limits.MAX_QUALIFIERS_COMBINATIONS)
    return namesMap, qualifiersMap
def _encodeNameNum(self, namenum):
    exceptions.validate(
        namenum <= limits.MAX_NAMES,
        u'Too many named entity types. The limit is %d' % limits.MAX_NAMES)
    return bytearray([namenum])
def _encodeTypeNum(self, typenum):
    exceptions.validate(
        typenum <= limits.MAX_SEGMENT_TYPES,
        u'Too many segment types. The limit is %d' % limits.MAX_SEGMENT_TYPES)
    return bytearray([typenum])
def _transitions2ListBytes(self, state, originalState=None):
    res = bytearray()
    transitions = self.getSortedTransitions(state)
    thisIdx = self.state2Index[
        originalState if originalState is not None else state]
    logging.debug('state ' + str(state.offset))
    if len(transitions) == 0:
        assert state.isAccepting()
        return bytearray()
    else:
        stateAfterThis = self.statesTable[thisIdx + 1]
        for reversedN, (label, nextState) in enumerate(reversed(transitions)):
            transitionBytes = bytearray()
            assert nextState.reverseOffset is not None
            assert stateAfterThis.reverseOffset is not None
            logging.debug('next state reverse: ' + str(nextState.reverseOffset))
            logging.debug('after state reverse: ' + str(stateAfterThis.reverseOffset))
            # firstByte = label
            n = len(transitions) - reversedN
            hasShortLabel = label in self.label2ShortLabel
            # the first byte packs the short label (if any) in its upper bits
            # and the offset size in bytes in its two lowest bits
            firstByte = self.label2ShortLabel[label] if hasShortLabel else 0
            firstByte <<= 2
            last = len(transitions) == n
            isNext = last and stateAfterThis == nextState
            offsetSize = 0
            # offset = 0
            offset = (stateAfterThis.reverseOffset - nextState.reverseOffset) + len(res)
            assert offset > 0 or isNext
            if offset > 0:
                offsetSize += 1
            if offset >= 256:
                offsetSize += 1
            if offset >= 256 * 256:
                offsetSize += 1
            exceptions.validate(
                offset < 256 * 256 * 256,
                u'Cannot build the automaton - it would exceed its max size which is %d'
                % (256 * 256 * 256))
            # assert offset < 256 * 256 * 256  # TODO: rework this into a proper exception
            assert offsetSize <= 3
            firstByte |= offsetSize
            transitionBytes.append(firstByte)
            if not hasShortLabel:
                transitionBytes.append(label)
            # serialize offset in big-endian order
            if offsetSize == 3:
                transitionBytes.append((offset & 0xFF0000) >> 16)
            if offsetSize >= 2:
                transitionBytes.append((offset & 0x00FF00) >> 8)
            if offsetSize >= 1:
                transitionBytes.append(offset & 0x0000FF)
            for b in reversed(transitionBytes):
                res.insert(0, b)
            logging.debug('inserted transition at beginning ' + chr(label) + ' -> ' + str(offset))
        return res
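# A minimal sketch of the variable-length offset encoding used above: the offset
# is written big-endian in as few bytes as it needs (1, 2 or 3), and that byte
# count is what _transitions2ListBytes stores in the two lowest bits of the first
# transition byte. The helper name below is illustrative only.
def _exampleEncodeOffset(offset):
    assert 0 < offset < 256 * 256 * 256
    encoded = bytearray()
    if offset >= 256 * 256:
        encoded.append((offset & 0xFF0000) >> 16)
    if offset >= 256:
        encoded.append((offset & 0x00FF00) >> 8)
    encoded.append(offset & 0x0000FF)
    return encoded

assert _exampleEncodeOffset(5) == bytearray([5])
assert _exampleEncodeOffset(0x1234) == bytearray([0x12, 0x34])
assert _exampleEncodeOffset(0x123456) == bytearray([0x12, 0x34, 0x56])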
def _readDictIdAndCopyright(inputFiles):
    dictId = None
    copyright = None
    for inputFile in inputFiles:
        if inputFile:
            with codecs.open(inputFile, 'r', 'utf8') as f:
                inCopyright = False
                for linenum, line in enumerate(f, start=1):
                    if dictId is None and line.startswith(u'#!DICT-ID'):
                        dictIdTag, _, dictId = line.strip().partition(u' ')
                        exceptions.validate(
                            dictIdTag == u'#!DICT-ID',
                            'Dictionary ID tag must be followed by a space character and dictionary ID string')
                        exceptions.validate(
                            len(line.split(u' ')) > 1,
                            '%s:%d: Must provide DICT-ID' % (inputFile, linenum))
                        exceptions.validate(
                            len(line.split(u' ')) == 2,
                            '%s:%d: DICT-ID must not contain spaces' % (inputFile, linenum))
                    elif copyright is None and line.startswith(u'#<COPYRIGHT>'):
                        exceptions.validate(
                            line.strip() == u'#<COPYRIGHT>',
                            '%s:%d: Copyright start tag must be the only one in the line' % (inputFile, linenum))
                        inCopyright = True
                        copyright = ''
                    elif line.startswith(u'#</COPYRIGHT>'):
                        exceptions.validate(
                            inCopyright,
                            '%s:%d: Copyright end tag must be preceded by copyright start tag' % (inputFile, linenum))
                        exceptions.validate(
                            line.strip() == u'#</COPYRIGHT>',
                            '%s:%d: Copyright end tag must be the only one in the line' % (inputFile, linenum))
                        inCopyright = False
                    elif inCopyright:
                        copyright += line
    if dictId is None:
        logging.warn('No dictionary ID tag found')
        dictId = ''
    if copyright is None:
        logging.warn('No copyright info found')
        copyright = ''
    return (dictId, copyright)
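# A minimal sketch of the header layout _readDictIdAndCopyright expects at the
# top of a dictionary input file; the ID and copyright text below are made up.
_exampleHeader = (
    u'#!DICT-ID example-dict\n'
    u'#<COPYRIGHT>\n'
    u'Example copyright text\n'
    u'#</COPYRIGHT>\n'
)
# The DICT-ID line must contain exactly one space and an ID without spaces;
# everything between the copyright tags is captured verbatim.
assert _exampleHeader.split(u'\n')[0].split(u' ') == [u'#!DICT-ID', u'example-dict']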