def __init__(self, filename=None, encoding='utf8'):
    """Set up the tag mappings, optionally loading them first.

    :param filename: when truthy, ``self._doInit(filename, encoding)`` is
        called before the reverse mapping is derived.
    :param encoding: forwarded unchanged to ``self._doInit``.
    """
    self.tagsetId = None
    self.tag2tagnum = {}
    if filename:
        self._doInit(filename, encoding)
    # Reverse lookup (tag number -> tag), derived after the optional load.
    self._tagnum2tag = {
        tagnum: tag for tag, tagnum in self.tag2tagnum.items()
    }
def serializeFSAPrologue(self):
    """Build the FSA prologue: the short-label table plus a marker byte.

    Labels from ``self.fsa.label2Freq`` are ranked by descending frequency
    (ties broken by ascending label). The 63 highest-ranked labels receive
    short labels 1..63, which are stored in ``self.label2ShortLabel``.

    :return: a bytearray of 257 bytes: one short-label byte for every label
        value 0..255 (0 when that label has no short label), followed by
        ``ord('^')``.
    """
    # labels sorted by popularity
    rankedItems = sorted(
        self.fsa.label2Freq.items(),
        key=lambda item: (-item[1], item[0]),
    )

    # popular labels table: the short label is the 1-based popularity rank,
    # taken straight from the enumeration instead of a list.index() scan
    self.label2ShortLabel = {
        label: rank
        for rank, (label, _freq) in enumerate(rankedItems[:63], start=1)
    }

    logging.debug({
        chr(label): shortLabel
        for label, shortLabel in self.label2ShortLabel.items()
    })

    # full 256-entry table; labels without a short label are written as 0
    res = bytearray(
        self.label2ShortLabel.get(label, 0) for label in range(256)
    )

    # write a magic char before initial state
    res.append(ord('^'))
    return res
def serializeQualifiersMap(self):
    """Serialize ``self.qualifiersMap`` through ``self._serializeTags``.

    Every key of the map is joined with ``'|'`` into a single label, and
    the labels are inserted in ascending order of their mapped number.

    :return: whatever ``self._serializeTags`` returns for the label map.
    """
    entriesById = sorted(
        self.qualifiersMap.items(),
        key=lambda entry: entry[1],
    )
    label2labelId = {}
    for qualifiers, labelId in entriesById:
        label2labelId[u'|'.join(qualifiers)] = labelId
    return self._serializeTags(label2labelId)
def _serializeTags(self, tagsMap):
    """Serialize a tag -> number mapping into a bytearray.

    Layout: ``htons(len(tagsMap))``, then for each entry in ascending
    number order ``htons(number)``, the tag as encoded by
    ``self.fsa.encodeWord``, and a terminating zero byte.

    :param tagsMap: mapping from tag to its number.
    :return: bytearray holding the serialized table.
    """
    res = bytearray()
    res.extend(htons(len(tagsMap)))

    entriesByNumber = sorted(tagsMap.items(), key=lambda entry: entry[1])
    for tag, tagnum in entriesByNumber:
        res.extend(htons(tagnum))
        res.extend(self.fsa.encodeWord(tag))
        res.append(0)  # zero byte terminates each encoded tag
    return res
def getSortedTransitions(self, state):
    """Return the transitions of ``state`` ordered by label popularity.

    Transitions are ranked by the label's frequency in ``state.label2Freq``
    (highest first), then by its frequency in ``self.fsa.label2Freq``
    (highest first). Labels missing from either map count as frequency 0.

    :param state: object exposing ``transitionsMap`` and ``label2Freq``.
    :return: list of ``(label, nextState)`` pairs.
    """
    def popularity(transition):
        label = transition[0]
        return (
            -state.label2Freq.get(label, 0),
            -self.fsa.label2Freq.get(label, 0),
        )

    transitions = list(state.transitionsMap.items())
    transitions.sort(key=popularity)
    return transitions