def __init__(self, language, characterDomain=None, databaseUrl=None,
             dbConnectInst=None, ignoreIllegalSettings=False, **options):
    """
    Set up a character lookup that is bound to one language.

    @param language: key into C{LANGUAGE_CHAR_LOCALE_MAPPING} and
        C{LANGUAGE_CHAR_DOMAIN_MAPPING}
    @param characterDomain: requested character domain; when empty, the
        value of C{self._getCharacterDomain()} is installed instead
    @param databaseUrl: handed to C{getDatabaseConfiguration()} when no
        connector instance is supplied
    @param dbConnectInst: existing database connector to reuse
    @param ignoreIllegalSettings: if true, a domain that is not listed for
        the language is silently dropped instead of raising
    @param options: accepted for call compatibility, not used here
    @raise ValueError: if the domain is not listed for the language and
        C{ignoreIllegalSettings} is false
    """
    if not dbConnectInst:
        dbConnectInst = getDBConnector(getDatabaseConfiguration(databaseUrl))

    charLocale = self.LANGUAGE_CHAR_LOCALE_MAPPING[language]
    # The base class is initialised before the domain is validated; 'Unicode'
    # stands in when the caller gave no domain.
    CharacterLookup.__init__(self, charLocale, characterDomain or 'Unicode',
                             dbConnectInst=dbConnectInst)
    self.language = language

    # choose a better character domain if none specified
    if characterDomain:
        allowedDomains = self.LANGUAGE_CHAR_DOMAIN_MAPPING[self.language]
        if characterDomain not in allowedDomains:
            if not ignoreIllegalSettings:
                raise ValueError(
                    "Illegal character domain '%s' for language '%s'"
                    % (characterDomain, self.language))
            characterDomain = None

    if not characterDomain:
        self.setCharacterDomain(self._getCharacterDomain())

    if charLocale != 'T':
        # keep a second lookup for the traditional locale around
        self._characterLookupTraditional = CharacterLookup(
            'T', dbConnectInst=self.db)
def divideIntoSections(self):
    '''
    Convert each syllable's Mandarin text to pinyin and group the syllables
    into sentences.

    Every syllable in self.listSyllables has its punctuation stripped by
    stripPunctuationSings(); any text other than 'REST' is replaced by the
    first pinyin reading (without tone marks).  A syllable flagged as end of
    sentence closes the current group, which is appended to
    self.listSentences.

    Syllables for which no reading is found keep their stripped text.
    '''
    currSectionLyrics = []
    # Built lazily and only once: constructing a lookup per syllable is
    # wasted work, and input consisting only of rests needs none at all.
    cjk = None

    for syl in self.listSyllables:
        isEndOfSentence, syl.text = stripPunctuationSings(syl.text)

        ### convert from mandarin to pinyin
        if not syl.text == 'REST':
            if cjk is None:
                cjk = CharacterLookup('C')
            textPinYinList = cjk.getReadingForCharacter(
                syl.text, 'Pinyin', toneMarkType='none')
            if len(textPinYinList) > 1:
                self.logger.warn(
                    "converted syllable {} has {} parts".format(
                        textPinYinList, len(textPinYinList)))
            if textPinYinList:
                # take only first variant of pinyin interpretations
                syl.text = textPinYinList[0]

        ### finish up sentence when punctuation present
        currSectionLyrics.append(syl)
        if isEndOfSentence:
            self.listSentences.append(currSectionLyrics)
            currSectionLyrics = []
    # NOTE(review): syllables after the last sentence-ending punctuation are
    # never added to self.listSentences -- confirm this is intended.
def mandarinToPinyin(mandarinChar):
    """
    Return the first pinyin reading (no tone marks) of a Mandarin character.

    When the lookup offers several readings a notice is printed and the
    first one is used.  When it offers none, the input is returned unchanged
    instead of failing with an IndexError.
    """
    cjk = CharacterLookup('C')
    textPinYinList = cjk.getReadingForCharacter(
        mandarinChar, 'Pinyin', toneMarkType='none')

    if not textPinYinList:
        # nothing to convert to
        return mandarinChar

    if len(textPinYinList) > 1:
        # single-argument call form prints the same text on Python 2 and 3
        print("converted syllable {} has {} parts".format(
            textPinYinList, len(textPinYinList)))

    # take only first variant of pinyin interpretations
    return textPinYinList[0]
def _checkStrokeOrderFromDecomposition(self, decomposition, index=0): """Goes through a decomposition""" if type(decomposition[index]) != type(()): # IDS operator character = decomposition[index] missingChars = [] hasFullOrder = True if CharacterLookup.isBinaryIDSOperator(character): # check for IDS operators we can't make any order # assumption about if character not in self.ALLOWED_COMPONENT_STRUCTURE: return False, index, [] else: # Get stroke order for both components for _ in range(0, 2): fullOrder, index, missing \ = self._checkStrokeOrderFromDecomposition( decomposition, index+1) if not fullOrder: missingChars.extend(missing) hasFullOrder = hasFullOrder and fullOrder elif CharacterLookup.isTrinaryIDSOperator(character): # Get stroke order for three components for _ in range(0, 3): fullOrder, index, missing \ = self._checkStrokeOrderFromDecomposition( decomposition, index+1) if not fullOrder: missingChars.extend(missing) hasFullOrder = hasFullOrder and fullOrder else: assert False, 'not an IDS character' return hasFullOrder, index, missingChars else: # no IDS operator but character char, glyph = decomposition[index] # if the character is unknown or there is none raise if char == u'?': return False, index, [] else: # recursion fullOrder, missingChars = self.checkStrokeOrder(char, glyph) if not fullOrder and not missingChars: missingChars = [char] return fullOrder, index, missingChars assert False
def consumeComponent(decomposition):
    """
    Consumes a component on the top level, e.g. for 㐯, C{⿱⿱亠吕香}
    consumes C{⿱亠吕} when given the partial decomposition C{⿱亠吕香}.

    Returns what is left of C{decomposition} after its first complete
    component; returns C{None} if the first element is neither a tuple nor
    a binary/trinary IDS operator.
    """
    head = decomposition[0]
    if type(head) == type(()):
        # consume one component
        return decomposition[1:]

    if CharacterLookup.isBinaryIDSOperator(head):
        operandCount = 2
    elif CharacterLookup.isTrinaryIDSOperator(head):
        operandCount = 3
    else:
        return None

    # drop the operator, then one component per operand
    remainder = decomposition[1:]
    for _ in range(operandCount):
        remainder = consumeComponent(remainder)
    return remainder
def decompositionFromString(decomposition):
    """
    Parse a decomposition string into a list of components.

    IDS operators are kept as plain characters; every other character
    becomes a C{(char, glyph)} tuple.  C{glyph} is the number given as
    C{[n]} after the character, or C{None} if none is given.  A pseudo
    character written as C{#} followed by digits becomes
    C{(int(digits), 0)}.

    @raise ValueError: if C{#} is not followed by digits, or a C{[} has no
        closing C{]} or no integer inside
    """
    # taken from CharacterLookup, but adapted to return None if no glyph
    # given
    componentsList = []
    length = len(decomposition)
    index = 0
    while index < length:
        char = decomposition[index]
        if CharacterLookup.isIDSOperator(char):
            componentsList.append(char)
        else:
            # is Chinese character
            # Special handling for surrogate pairs on UCS-2 systems
            if util.isValidSurrogate(decomposition[index:index + 2]):
                char = decomposition[index:index + 2]  # A surrogate pair now
                index += 1  # Bypass trailing surrogate

            if char == '#':
                # pseudo character, find digit end
                digitsStart = index + 1
                digitsEnd = digitsStart
                while digitsEnd < length \
                        and decomposition[digitsEnd].isdigit():
                    digitsEnd += 1
                # Convert only the digits: including the leading '#' made
                # int() fail for every pseudo character.
                char = int(decomposition[digitsStart:digitsEnd])
                charGlyph = 0
                # Skip the digits so they are not parsed again as characters.
                index = digitsEnd - 1
            elif index + 1 < length and decomposition[index + 1] == '[':
                # extract glyph information
                endIndex = decomposition.index(']', index + 1)
                charGlyph = int(decomposition[index + 2:endIndex])
                index = endIndex
            else:
                charGlyph = None
            componentsList.append((char, charGlyph))
        index = index + 1
    return componentsList
def get_cat_code(s):
    """
    Return a one-letter, upper-cased category code for ``s``.

    The code is the first letter of the first pinyin reading of the first
    character of ``s``; if that character has no reading, the character
    itself (upper-cased) is used.
    """
    first_char = unicode(s)[0]
    lookup = CharacterLookup("C")
    pinyin_readings = lookup.getReadingForCharacter(first_char, "Pinyin")
    # Which reading is the right one is hard to tell here, so the first one
    # is taken and users are expected to correct wrong codes. Without any
    # reading the text is not Chinese and the character itself is the code.
    code_source = pinyin_readings[0][0] if pinyin_readings else first_char
    return code_source.upper()
def decompositionFromString(decomposition):
    """
    Parse a decomposition string into a list of components.

    IDS operators are kept as plain characters; every other character
    becomes a C{(char, glyph)} tuple.  C{glyph} is the number given as
    C{[n]} after the character, or C{None} if none is given.  A pseudo
    character written as C{#} followed by digits becomes
    C{(int(digits), 0)}.

    @raise ValueError: if C{#} is not followed by digits, or a C{[} has no
        closing C{]} or no integer inside
    """
    # taken from CharacterLookup, but adapted to return None if no glyph
    # given
    componentsList = []
    length = len(decomposition)
    index = 0
    while index < length:
        char = decomposition[index]
        if CharacterLookup.isIDSOperator(char):
            componentsList.append(char)
        else:
            # is Chinese character
            # Special handling for surrogate pairs on UCS-2 systems
            if util.isValidSurrogate(decomposition[index : index + 2]):
                char = decomposition[index : index + 2]  # A surrogate pair now
                index += 1  # Bypass trailing surrogate

            if char == "#":
                # pseudo character, find digit end
                digitsStart = index + 1
                digitsEnd = digitsStart
                while digitsEnd < length and decomposition[digitsEnd].isdigit():
                    digitsEnd += 1
                # Convert only the digits: including the leading '#' made
                # int() fail for every pseudo character.
                char = int(decomposition[digitsStart:digitsEnd])
                charGlyph = 0
                # Skip the digits so they are not parsed again as characters.
                index = digitsEnd - 1
            elif index + 1 < length and decomposition[index + 1] == "[":
                # extract glyph information
                endIndex = decomposition.index("]", index + 1)
                charGlyph = int(decomposition[index + 2 : endIndex])
                index = endIndex
            else:
                charGlyph = None
            componentsList.append((char, charGlyph))
        index = index + 1
    return componentsList
def run(self):
    """
    Read, clean up and print the decomposition table as CSV lines.

    Steps: read entries via C{self.read()}, optionally merge away pseudo
    characters, optionally drop decompositions of length one, merge similar
    decompositions, then print one line per decomposition in the form
    C{"char","decomposition",glyph,index,flags}, encoded with
    C{default_encoding}.  Pseudo characters (integer keys) are written as
    C{#<number>}.
    """
    decompositionEntries, flagEntries = self.read()

    # Remove pseudo characters by merging entries
    if not self.includePseudoCharacters:
        decompositionEntries, flagEntries = self._removePseudoCharacters(
            decompositionEntries, flagEntries
        )

    # Remove minimal component entries
    if not self.includeMinimal:
        for char in sorted(decompositionEntries.keys()):
            for glyph in decompositionEntries[char]:
                # iterate over a copy, the set itself is modified below
                for decomposition in decompositionEntries[char][glyph].copy():
                    if len(decomposition) == 1:
                        decompositionEntries[char][glyph].remove(decomposition)
                        del flagEntries[char][glyph][decomposition]

    # Merge similar decompositions, removing inferior ones
    self._mergeSimilarDecompositions(decompositionEntries, flagEntries)

    # Write entries
    for char in sorted(decompositionEntries.keys()):
        # Keep the dictionary key and its printed form apart: rebinding
        # ``char`` to the "#n" string broke the flagEntries lookup for
        # pseudo characters.
        if isinstance(char, int):
            # pseudo character
            charStr = "#%d" % char
        else:
            charStr = char

        for glyph in decompositionEntries[char]:
            for idx, decomposition in enumerate(sorted(decompositionEntries[char][glyph])):
                decompStr = CharacterLookup.decompositionToString(decomposition)
                flagStr = "".join(sorted(flagEntries[char][glyph][decomposition]))
                print(
                    (
                        '"%(char)s","%(decomp)s",%(glyph)d,%(index)d,%(flags)s'
                        % {"char": charStr, "decomp": decompStr, "glyph": glyph, "index": idx, "flags": flagStr}
                    ).encode(default_encoding)
                )
def getDecomposition(structure):
    """
    Turn a plain structure into a decomposition with glyph information.

    Unicode strings recognised as IDS operators are kept as they are, every
    other element C{c} becomes the tuple C{(c, 0)}.
    """
    def isOperator(element):
        return (type(element) == type(u'')
                and CharacterLookup.isIDSOperator(element))

    # add glyph information
    return [element if isOperator(element) else (element, 0)
            for element in structure]
def getDecomposition(structure):
    """
    Turn a plain structure into a decomposition with glyph information.

    Unicode strings recognised as IDS operators are kept as they are, every
    other element C{c} becomes the tuple C{(c, 0)}.
    """
    def isOperator(element):
        return type(element) == type(u"") and CharacterLookup.isIDSOperator(element)

    # add glyph information
    return [element if isOperator(element) else (element, 0) for element in structure]
class Mapper(object):
    """Expands a (character, reading) entry to all variants of the character."""

    def __init__(self, variant='T'):
        """Create the lookup (locale 'T') and remember the variant type."""
        self.characterLookup = CharacterLookup('T')
        self.variant = variant

    def mapEntry(self, char, reading):
        """
        Return a list of C{(variantCharacter, reading)} tuples, one for each
        variant of C{char} of type C{self.variant}.
        """
        variantChars = self.characterLookup.getCharacterVariants(
            char, self.variant)
        return [(variantChar, reading) for variantChar in variantChars]
def tokenize(input, output):
    """
    Copy a UTF-8 text file, keeping only characters that have a pinyin
    reading.

    Each line of the file ``input`` is filtered character by character and
    written, followed by a newline, to the file ``output``.  If ``input``
    cannot be opened a message is printed and the process exits through
    ``sys.exit()``.
    """
    try:
        # ``with`` closes the handle; it used to be left open
        with open(input, 'r') as source:
            text = source.readlines()
    except IOError:
        sys.stdout.write("IOError: could not open %s\n" % (input,))
        sys.exit()

    cjk = CharacterLookup('T')

    # ``with`` also closes the output if a lookup or write fails midway
    with open(output, 'w') as out:
        for line in text:
            line = line.decode('utf-8')
            kept = [char for char in line
                    if cjk.getReadingForCharacter(char, 'Pinyin')]
            out.write((u''.join(kept) + u'\n').encode('utf-8'))
class Mapper(object):
    """Expands a (character, reading) entry to all variants of the character."""

    def __init__(self, variant='T'):
        """Create the lookup (locale 'T') and remember the variant type."""
        self.characterLookup = CharacterLookup('T')
        self.variant = variant

    def mapEntry(self, char, reading):
        """
        Return a list of C{(variantCharacter, reading)} tuples, one for each
        variant of C{char} of type C{self.variant}.
        """
        variantChars = self.characterLookup.getCharacterVariants(char, self.variant)
        return [(variantChar, reading) for variantChar in variantChars]
class GlyphIterator(object):
    """
    Iterates over all glyphs of all characters of the 'Unicode' domain,
    yielding strings of the form C{'<char>/<glyph>'}.

    Characters for which the lookup raises C{NoInformationError} are
    skipped.  Glyphs of one character are taken from the end of the list
    returned by C{getCharacterGlyphs()}.
    """

    def __init__(self):
        self._cjk = CharacterLookup('T', 'Unicode')
        self.characterIterator = self._cjk.getDomainCharacterIterator()
        self.curChar = None
        # glyphs of self.curChar that have not been returned yet
        self.glyphQueue = []

    def __iter__(self):
        return self

    def next(self):
        """
        Return the next C{'<char>/<glyph>'} string.

        @raise StopIteration: once the character iterator is exhausted
        """
        while not self.glyphQueue:
            # builtin next() works with both .next() and .__next__() iterators
            self.curChar = next(self.characterIterator)
            try:
                glyphs = self._cjk.getCharacterGlyphs(self.curChar)
                self.glyphQueue.extend(glyphs)
            except exception.NoInformationError:
                pass
        return '%s/%d' % (self.curChar, self.glyphQueue.pop())

    # Python 3 spelling of the iterator protocol
    __next__ = next
def to_pinyin(filename):
    """
    Print lines annotated with transliterated pinyin readings.

    For every character that has a pinyin reading, all its readings are
    printed (passed through ``unidecode``), and the line is rebuilt as
    ``<first reading><character><space>`` groups which are printed as well.

    The file ``filename`` is opened and read; if that fails a message is
    printed and the process exits through ``sys.exit()``.
    """
    try:
        # ``with`` closes the handle; it used to be left open
        with open(filename, 'r') as handle:
            lines = handle.readlines()
    except IOError:
        sys.stdout.write("IOError: could not open %s\n" % (filename,))
        sys.exit()

    cjk = CharacterLookup('T')

    # NOTE(review): the file content is discarded here in favour of a
    # hard-coded sample, and the UTF-8 decoding of file lines is disabled.
    # This looks like leftover debugging -- confirm before relying on the
    # function to process real files.
    lines = [u'我喜歡他']

    for line in lines:
        annotated = []
        for char in line:
            pinyin = cjk.getReadingForCharacter(char, 'Pinyin')
            if pinyin:
                print([unidecode(x) for x in pinyin])
                annotated.append(unidecode(pinyin[0]) + char + " ")
        print("".join(annotated))
def parseIDS(decomposition, index):
    """
    Validate one IDS subtree and return the index just past it.

    @param decomposition: sequence of IDS operator characters and
        component tuples
    @param index: position where the subtree starts
    @return: index of the first element after the subtree
    @raise ValueError: if the sequence ends early or a non-tuple element is
        not a binary or trinary IDS operator
    """
    if index >= len(decomposition):
        raise ValueError()

    element = decomposition[index]
    if type(element) == type(()):
        # consume one component
        return index + 1

    if not CharacterLookup.isIDSOperator(element):
        # simple chars should be IDS operators
        raise ValueError()

    if CharacterLookup.isBinaryIDSOperator(element):
        operandCount = 2
    elif CharacterLookup.isTrinaryIDSOperator(element):
        operandCount = 3
    else:
        raise ValueError()

    # step over the operator, then over each of its operands
    position = index + 1
    for _ in range(operandCount):
        position = parseIDS(decomposition, position)
    return position
def getStrokeOrd(fin, kl):
    """
    Append a bullet line with the stroke order of every character in kl.

    Trying for awareness of glyph locale in lookup: characters found in
    cedict.simplified use locale 'C', those in cedict.traditional use 'T',
    everything else 'J'.

    fin is extended in place and also returned.  Exceptions raised by
    getStrokeOrder() are not caught here.
    """
    from cjklib.characterlookup import CharacterLookup

    # One lookup per locale, created on first use, instead of a new lookup
    # for every single character.
    lookups = {}

    for i in kl:
        if i in cedict.simplified:
            glyphLocale = 'C'
        elif i in cedict.traditional:
            glyphLocale = 'T'
        else:
            glyphLocale = 'J'

        if glyphLocale not in lookups:
            lookups[glyphLocale] = CharacterLookup(glyphLocale)

        j = lookups[glyphLocale].getStrokeOrder(i)
        fin.append(u'• ' + u' '.join(j))
    return fin
def auxSOrd(i):
    """
    Try to get stroke decomposition if subcomponent decomposition fails.

    The glyph locale is 'C' for characters in cedict.simplified, 'T' for
    those in cedict.traditional and 'J' otherwise.  Returns the strokes
    joined by spaces, or u'[x]' if the stroke order lookup raises.
    """
    from cjklib.characterlookup import CharacterLookup

    if i in cedict.simplified:
        cjk = CharacterLookup('C')
    elif i in cedict.traditional:
        cjk = CharacterLookup('T')
    else:
        cjk = CharacterLookup('J')

    try:
        j = cjk.getStrokeOrder(i)
    except Exception:
        # Best effort: any lookup failure yields the placeholder.  Unlike a
        # bare ``except`` this lets KeyboardInterrupt/SystemExit through.
        return u'[x]'
    return u' '.join(j)
def next(self):
    """
    Return the next entry as C{(char, 0, decomposition, flags)}.

    The decomposition string from C{self._getNextEntry()} is split per
    character: IDS operators stay plain characters, everything else becomes
    C{(character, 0)}.  C{flags} is always C{set('C')}.

    @raise StopIteration: when C{self._getNextEntry()} returns C{None}
    """
    entry = self._getNextEntry()
    if entry is None:
        raise StopIteration()

    char, rawDecomposition = entry
    # TODO support CHISE private character entries
    # remove CHISE private character entries
    cleaned = re.sub("&[^;]+;", u'?', rawDecomposition)

    decomposition = [
        c if CharacterLookup.isIDSOperator(c) else (c, 0)
        for c in cleaned
    ]
    # flag 'C'HISE
    return (char, 0, decomposition, set('C'))
def next(self):
    """
    Return the next entry as C{(char, 0, decomposition, flags)}.

    The decomposition string from C{self._getNextEntry()} is split per
    character: IDS operators stay plain characters, everything else becomes
    C{(character, 0)}.  C{flags} is always C{set("C")}.

    @raise StopIteration: when C{self._getNextEntry()} returns C{None}
    """
    entry = self._getNextEntry()
    if entry is None:
        raise StopIteration()

    char, rawDecomposition = entry
    # TODO support CHISE private character entries
    # remove CHISE private character entries
    cleaned = re.sub("&[^;]+;", u"?", rawDecomposition)

    decomposition = [c if CharacterLookup.isIDSOperator(c) else (c, 0) for c in cleaned]
    # flag 'C'HISE
    return (char, 0, decomposition, set("C"))
def run(self):
    """
    Read, clean up and print the decomposition table as CSV lines.

    Steps: read entries via C{self.read()}, optionally merge away pseudo
    characters, optionally drop decompositions of length one, merge similar
    decompositions, then print one line per decomposition in the form
    C{"char","decomposition",glyph,index,flags}, encoded with
    C{default_encoding}.  Pseudo characters (integer keys) are written as
    C{#<number>}.
    """
    decompositionEntries, flagEntries = self.read()

    # Remove pseudo characters by merging entries
    if not self.includePseudoCharacters:
        decompositionEntries, flagEntries = self._removePseudoCharacters(
            decompositionEntries, flagEntries)

    # Remove minimal component entries
    if not self.includeMinimal:
        for char in sorted(decompositionEntries.keys()):
            for glyph in decompositionEntries[char]:
                # iterate over a copy, the set itself is modified below
                for decomposition \
                        in decompositionEntries[char][glyph].copy():
                    if len(decomposition) == 1:
                        decompositionEntries[char][glyph].remove(
                            decomposition)
                        del flagEntries[char][glyph][decomposition]

    # Merge similar decompositions, removing inferior ones
    self._mergeSimilarDecompositions(decompositionEntries, flagEntries)

    # Write entries
    for char in sorted(decompositionEntries.keys()):
        # Keep the dictionary key and its printed form apart: rebinding
        # ``char`` to the '#n' string broke the flagEntries lookup for
        # pseudo characters.
        if isinstance(char, int):
            # pseudo character
            charStr = '#%d' % char
        else:
            charStr = char

        for glyph in decompositionEntries[char]:
            for idx, decomposition in enumerate(
                    sorted(decompositionEntries[char][glyph])):
                decompStr = CharacterLookup.decompositionToString(
                    decomposition)
                flagStr = ''.join(
                    sorted(flagEntries[char][glyph][decomposition]))
                print((
                    '"%(char)s","%(decomp)s",%(glyph)d,%(index)d,%(flags)s'
                    % {
                        'char': charStr,
                        'decomp': decompStr,
                        'glyph': glyph,
                        'index': idx,
                        'flags': flagStr
                    }).encode(default_encoding))
def next(self):
    """
    Return the next valid entry of the CSV file as
    C{(char, glyph, decomposition, flags)}.

    The file C{self.filePath} is opened on the first call.  Entries whose
    character field is longer than one character must be pseudo characters
    written as C{#<number>}; they are returned with an integer as
    character.  Other multi-character entries are reported on stderr and
    skipped.
    """
    if not hasattr(self, "_fileIterator"):
        # first call: announce and open the source file
        if not self.quiet:
            print >>sys.stderr, "FILE: reading '%s'" % self.filePath
        self._fileIterator = UnicodeCSVFileIterator(codecs.open(self.filePath, "r", default_encoding))

    while True:
        char, decompString, glyph, _, flags = self._fileIterator.next()

        isMultiChar = len(char) > 1
        if isMultiChar and not char.startswith("#"):
            print >>sys.stderr, ("FILE: Error parsing entry '%s', %s" % (char, glyph)).encode(default_encoding)
            continue
        if isMultiChar:
            # pseudo char
            char = int(char[1:])

        decomposition = CharacterLookup.decompositionFromString(decompString)
        return (char, int(glyph), decomposition, set(flags))
def next(self):
    """
    Return the next valid entry of the CSV file as
    C{(char, glyph, decomposition, flags)}.

    The file C{self.filePath} is opened on the first call.  Entries whose
    character field is longer than one character must be pseudo characters
    written as C{#<number>}; they are returned with an integer as
    character.  Other multi-character entries are reported on stderr and
    skipped.
    """
    if not hasattr(self, '_fileIterator'):
        # first call: announce and open the source file
        if not self.quiet:
            print >> sys.stderr, "FILE: reading '%s'" % self.filePath
        self._fileIterator = UnicodeCSVFileIterator(
            codecs.open(self.filePath, 'r', default_encoding))

    while True:
        char, decompString, glyph, _, flags = self._fileIterator.next()

        isMultiChar = len(char) > 1
        if isMultiChar and not char.startswith('#'):
            print >> sys.stderr, (
                "FILE: Error parsing entry '%s', %s"
                % (char, glyph)).encode(default_encoding)
            continue
        if isMultiChar:
            # pseudo char
            char = int(char[1:])

        decomposition = CharacterLookup.decompositionFromString(
            decompString)
        return (char, int(glyph), decomposition, set(flags))
def _getDecompositionEntriesDict(cls):
    """
    Gets the decomposition table from the database.

    Despite the method name the result is a list, not a dictionary.

    @rtype: list of tuple
    @return: one tuple C{(character, glyph, decomposition, flags)} per row
        of table C{CharacterDecomposition}, ordered by C{SubIndex};
        C{decomposition} is the parsed first layer decomposition and
        C{flags} a set built from the entry's flag string
    """
    # get entries from database
    db = dbconnector.getDBConnector()
    table = db.tables['CharacterDecomposition']
    query = select([table.c.ChineseCharacter, table.c.Glyph,
                    table.c.Decomposition, table.c.Flags])\
        .order_by(table.c.SubIndex)

    return [
        (char, glyph,
         CharacterLookup.decompositionFromString(decompString),
         set(flags))
        for char, glyph, decompString, flags in db.selectRows(query)
    ]
def _getDecompositionEntriesDict(cls):
    """
    Gets the decomposition table from the database.

    Despite the method name the result is a list, not a dictionary.

    @rtype: list of tuple
    @return: one tuple C{(character, glyph, decomposition, flags)} per row
        of table C{CharacterDecomposition}, ordered by C{SubIndex};
        C{decomposition} is the parsed first layer decomposition and
        C{flags} a set built from the entry's flag string
    """
    # get entries from database
    db = dbconnector.getDBConnector()
    table = db.tables["CharacterDecomposition"]
    query = select(
        [table.c.ChineseCharacter, table.c.Glyph, table.c.Decomposition, table.c.Flags]
    ).order_by(table.c.SubIndex)

    return [
        (char, glyph, CharacterLookup.decompositionFromString(decompString), set(flags))
        for char, glyph, decompString, flags in db.selectRows(query)
    ]
def __init__(self):
    """
    Create a lookup for locale 'T' and domain 'Unicode' and start iterating
    over the domain's characters with an empty glyph queue.
    """
    lookup = CharacterLookup('T', 'Unicode')
    self._cjk = lookup
    self.characterIterator = lookup.getDomainCharacterIterator()
    # no character fetched yet, hence no glyphs pending
    self.glyphQueue = []
    self.curChar = None
def getCharacters(self):
    """
    Return all characters of the domain named by self.title, joined by
    single spaces.
    """
    domainLookup = CharacterLookup('T', self.title)
    domainCharacters = domainLookup.getDomainCharacterIterator()
    return ' '.join(domainCharacters)
#!/usr/bin/env python # -*- coding: utf-8 -*- import mica import cjklib from cjklib.dictionary import CEDICT from cjklib.characterlookup import CharacterLookup d = CEDICT() cjk = CharacterLookup('C') src = """ 小明五岁,他有一个哥哥,哥哥是学生。他爸爸妈妈都工作。小明说,他家一共五口人。 今天星期六,我们不上课。小王说,晚上有一个好电影,他和我一起去看,我很高兴。下午六点我去食堂吃饭,六点半去小王的宿舍,七点我们去看电影。 张丽英家有四口人:爸爸,妈妈,姐姐和她。她爸爸是大夫,五十七岁了,身体很好。他工作很忙,星期天常常不休息。妈妈是银行职员,今年五十岁。她姐姐是老师,今年二月结婚了。她不住在爸爸妈妈家。昨天是星期五,下午没有课。我们去她家了。她家在北京饭店旁边。我们到她家的时候,她爸爸妈妈不在家。我们和她一起谈话,听音乐,看电视。五点半张丽英的爸爸妈妈回家了。她姐姐也来了。我们在她家吃饭,晚上八点半我们就回学校了。 教学楼前边的自行车很多。田芳下课后要找自己的自行车。田芳的自行车是新的。张东问她,你的自行车是什么颜色的?田芳说是蓝的。张东说,那辆蓝车是不是你的?田芳说,我的自行车是新的,不是旧的,那辆车不是我的。忽然,田芳看见了自己的自行车,她说,啊,我的自行车在那儿呢,我找到了 """ def tryce(uni, fail_if_more_than_one = False) : count = 0 results = d.getFor(uni) trans = u'' last = None for e in results : if count > 0 and e[2].lower() == last[2].lower() : # print "Duplicate CEDICT pinyin!" count -= 1
def _mergeSimilarDecompositions(self, decompositionEntries, flagEntries):
    """
    Merges two decompositions, if they are the same, except:
        - one has an unknown component while the other doesn't,
        - one has a subtree that is the decomposition of the corresponding
            component of the other decomposition.

    Both arguments are modified in place and nothing is returned.

    @param decompositionEntries: mapping indexed as C{[char][glyph]} whose
        values are collections of decompositions; each value is replaced by
        a C{set} of the decompositions that survive
    @param flagEntries: mapping indexed as C{[char][glyph][decomposition]};
        entries of dropped decompositions are deleted, and for equal
        decompositions the dropped entry is first merged into the kept one
        with C{update()}
    """
    def consumeComponent(decomposition):
        """
        Consumes a component on the top level, e.g. for 㐯, C{⿱⿱亠吕香}
        consumes C{⿱亠吕} when given the partial decomposition
        C{⿱亠吕香}.

        Returns the rest of the decomposition after that component.
        """
        if type(decomposition[0]) == type(()):
            # consume one component
            return decomposition[1:]

        # an operator is followed by two or three complete components
        if CharacterLookup.isBinaryIDSOperator(decomposition[0]):
            decomposition = consumeComponent(decomposition[1:])
            return consumeComponent(decomposition)
        elif CharacterLookup.isTrinaryIDSOperator(decomposition[0]):
            decomposition = consumeComponent(decomposition[1:])
            decomposition = consumeComponent(decomposition)
            return consumeComponent(decomposition)

    def compareTrees(decompositionA, decompositionB):
        """
        Checks for similar decomposition trees, taking care of unknown
        components. Returns C{None} if the trees are not equal, a integer
        if the trees are similar. If the left tree (decompositionA) should
        be preferred a negative number is returned, or a positive number
        for the right tree (decompositionB). If C{0} is returned, both
        trees are equally good to choose from.

        Raises C{ValueError} if one tree ends before the other.
        """
        if not decompositionA and not decompositionB:
            # equal
            return 0
        elif not decompositionA or not decompositionB:
            # if all preceding components are the same that shouldn't
            # happen
            raise ValueError()
        elif decompositionA[0] == decompositionB[0]:
            # same head element, compare the remainders
            return compareTrees(decompositionA[1:], decompositionB[1:])
        elif (type(decompositionA[0]) == type(())
            and decompositionA[0][0] == u'?'):
            # A has an unknown component here: skip the matching component
            # of B and compare what follows
            decompositionB = consumeComponent(decompositionB)
            result = compareTrees(decompositionA[1:], decompositionB)
            if result is None or result < 0:
                # unequal or the left side is preferred later on
                return None
            else:
                return +1
        elif (type(decompositionB[0]) == type(())
            and decompositionB[0][0] == u'?'):
            # mirror case: B has the unknown component
            decompositionA = consumeComponent(decompositionA)
            result = compareTrees(decompositionA, decompositionB[1:])
            if result is None or result > 0:
                # unequal or the right side is preferred later on
                return None
            else:
                return -1
        elif (CharacterLookup.isIDSOperator(decompositionA[0])
            and CharacterLookup.isIDSOperator(decompositionB[0])):
            # No way these decompositions can be equal
            # (simplified subseq. checking)
            return None
        elif CharacterLookup.isIDSOperator(decompositionA[0]):
            # expand tree B
            char, glyph = decompositionB[0]
            if (char in decompositionEntries
                and glyph in decompositionEntries[char]):
                # try every known decomposition of B's component in its
                # place
                for decomposition in decompositionEntries[char][glyph]:
                    result = compareTrees(decompositionA,
                        decomposition + decompositionB[1:])
                    if result is not None and result >= 0:
                        # right side preferred and so do we...
                        # A shorted description is better
                        return 1
            return None
        elif CharacterLookup.isIDSOperator(decompositionB[0]):
            # expand tree A
            char, glyph = decompositionA[0]
            if (char in decompositionEntries
                and glyph in decompositionEntries[char]):
                # try every known decomposition of A's component in its
                # place
                for decomposition in decompositionEntries[char][glyph]:
                    result = compareTrees(
                        decomposition + decompositionA[1:],
                        decompositionB)
                    if result is not None and result <= 0:
                        # left side preferred and so do we...
                        # A shorted description is better
                        return -1
            return None
        else:
            # two different plain components
            return None

    for char in decompositionEntries:
        for glyph in decompositionEntries[char]:
            idxA = 0
            # work on a list so entries can be addressed and deleted by
            # index
            decompositions = list(decompositionEntries[char][glyph])
            flagsDict = flagEntries[char][glyph]
            # Check every decomposition with all others to the right
            while idxA < len(decompositions):
                idxB = idxA + 1
                while idxB < len(decompositions):
                    try:
                        result = compareTrees(decompositions[idxA],
                            decompositions[idxB])
                        if result is not None and result == 0:
                            # Entries are equal, we can transfer flags
                            flagsDict[decompositions[idxA]].update(
                                flagsDict[decompositions[idxB]])
                            del flagsDict[decompositions[idxB]]
                            del decompositions[idxB]
                        elif result is not None and result < 0:
                            # left entry wins, drop the right one
                            del flagsDict[decompositions[idxB]]
                            del decompositions[idxB]
                        elif result is not None and result > 0:
                            # right entry wins, drop the left one
                            del flagsDict[decompositions[idxA]]
                            del decompositions[idxA]
                            # No need for further testing for this decomp
                            break
                        else:
                            # Only increase if the list didn't shift to the
                            # left
                            idxB += 1
                    except ValueError:
                        # comparison failed: report the pair and move on
                        print >> sys.stderr, (
                            "Error comparing decompositions %s and %s"
                            % (CharacterLookup.decompositionToString(
                                decompositions[idxA]),
                            CharacterLookup.decompositionToString(
                                decompositions[idxB])))\
                            .encode(default_encoding)

                        idxB += 1
                else:
                    # Inner loop ended without 'break', so the entry at
                    # idxA was kept: move on. After a 'break' the list has
                    # shifted and idxA already points at the next entry.
                    idxA += 1

            decompositionEntries[char][glyph] = set(decompositions)
def __init__(self, options, args):
    """
    Store locale and character domain from C{options} and create the
    character lookup for them.  C{args} is not used.
    """
    charLocale = options.locale
    charDomain = options.characterDomain
    self._locale, self._characterDomain = charLocale, charDomain
    self._cjk = CharacterLookup(charLocale, charDomain)
class StrokeChecker(object):
    """
    Reports for which characters of a character domain a stroke order is
    available, and which components are missing for the others.
    """

    ALLOWED_COMPONENT_STRUCTURE = [u'⿰', u'⿱', u'⿵', u'⿶', u'⿸', u'⿹', u'⿺',
        u'⿲', u'⿳']
    """
    Component structures that allow derivation of stroke order from
    components.
    """

    MIN_COMPONENT_PRODUCTIVITY = 2
    """
    Min productivity when reporting out-domain components that could help
    boost the in-domain set.
    """

    def __init__(self, options, args):
        """
        Store locale and character domain from C{options} and create the
        character lookup for them.  C{args} is not used.
        """
        self._locale = options.locale
        self._characterDomain = options.characterDomain
        self._cjk = CharacterLookup(self._locale, self._characterDomain)

    def run(self):
        """
        Check every character of the domain and print a report to stdout:
        totals, characters missing on their own, and missing components
        together with the characters they would unlock.
        """
        charCount = 0
        charFullCount = 0

        # component -> characters that lack a stroke order because of it
        missingCharsDict = {}
        # characters without stroke order and without any reported component
        missingSingleCharacters = []
        # iterate through all characters of the character set
        for char in self._cjk.getDomainCharacterIterator():
            charCount += 1
            # progress indicator
            if charCount % 100 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()

            hasFullOrder, missingChars = self.checkStrokeOrder(char)

            if hasFullOrder:
                charFullCount += 1
            else:
                if missingChars:
                    # list components that can help us build this transform.
                    for missing in missingChars:
                        if missing not in missingCharsDict:
                            missingCharsDict[missing] = []
                        missingCharsDict[missing].append(char)
                else:
                    missingSingleCharacters.append(char)

        sys.stdout.write('\n')

        output_encoding = sys.stdout.encoding \
            or locale.getpreferredencoding() or 'ascii'

        # An empty domain used to end in ZeroDivisionError here.
        if charCount:
            fullPercentage = 100 * charFullCount // charCount
        else:
            fullPercentage = 0

        print('Total characters: %d' % charCount)
        print('Characters with full stroke data: %d (%d%%)'
            % (charFullCount, fullPercentage))

        # missing single characters
        # Extend by those with components, that have a component with low
        # productivity.
        inDomainComponents = set(
            self._cjk.filterDomainCharacters(missingCharsDict.keys()))

        lowProductivityComponentChars = []
        # iterate over a snapshot, entries are deleted inside the loop
        for component, chars in list(missingCharsDict.items()):
            if component not in inDomainComponents \
                and len(chars) < self.MIN_COMPONENT_PRODUCTIVITY:
                lowProductivityComponentChars.extend(chars)
                del missingCharsDict[component]
        missingSingleCharacters.extend(lowProductivityComponentChars)

        # label and encoded list end up on one line, separated by a space
        sys.stdout.write('Missing single characters: ')
        print(''.join(missingSingleCharacters).encode(output_encoding,
            'replace'))

        # remove characters that we already placed in "single"
        _missingSingleCharacters = set(missingSingleCharacters)
        for component, chars in list(missingCharsDict.items()):
            missingCharsDict[component] = list(
                set(chars) - _missingSingleCharacters)
            if not missingCharsDict[component]:
                del missingCharsDict[component]

        # missing components, most productive first
        missingComponents = sorted(missingCharsDict.items(),
            key=lambda item: len(item[1]))
        missingComponents.reverse()

        inDomainComponentList = [(component, chars) \
            for component, chars in missingComponents \
            if component in inDomainComponents]
        # only show "out-domain" components if they have productivity > 1
        outDomainComponentList = [(component, chars) \
            for component, chars in missingComponents \
            if component not in inDomainComponents and len(chars) > 1]

        print('Missing components: %d' % (len(inDomainComponentList) \
            + len(outDomainComponentList)))
        sys.stdout.write('Missing in-domain components: ')
        print(', '.join(['%s (%s)' % (component, ''.join(chars)) \
            for component, chars in inDomainComponentList])\
            .encode(output_encoding, 'replace'))
        sys.stdout.write('Missing out-domain components: ')
        print(', '.join(['%s (%s)' % (component, ''.join(chars)) \
            for component, chars in outDomainComponentList])\
            .encode(output_encoding, 'replace'))

    def checkStrokeOrder(self, char, glyph=None):
        """
        Check whether a stroke order can be obtained for a character.

        @return: C{(True, [])} if the lookup returns a stroke order,
            otherwise C{(False, missingChars)} with the components reported
            missing by the character's decompositions
        """
        try:
            self._cjk.getStrokeOrder(char, glyph)
            return True, []
        except NoInformationError:
            pass

        # collect what is missing from every known decomposition
        missingChars = []
        decompositions = self._cjk.getDecompositionEntries(char, glyph)
        for decomposition in decompositions:
            hasFullOrder, _, missing = self._checkStrokeOrderFromDecomposition(
                decomposition)
            # the lookup above failed, so no decomposition is expected to
            # be complete
            assert not hasFullOrder
            missingChars.extend(missing)

        return False, missingChars

    def _checkStrokeOrderFromDecomposition(self, decomposition, index=0):
        """
        Walk one subtree of a decomposition and report missing stroke
        orders.

        @param decomposition: sequence mixing IDS operator characters and
            C{(char, glyph)} tuples
        @param index: position of the subtree's first element
        @return: tuple C{(hasFullOrder, lastIndex, missingChars)} where
            C{lastIndex} is the index of the last element visited
        """
        element = decomposition[index]

        if type(element) == type(()):
            # no IDS operator but character
            char, glyph = element
            if char == u'?':
                # unknown component: no order, nothing specific to report
                return False, index, []
            # recursion
            fullOrder, missingChars = self.checkStrokeOrder(char, glyph)
            if not (fullOrder or missingChars):
                missingChars = [char]
            return fullOrder, index, missingChars

        # IDS operator: determine how many operand subtrees follow
        if CharacterLookup.isBinaryIDSOperator(element):
            # check for IDS operators we can't make any order
            # assumption about
            if element not in self.ALLOWED_COMPONENT_STRUCTURE:
                return False, index, []
            operandCount = 2
        elif CharacterLookup.isTrinaryIDSOperator(element):
            operandCount = 3
        else:
            assert False, 'not an IDS character'
            operandCount = 0

        collectedMissing = []
        everyOperandFull = True
        for _ in range(operandCount):
            operandFull, index, operandMissing \
                = self._checkStrokeOrderFromDecomposition(decomposition,
                    index + 1)
            if not operandFull:
                collectedMissing.extend(operandMissing)
            everyOperandFull = everyOperandFull and operandFull
        return everyOperandFull, index, collectedMissing
def __init__(self, variant='T'):
    """
    Remember the variant type to map to and create a character lookup for
    locale 'T'.
    """
    self.variant = variant
    self.characterLookup = CharacterLookup('T')
def main():
    """
    Cross-check a radical table read from standard input against cjklib.

    Each data line has the form C{<index>['];  <hex code point>; <hex code
    point>}: radical index (an apostrophe marks a variant entry), radical
    form and equivalent character.  Lines that are blank or start with C{#}
    are skipped.  Every disagreement between table and library is printed,
    followed by a summary line.
    """
    cjk = CharacterLookup('T')
    cjkSimplified = CharacterLookup('C')

    fileEntryCount = 0
    databaseMissingEntryCount = 0
    noEntryCount = 0
    wrongEquivalentCount = 0
    seenRadicalFormIndices = set()
    seenRadicalVariantIndices = set()

    def simplifiedOnlyForms(radicalIdx):
        """Radical forms known for locale 'C' but not for locale 'T'."""
        forms = set()
        # add simplified form, if different
        simplifiedForm = cjkSimplified.getKangxiRadicalForm(radicalIdx)
        if simplifiedForm != cjk.getKangxiRadicalForm(radicalIdx):
            forms.add(simplifiedForm)
        # add simplified variant
        forms.update(
            set(cjkSimplified.getKangxiRadicalVariantForms(radicalIdx))
            - set(cjk.getKangxiRadicalVariantForms(radicalIdx)))
        return forms

    for line in sys.stdin:
        # Text streams already yield str; calling decode() on it raised
        # AttributeError. Only decode if stdin was swapped for a byte stream.
        if isinstance(line, bytes):
            line = line.decode(default_encoding)

        if re.match(r'\s*#', line) or re.match(r'\s+$', line):
            continue
        fileEntryCount = fileEntryCount + 1

        matchObj = re.match(r"(\d{1,3})('?);\s+([1234567890ABCDEF]{4,5});" \
            + r"\s+([1234567890ABCDEF]{4,5})\s*$", line)
        if not matchObj:
            print("error reading line: '" + line + "'")
            continue

        index, variant, radicalCP, equivalentCP = matchObj.groups()
        radicalIdx = int(index)
        radicalForm = chr(int(radicalCP, 16))
        equivalentForm = chr(int(equivalentCP, 16))
        if variant:
            seenRadicalVariantIndices.add(radicalIdx)
        else:
            seenRadicalFormIndices.add(radicalIdx)

        # check radicalForm
        if not variant:
            targetForms = set([cjk.getKangxiRadicalForm(radicalIdx)])
        else:
            targetForms = simplifiedOnlyForms(radicalIdx)

        # Messages are printed as text: printing the encoded bytes produced
        # b'...' representations instead of readable output.
        if radicalForm not in targetForms:
            # cjklib is missing something
            print("No entry for radical form '%s' with index %d%s"
                % (radicalForm, radicalIdx, variant))
            databaseMissingEntryCount += 1
        # CJKRadicals.txt is missing something
        for form in targetForms - set([radicalForm]):
            print("Database entry '%s' with radical index %d%s" \
                % (form, radicalIdx, variant) \
                + " not included in table")
            noEntryCount += 1

        # check equivalentForm
        libraryEquivalentForm \
            = cjk.getRadicalFormEquivalentCharacter(radicalForm)
        if libraryEquivalentForm != equivalentForm:
            print("Equivalent radical form '%s' with index %d%s"
                % (libraryEquivalentForm, radicalIdx, variant) \
                + " not backed by table: '%s'" % equivalentForm)
            wrongEquivalentCount += 1

    # radical indices the table never mentioned
    for radicalIdx in set(range(1, 215)) - seenRadicalFormIndices:
        print("No table entry for radical index %d" % radicalIdx)
        noEntryCount += 1

    for radicalIdx in set(range(1, 215)) - seenRadicalVariantIndices:
        for form in simplifiedOnlyForms(radicalIdx):
            print("No table entry for simplified radical %s with index %d'"
                % (form, radicalIdx))
            noEntryCount += 1

    # variants known for locale 'T' only
    for radicalIdx in range(1, 215):
        otherVariants = set(cjk.getKangxiRadicalVariantForms(radicalIdx)) \
            - set(cjkSimplified.getKangxiRadicalVariantForms(radicalIdx))
        for form in otherVariants:
            print("No table entry for variant %s with index %d'"
                % (form, radicalIdx))
            noEntryCount += 1

    print("Total %d entries" % fileEntryCount \
        + ", %d missing from cjklib" % databaseMissingEntryCount \
        + ", %d mismatches in equivalent forms" % wrongEquivalentCount \
        + ", not found in source list: %d" % noEntryCount)
def main(): cjk = CharacterLookup('T') cjkSimplified = CharacterLookup('C') fileEntryCount = 0 databaseMissingEntryCount = 0 noEntryCount = 0 wrongEquivalentCount = 0 seenRadicalFormIndices = set() seenRadicalVariantIndices = set() for line in sys.stdin: line = line.decode(default_encoding) if re.match(r'\s*#', line) or re.match(r'\s+$', line): continue else: fileEntryCount = fileEntryCount + 1 matchObj = re.match(r"(\d{1,3})('?);\s+([1234567890ABCDEF]{4,5});" \ + r"\s+([1234567890ABCDEF]{4,5})\s*$", line) if matchObj: index, variant, radicalCP, equivalentCP = matchObj.groups() radicalIdx = int(index) radicalForm = unichr(int(radicalCP, 16)) equivalentForm = unichr(int(equivalentCP, 16)) if variant: seenRadicalVariantIndices.add(radicalIdx) else: seenRadicalFormIndices.add(radicalIdx) # check radicalForm if not variant: targetForms = set([cjk.getKangxiRadicalForm(radicalIdx)]) else: targetForms = set() # add simplified form, if different simplifiedForm = cjkSimplified.getKangxiRadicalForm( radicalIdx) if simplifiedForm != cjk.getKangxiRadicalForm(radicalIdx): targetForms.add(simplifiedForm) # add simplified variant targetForms.update( set(cjkSimplified.getKangxiRadicalVariantForms( radicalIdx)) \ - set(cjk.getKangxiRadicalVariantForms(radicalIdx))) if radicalForm not in targetForms: # cjklib is missing something print ("No entry for radical form '%s' with index %d%s" % (radicalForm, radicalIdx, variant))\ .encode(default_encoding) databaseMissingEntryCount += 1 if targetForms - set([radicalForm]): # CJKRadicals.txt is missing something for form in targetForms - set([radicalForm]): print ("Database entry '%s' with radical index %d%s" \ % (form, radicalIdx, variant) \ + " not included in table")\ .encode(default_encoding) noEntryCount += 1 # check equivalentForm libraryEquivalentForm \ = cjk.getRadicalFormEquivalentCharacter(radicalForm) if libraryEquivalentForm != equivalentForm: print ("Equivalent radical form '%s' with index %d%s" % (libraryEquivalentForm, 
radicalIdx, variant) \ + " not backed by table: '%s'" % equivalentForm)\ .encode(default_encoding) wrongEquivalentCount += 1 else: print ("error reading line: '" + line + "'")\ .encode(default_encoding) for radicalIdx in set(range(1, 215)) - seenRadicalFormIndices: print ("No table entry for radical index %d" % radicalIdx)\ .encode(default_encoding) noEntryCount += 1 for radicalIdx in set(range(1, 215)) - seenRadicalVariantIndices: simplifiedForms = set() # add simplified form, if different simplifiedForm = cjkSimplified.getKangxiRadicalForm( radicalIdx) if simplifiedForm != cjk.getKangxiRadicalForm(radicalIdx): simplifiedForms.add(simplifiedForm) # add simplified variant simplifiedForms.update( set(cjkSimplified.getKangxiRadicalVariantForms( radicalIdx)) \ - set(cjk.getKangxiRadicalVariantForms(radicalIdx))) for form in simplifiedForms: print ("No table entry for simplified radical %s with index %d'" % (form, radicalIdx)).encode(default_encoding) noEntryCount += 1 for radicalIdx in range(1, 215): otherVariants = set(cjk.getKangxiRadicalVariantForms(radicalIdx)) \ - set(cjkSimplified.getKangxiRadicalVariantForms(radicalIdx)) for form in otherVariants: print ("No table entry for variant %s with index %d'" % (form, radicalIdx)).encode(default_encoding) noEntryCount += 1 print "Total %d entries" % fileEntryCount \ + ", %d missing from cjklib" % databaseMissingEntryCount \ + ", %d mismatches in equivalent forms" % wrongEquivalentCount \ + ", not found in source list: %d" % noEntryCount
def _characterLookup(cls): if not hasattr(cls, '_cjk'): cls._cjk = CharacterLookup('T', 'Unicode') return cls._cjk
def _removePseudoCharacters(self, decompositionEntries, flagEntries): """ Removes all pseudo character entries and subsitutes their occurence by their own entries. """ def substitutePseudoCharacters(decomposition): newDecomposition = [] for c in decomposition: if type(c) != type(()): # IDS newDecomposition.append([[c]]) else: char, _ = c if type(char) == type(0): if c in pseudoCharacterMap: # get all decompositions of this pseudo character newPseudoDecomp = [] for decomp in pseudoCharacterMap[c]: newDecomps = substitutePseudoCharacters(decomp) if newDecomps: newPseudoDecomp.extend(newDecomps) newDecomposition.append(newPseudoDecomp) else: return else: # normal char newDecomposition.append([[c]]) # all combinations of sub-decompositions flatDecomp = set() for newDecomp in cross(*newDecomposition): flatEntry = [] for entry in newDecomp: flatEntry.extend(entry) flatDecomp.add(tuple(flatEntry)) return flatDecomp # find pseude characters first pseudoCharacterMap = {} for char in decompositionEntries: if type(char) == type(0): for glyph in decompositionEntries[char]: pseudoCharacterMap[(char, glyph)] = decompositionEntries[char][glyph] # now apply newDecompositionsEntries = {} newFlagEntries = {} for char in decompositionEntries: if type(char) == type(0): continue newDecompositionsEntries[char] = {} newFlagEntries[char] = {} for glyph in decompositionEntries[char]: newDecompositionsEntries[char][glyph] = set() newFlagEntries[char][glyph] = {} for decomposition in decompositionEntries[char][glyph]: newDecompositions = substitutePseudoCharacters(decomposition) if newDecompositions: newDecompositionsEntries[char][glyph].update(newDecompositions) # transfer flags for newDecomposition in newDecompositions: newFlagEntries[char][glyph][newDecomposition] = flagEntries[char][glyph][decomposition] elif not self.quiet: print >>sys.stderr, ( "Unable to resolve decomposition" + " with pseudo character for '%s': " % char + CharacterLookup.decompositionToString(decomposition) 
).encode(default_encoding) return newDecompositionsEntries, newFlagEntries
def compareTrees(decompositionA, decompositionB):
    """
    Compare two decomposition sequences, tolerating unknown components.

    Returns C{None} if the trees are not similar, otherwise an integer:
    negative if the left tree (decompositionA) should be preferred,
    positive if the right tree (decompositionB) should be preferred, and
    C{0} if both are equally good.

    Raises C{ValueError} if exactly one of the sequences is exhausted.
    """
    if not decompositionA and not decompositionB:
        # both consumed completely: equal
        return 0
    if not decompositionA or not decompositionB:
        # if all preceding components are the same that shouldn't happen
        raise ValueError()

    headA = decompositionA[0]
    headB = decompositionB[0]
    if headA == headB:
        return compareTrees(decompositionA[1:], decompositionB[1:])

    tupleType = type(())

    if type(headA) == tupleType and headA[0] == u"?":
        # unknown component on the left: skip a whole component on the right
        remainderB = consumeComponent(decompositionB)
        outcome = compareTrees(decompositionA[1:], remainderB)
        if outcome is None or outcome < 0:
            # unequal or the left side is preferred later on
            return None
        return +1

    if type(headB) == tupleType and headB[0] == u"?":
        # unknown component on the right: skip a whole component on the left
        remainderA = consumeComponent(decompositionA)
        outcome = compareTrees(remainderA, decompositionB[1:])
        if outcome is None or outcome > 0:
            # unequal or the right side is preferred later on
            return None
        return -1

    operatorA = CharacterLookup.isIDSOperator(headA)
    if operatorA and CharacterLookup.isIDSOperator(headB):
        # No way these decompositions can be equal
        # (simplified subseq. checking)
        return None

    if operatorA:
        # expand tree B
        char, glyph = headB
        if char not in decompositionEntries \
            or glyph not in decompositionEntries[char]:
            return None
        for expansion in decompositionEntries[char][glyph]:
            outcome = compareTrees(decompositionA,
                expansion + decompositionB[1:])
            if outcome is not None and outcome >= 0:
                # right side preferred and so do we...
                # A shorter description is better
                return 1
        return None

    if CharacterLookup.isIDSOperator(headB):
        # expand tree A
        char, glyph = headA
        if char not in decompositionEntries \
            or glyph not in decompositionEntries[char]:
            return None
        for expansion in decompositionEntries[char][glyph]:
            outcome = compareTrees(expansion + decompositionA[1:],
                decompositionB)
            if outcome is not None and outcome <= 0:
                # left side preferred and so do we...
                # A shorter description is better
                return -1
        return None

    return None
def _mergeSimilarDecompositions(self, decompositionEntries, flagEntries):
    """
    Merges two decompositions, if they are the same, except:
        - one has an unknown component while the other doesn't,
        - one has a subtree that is the decomposition of the corresponding
          component of the other decomposition.

    For every character and glyph all decompositions are compared pairwise
    using C{compareTrees()}. Of two similar decompositions only the
    preferred one is kept; if both are equally good the flags of the
    dropped one are merged into the kept one.

    Both C{decompositionEntries} and C{flagEntries} are modified in place;
    the code shown here has no return statement.
    """
    def consumeComponent(decomposition):
        """
        Consumes a component on the top level, e.g. for 㐯, C{⿱⿱亠吕香}
        consumes C{⿱亠吕} when given the partial decomposition C{⿱亠吕香}.

        Returns the part of the sequence following the consumed component.
        If the first element is neither a tuple nor a binary or trinary IDS
        operator, no branch matches and C{None} is returned implicitly.
        """
        if type(decomposition[0]) == type(()):
            # consume one component
            return decomposition[1:]

        if CharacterLookup.isBinaryIDSOperator(decomposition[0]):
            # operator followed by two components: consume both in turn
            decomposition = consumeComponent(decomposition[1:])
            return consumeComponent(decomposition)
        elif CharacterLookup.isTrinaryIDSOperator(decomposition[0]):
            # operator followed by three components: consume all three
            decomposition = consumeComponent(decomposition[1:])
            decomposition = consumeComponent(decomposition)
            return consumeComponent(decomposition)

    def compareTrees(decompositionA, decompositionB):
        """
        Checks for similar decomposition trees, taking care of unknown
        components.

        Returns C{None} if the trees are not equal, an integer if the trees
        are similar. If the left tree (decompositionA) should be preferred
        a negative number is returned, or a positive number for the right
        tree (decompositionB). If C{0} is returned, both trees are equally
        good to choose from.

        Raises C{ValueError} if only one of the two sequences is exhausted.
        """
        if not decompositionA and not decompositionB:
            # equal
            return 0
        elif not decompositionA or not decompositionB:
            # if all preceding components are the same that shouldn't happen
            raise ValueError()
        elif decompositionA[0] == decompositionB[0]:
            return compareTrees(decompositionA[1:], decompositionB[1:])
        elif type(decompositionA[0]) == type(()) and decompositionA[0][0] == u"?":
            # unknown component on the left: skip the matching component on
            # the right and compare the remainders
            decompositionB = consumeComponent(decompositionB)
            result = compareTrees(decompositionA[1:], decompositionB)
            if result is None or result < 0:
                # unequal or the left side is preferred later on
                return None
            else:
                return +1
        elif type(decompositionB[0]) == type(()) and decompositionB[0][0] == u"?":
            # unknown component on the right: mirror image of the case above
            decompositionA = consumeComponent(decompositionA)
            result = compareTrees(decompositionA, decompositionB[1:])
            if result is None or result > 0:
                # unequal or the right side is preferred later on
                return None
            else:
                return -1
        elif CharacterLookup.isIDSOperator(decompositionA[0]) and CharacterLookup.isIDSOperator(decompositionB[0]):
            # No way these decompositions can be equal
            # (simplified subseq. checking)
            return None
        elif CharacterLookup.isIDSOperator(decompositionA[0]):
            # expand tree B
            char, glyph = decompositionB[0]
            if char in decompositionEntries and glyph in decompositionEntries[char]:
                for decomposition in decompositionEntries[char][glyph]:
                    result = compareTrees(decompositionA, decomposition + decompositionB[1:])
                    if result is not None and result >= 0:
                        # right side preferred and so do we...
                        # A shorter description is better
                        return 1
            return None
        elif CharacterLookup.isIDSOperator(decompositionB[0]):
            # expand tree A
            char, glyph = decompositionA[0]
            if char in decompositionEntries and glyph in decompositionEntries[char]:
                for decomposition in decompositionEntries[char][glyph]:
                    result = compareTrees(decomposition + decompositionA[1:], decompositionB)
                    if result is not None and result <= 0:
                        # left side preferred and so do we...
                        # A shorter description is better
                        return -1
            return None
        else:
            return None

    for char in decompositionEntries:
        for glyph in decompositionEntries[char]:
            idxA = 0
            # work on a list copy so entries can be deleted by index
            decompositions = list(decompositionEntries[char][glyph])
            flagsDict = flagEntries[char][glyph]
            # Check every decomposition with all others to the right
            while idxA < len(decompositions):
                idxB = idxA + 1
                while idxB < len(decompositions):
                    try:
                        result = compareTrees(decompositions[idxA], decompositions[idxB])
                        if result is not None and result == 0:
                            # Entries are equal, we can transfer flags
                            # NOTE(review): flag values need an update()
                            # method (set or dict) - confirm against caller
                            flagsDict[decompositions[idxA]].update(flagsDict[decompositions[idxB]])
                            del flagsDict[decompositions[idxB]]
                            del decompositions[idxB]
                        elif result is not None and result < 0:
                            # left entry preferred: drop the right one
                            del flagsDict[decompositions[idxB]]
                            del decompositions[idxB]
                        elif result is not None and result > 0:
                            # right entry preferred: drop the left one
                            del flagsDict[decompositions[idxA]]
                            del decompositions[idxA]
                            # No need for further testing for this decomp
                            break
                        else:
                            # Only increase if the list didn't shift to the
                            # left
                            idxB += 1
                    except ValueError:
                        # comparison failed: report it and go on with the
                        # next candidate
                        print >>sys.stderr, (
                            "Error comparing decompositions %s and %s" % (
                                CharacterLookup.decompositionToString(decompositions[idxA]),
                                CharacterLookup.decompositionToString(decompositions[idxB]),
                            )
                        ).encode(default_encoding)
                        idxB += 1
                else:
                    # inner loop ended without break, so the entry at idxA
                    # survived all comparisons; after a break the list has
                    # shifted and idxA already points to the next entry
                    idxA += 1

            # store the surviving decompositions for this glyph
            decompositionEntries[char][glyph] = set(decompositions)
import requests import codecs import gevent from gevent import monkey monkey.patch_all() from bs4 import BeautifulSoup # 汉字拼音识别 from pypinyin import pinyin, lazy_pinyin, Style # 笔划数识别 from cjklib.characterlookup import CharacterLookup cjk = CharacterLookup('C') # 汉字偏旁识别 from lib.component import * reload(sys) sys.setdefaultencoding("utf-8") # 代理配置 proxies = {} class BabyName(): def __init__(self, config={}, name_dict={},
def _removePseudoCharacters(self, decompositionEntries, flagEntries): """ Removes all pseudo character entries and subsitutes their occurence by their own entries. """ def substitutePseudoCharacters(decomposition): newDecomposition = [] for c in decomposition: if type(c) != type(()): # IDS newDecomposition.append([[c]]) else: char, _ = c if type(char) == type(0): if c in pseudoCharacterMap: # get all decompositions of this pseudo character newPseudoDecomp = [] for decomp in pseudoCharacterMap[c]: newDecomps = substitutePseudoCharacters(decomp) if newDecomps: newPseudoDecomp.extend(newDecomps) newDecomposition.append(newPseudoDecomp) else: return else: # normal char newDecomposition.append([[c]]) # all combinations of sub-decompositions flatDecomp = set() for newDecomp in cross(*newDecomposition): flatEntry = [] for entry in newDecomp: flatEntry.extend(entry) flatDecomp.add(tuple(flatEntry)) return flatDecomp # find pseude characters first pseudoCharacterMap = {} for char in decompositionEntries: if type(char) == type(0): for glyph in decompositionEntries[char]: pseudoCharacterMap[(char, glyph)] \ = decompositionEntries[char][glyph] # now apply newDecompositionsEntries = {} newFlagEntries = {} for char in decompositionEntries: if type(char) == type(0): continue newDecompositionsEntries[char] = {} newFlagEntries[char] = {} for glyph in decompositionEntries[char]: newDecompositionsEntries[char][glyph] = set() newFlagEntries[char][glyph] = {} for decomposition in decompositionEntries[char][glyph]: newDecompositions = substitutePseudoCharacters( decomposition) if newDecompositions: newDecompositionsEntries[char][glyph].update( newDecompositions) # transfer flags for newDecomposition in newDecompositions: newFlagEntries[char][glyph][newDecomposition] \ = flagEntries[char][glyph][decomposition] elif not self.quiet: print >> sys.stderr, ("Unable to resolve decomposition" + " with pseudo character for '%s': " % char + CharacterLookup.decompositionToString( decomposition))\ 
.encode(default_encoding) return newDecompositionsEntries, newFlagEntries
def compareTrees(decompositionA, decompositionB):
    """
    Decide whether two decomposition trees are similar.

    Unknown components (C{u'?'}) match any single component of the other
    tree, and a component may be matched against one of its own
    decompositions on the other side.

    Returns C{None} for trees that are not similar. Otherwise returns a
    negative integer if decompositionA should be preferred, a positive
    integer if decompositionB should be preferred, or C{0} if neither is
    better than the other.

    Raises C{ValueError} if one tree ends before the other.
    """
    def isUnknown(component):
        return type(component) == type(()) and component[0] == u'?'

    def knownDecompositions(component):
        # Stored decompositions of a (char, glyph) component, if any.
        char, glyph = component
        if (char in decompositionEntries
            and glyph in decompositionEntries[char]):
            return decompositionEntries[char][glyph]
        return ()

    if not decompositionA and not decompositionB:
        # equal
        return 0
    elif not decompositionA or not decompositionB:
        # if all preceding components are the same that shouldn't happen
        raise ValueError()

    firstA, firstB = decompositionA[0], decompositionB[0]
    restA, restB = decompositionA[1:], decompositionB[1:]

    if firstA == firstB:
        return compareTrees(restA, restB)
    elif isUnknown(firstA):
        result = compareTrees(restA, consumeComponent(decompositionB))
        # None: unequal; negative: the left side is preferred later on
        if result is not None and result >= 0:
            return +1
        return None
    elif isUnknown(firstB):
        result = compareTrees(consumeComponent(decompositionA), restB)
        # None: unequal; positive: the right side is preferred later on
        if result is not None and result <= 0:
            return -1
        return None
    elif (CharacterLookup.isIDSOperator(firstA)
        and CharacterLookup.isIDSOperator(firstB)):
        # No way these decompositions can be equal
        # (simplified subseq. checking)
        return None
    elif CharacterLookup.isIDSOperator(firstA):
        # expand tree B
        for candidate in knownDecompositions(firstB):
            result = compareTrees(decompositionA, candidate + restB)
            if result is not None and result >= 0:
                # right side preferred and so do we...
                # A shorter description is better
                return 1
        return None
    elif CharacterLookup.isIDSOperator(firstB):
        # expand tree A
        for candidate in knownDecompositions(firstA):
            result = compareTrees(candidate + restA, decompositionB)
            if result is not None and result <= 0:
                # left side preferred and so do we...
                # A shorter description is better
                return -1
        return None
    else:
        return None