def get_simp_chars(char_data, Char, DTradChars=None):
    """
    Collect candidate Simplified/variant forms for a single character.

    char_data  -- object exposing raw_data(key, ord) for the Unihan
                  variant properties queried below
    Char       -- the character, either as a string or as an int (an int
                  is converted with w_unichr first)
    DTradChars -- optional container of characters treated as Traditional.
                  When truthy, the Unihan variant properties are consulted
                  AND every candidate found in it is removed from the result

    Returns rem_dupes() applied to the candidate list (presumably the list
    with duplicates removed -- rem_dupes is defined elsewhere). The output
    of the 'Chinese Traditional-Simplified' transliteration engine is always
    one of the candidates before filtering.

    Side effect: prints a debug tuple for Unihan variants that are found.
    """
    if isinstance(Char, int):
        Char = w_unichr(Char)

    LSubChars = []
    if DTradChars:
        # convert the converted characters/radicals Traditional-Simplified
        LVariants = [
            'unihan.simplifiedvariant',
            'unihan.semanticvariant',  # CHECK ME!
            'unihan.specializedsemanticvariant',  # CHECK ME!
            'unihan.zvariant',
        ]
        CharOrd = w_ord(Char)  # loop-invariant, so computed once

        for Variant in LVariants:
            V = char_data.raw_data(Variant, CharOrd)
            if V:
                LSubChars += [w_unichr(i) for i in V]
                # NOTE(review): the source layout was lost, so the nesting
                # of this debug print (inside `if V:`) is reconstructed.
                print(('Char:', Char.encode('utf-8'),
                       'LSubChars:', ''.join(LSubChars).encode('utf-8'),
                       'V:', V))

    # Use the transliteration system to convert from T-S,
    # only allowing if different and MultiRad data not
    # already available for that character
    from multi_translit.translit.my_engine.TranslitEngine import get_engine
    TradToSimp = get_engine('Chinese Traditional-Simplified')
    LSubChars.append(TradToSimp.convert(Char))

    # HACK: Make sure characters that are already
    # Traditional aren't converted so information isn't lost!
    # NOTE: This shouldn't be performed for RADICAL conversion,
    # just the CHARS as it can stuff up the radical lists :-(
    if DTradChars:
        LSubChars = [i for i in LSubChars if i not in DTradChars]
    return rem_dupes(LSubChars)
def get_L_words(fISOCode, fVariant, Word, Deinflect=False):
    """
    Build the tuple of lookup forms for `Word` in the given language.

    The first element is always `Word` itself; the following elements are
    language-specific normalised/"fuzzy" variants.

    fISOCode  -- language code; 'cmn'/'yue', 'tha' and 'jpn' get special
                 handling, anything else goes through filter_accents()
    fVariant  -- not used by this function
    Word      -- the text to derive lookup forms from
    Deinflect -- switches to the alternative processing described below

    Returns a tuple of strings:
      * 'cmn'/'yue': (Word, R), where R is a heavily collapsed romanisation
        key, but only if Deinflect is set or Word is longer than 5
        characters; otherwise (Word,)
      * 'tha': (Word, filter_thai_accents(Word))
      * 'jpn': (Word, Word without underscores) followed by either the
        colloquial "-ee" -> "-ai"-style rewrites (Deinflect false) or
        candidates built from the kanji's kun readings (Deinflect true)
      * anything else: (Word, filter_accents(Word))

    Side effect: the 'jpn' branch prints the ';'-joined result, UTF-8 encoded.
    """
    if fISOCode in ('cmn', 'yue'):
        # NOTE: Filtering Chinese accents is probably a bad idea
        # as say "pin" could have multiple headers, so I think
        # it's best to filter the accents at e.g. a CEDict level
        # and sort by character frequency :-P

        # Replace commonly confused PinYin
        # combinations by Westerners :-P
        # TODO: Should this be in "Deinflect" (or a separate mode?)
        if Deinflect or len(Word) > 5:
            # TODO: What about "similar" mode?
            #
            # The replacements below run strictly in sequence, so each one
            # sees the output of all earlier ones (e.g. 'y' -> 'i' followed
            # by 'i' -> 'u' turns every 'y' into 'u'). Do not reorder.
            R = Word.replace('r', 'l')
            if R.endswith('dz'):
                R = R[:-2] + 'zi'  # Yale Handz (Hanzi) HACK!
            if R.endswith('z'):
                R = R[:-1] + 'i'
            R = R.replace('e', 'a')
            R = R.replace('o', 'u')
            R = R.replace('d', 't')
            R = R.replace('y', 'i')  # OPEN ISSUE: Should this be -> ts?
            R = R.replace(
                'i', 'u')  # CONTROVERSIAL - PinYin "i" often sounds like "u"!

            # replace "ch"-related sounds
            R = R.replace('ch', 'q')
            R = R.replace('j', 'q')
            R = R.replace('zh', 'q')
            R = R.replace('sh', 'q')  # CONTROVERSIAL!
            R = R.replace('x', 'q')  # OPEN ISSUE: Should this be "s" or "c"?

            # replace "ts"-related sounds
            #R = R.replace('c', 's') # CONTROVERSIAL!
            R = R.replace('z', 'c')  # Sometimes sounds like "ts" as in "Hanzi"?
            R = R.replace('ts', 'c')
            R = R.replace('s', 'q')
            return (Word, R)  #remove_tones(Word))
        else:
            # Short words outside Deinflect mode are looked up verbatim only
            return (Word, )
    #elif fISOCode == 'vie':
        #return (Word, filter_vie_accents(Word, False), filter_vie_accents(Word, True))
    elif fISOCode == 'tha':
        return (Word, filter_thai_accents(Word))
    elif fISOCode == 'jpn':
        # Add Katakana -> Hiragana in no accents mode
        # (the conversion itself is currently disabled, so iConv is Word)
        iConv = Word  # ''.join([Conv(i) for i in Word])

        # Use romaji for more accurate similar results
        #Latin = HiraToRomaji(KataToRomaji(iConv)) # TODO: What if the Latin is WRONG?
        #Latin = '^%s^' % Latin # Make sure Latin titles aren't confused!

        # Spaces are replaced to fix startswith/endswith queries
        # (what is actually removed here is the underscore character)
        NoSpaces = iConv.replace('_', '')
        LRtn = (
            Word,
            NoSpaces,
            #Latin.lower().replace('_', '')
        )
        # Keep the unmodified form; NoSpaces is rewritten in place below
        oNoSpaces = NoSpaces

        if not Deinflect and NoSpaces:
            # Masculine Japanese HACK -
            # Converts "uzee" into "uzai" etc
            #print 'NoSpaces:', NoSpaces.encode('utf-8')
            NoSpaces = unicodedata.normalize('NFC', str(NoSpaces))  # HACK!
            # Override is true when the word ends in 'ー' or 'ぇ' (membership
            # test against the two-character string) or contains 'しぇ'
            Override = NoSpaces[-1] in 'ーぇ' or 'しぇ' in NoSpaces

            # Fix shenshei (sensei) as used in Lucky Star (if I recall correctly) :-P
            NoSpaces = NoSpaces.replace('しぇ', 'せ')
            NoSpaces = NoSpaces.replace('ー', 'え')
            NoSpaces = NoSpaces.replace('ぇ', 'え')

            # Koeeee (Kowai) HACK!
            # Trim the tail until at most two trailing 'え' remain
            while NoSpaces[-3:] == 'えええ':
                NoSpaces = NoSpaces[:-1]

            # Fix Sugeee (Sugoi) etc
            # Trim a doubled trailing 'え' unless the character at index 1
            # is itself 'え'
            while NoSpaces[-2:] == 'ええ' and NoSpaces[1] != 'え':
                NoSpaces = NoSpaces[:-1]

            # Needs at least two characters, and either a trailing 'え'
            # preceded by a key of DAlt, or the Override condition above
            if len(NoSpaces) > 1 and (
                (NoSpaces[-1] == 'え' and NoSpaces[-2] in DAlt) or Override):
                if NoSpaces[-2] in DAlt:
                    # DAlt is defined elsewhere; its values are iterated
                    # below as the replacement characters to try
                    LAltChars = DAlt[NoSpaces[-2]]
                else:
                    # A single character, so the loop below runs once
                    LAltChars = NoSpaces[-2]

                for AltChar in LAltChars:
                    # Replace the last two characters with the alternative
                    Masculine = '%s%s' % (NoSpaces[:-2], AltChar)
                    Masculine = unicodedata.normalize('NFD',
                                                      Masculine)  # HACK!
                    Masculine = Masculine.replace(
                        'っ', '')  # Get rid of 'dekke-' (dekai) etc :-P

                    # NOTE(review): if Masculine ends up empty here (e.g. a
                    # two-character word whose AltChar is 'っ'), Masculine[0]
                    # would raise IndexError -- confirm whether DAlt/input
                    # rule that out.
                    if is_hanzi(Masculine[0]):
                        # Fix [Ko]wai -> [Kowa]i when first character Kanji
                        # (KanjiForm drops the character at index 1)
                        KanjiForm = Masculine[0] + Masculine[2:]
                        LRtn += ('%sい' % Masculine, '%sい' % KanjiForm)
                    else:
                        LRtn += ('%sい' % Masculine, )

        if Deinflect and NoSpaces:
            # In deinflect mode, look up possible stems in the character data
            from mscSentenceJpn import IsKana  # HACK!

            # Only applies to "one kanji followed by kana" words
            if is_hanzi(oNoSpaces[0]) and IsKana(oNoSpaces[1:]):
                LKun = CharData.raw_data('Japanese Kun', w_ord(oNoSpaces[0]))
                if LKun:
                    # Only the first item is used: NFD-normalised, '-'
                    # removed, then split on spaces into separate readings
                    LKun = unicodedata.normalize('NFD', LKun[0]).replace(
                        '-', '').split(' ')
                    # For readings containing '.', keep the segment after
                    # the first '.' and prefix it with the kanji
                    LExtend = [
                        '%s%s' % (oNoSpaces[0], i.split('.')[1]) for i in LKun
                        if '.' in i
                    ]
                    # Order by fastdist.distance() against the input,
                    # largest value first (negated sort key)
                    LExtend = [(fastdist.distance(oNoSpaces, i), i)
                               for i in LExtend]
                    LExtend.sort(key=lambda x: -x[0])
                    LRtn += tuple([i[1] for i in LExtend])

        # Debug output of the final candidate list
        print((';'.join(LRtn).encode('utf-8')))
        return LRtn
    else:
        Rtn = (Word, filter_accents(Word))
        #print 'get_L_words RTN:', Rtn
        return Rtn
def is_hanzi(S):
    """
    Return 1 if the code point w_ord(S) lies in the CJK Unified Ideographs
    block (U+4E00-U+9FFF) or in CJK Extension A (U+3400-U+4DBF), else 0.

    Other CJK extension blocks are not considered.
    """
    codepoint = w_ord(S)
    in_unified = 0x4E00 <= codepoint <= 0x9FFF
    in_extension_a = 0x3400 <= codepoint <= 0x4DBF
    return 1 if (in_unified or in_extension_a) else 0
def _format_data(self, ord_, data): if len(data) > 1: i_ord = ' '.join([get_uni_point(w_ord(i)) for i in data]) return '%s (%s)' % (data, i_ord) else: return None
def html_formatted(self, key, ord_):
    """
    Return the HTML-formatted value of property `key` for a character.

    `ord_` may be given as a string, in which case it is first converted
    with w_ord(); any other value is passed through unchanged. The work is
    delegated to the object returned by self.get_class_by_property(key).
    """
    codepoint = w_ord(ord_) if isinstance(ord_, str) else ord_
    return self.get_class_by_property(key).html_formatted(codepoint)
def raw_data(self, key, ord_):
    """
    Return the raw value of property `key` for a character.

    `ord_` may be given as a string, in which case it is first converted
    with w_ord(); any other value is passed through unchanged. The work is
    delegated to the object returned by self.get_class_by_property(key).
    """
    codepoint = w_ord(ord_) if isinstance(ord_, str) else ord_
    return self.get_class_by_property(key).raw_data(codepoint)
def open_radkfile(self):
    """
    Yield one ('multi_radicals', kanji_ord, [radical_ord, ...]) tuple for
    every entry of rad_k_file.DKanji, with the kanji and each of its
    radicals converted through w_ord().

    TODO: rad_k_file.DRads is not used here -- the original author left an
    open question about whether it should be indexed as well.
    """
    # Iterate over a snapshot: this is a generator, so DKanji could
    # otherwise be modified between two next() calls.
    for kanji, LRads in list(rad_k_file.DKanji.items()):
        yield 'multi_radicals', w_ord(kanji), [w_ord(i) for i in LRads]
def open_kanjidic_2(path):
    """
    Incrementally parse a Kanjidic2 XML file and yield one dict per character.

    path -- file name; a '.gz' extension (case-insensitive) is opened with
            gzip, anything else as a plain binary file

    Each yielded dict has 'codepoint' (w_ord of the <literal> text) plus
    list-valued entries collected from the character's child elements:
    'dicref_<dr_type>', 'freq', 'grade', 'meaning', 'meaning_<m_lang>',
    'reading_nanori', 'querycode_<qc_type>' (SKIP codes are dropped),
    'rad_name', 'rad_<rad_type>', 'reading_<r_type>', 'stroke_count',
    'crossref_<var_type>' and 'jlpt' (ints). All values other than 'jlpt'
    are strings.

    A character is emitted when the next <literal> is seen, and the final
    character is emitted once the document has been fully parsed.

    Side effects: prints the header version/date fields, a warning for
    "ignored" tags that unexpectedly carry attributes or text, and a
    warning for unknown tags.
    """
    opener = gzip.open if path.split('.')[-1].lower() == 'gz' else open

    D = {}
    # `with` closes the file even if the consumer abandons the generator
    # early or the parser raises part-way through.
    with opener(path, 'rb') as f:
        for _event, elem in iterparse(f, events=("end", )):
            tag = elem.tag

            if tag == 'literal':
                # A new character starts: yield the previous one (if any)
                if elem.text:
                    if D:
                        yield D
                    D = {'codepoint': w_ord(elem.text.strip())}

            elif tag in SIgnored:
                # Only children useful - ignored
                # But make sure it actually IS blank!
                if list(elem.keys()) or (elem.text and elem.text.strip()):
                    print(('tag Ignore Warning:', tag, list(elem.keys()),
                           elem.text))

            elif tag == 'database_version':
                # The Kanjidic database version - may as well print it
                print(('Kanjidic2 DB Version:', elem.text))

            elif tag == 'date_of_creation':
                # Likewise
                print(('Kanjidic2 Date of Creation:', elem.text))

            elif tag == 'file_version':
                # Likewise
                print(('f Version:', elem.text))

            elif tag == 'cp_value':
                # Codepoint values can be easily grabbed by
                # str.encode('utf-8') or str.encode('shift-jis'),
                # so they are not recorded
                pass

            elif tag == 'dic_ref':
                # e.g. Morohashi references
                dic_ref_type = 'dicref_%s' % elem.get('dr_type')
                if not elem.text or not elem.text.strip():
                    continue
                value = elem.text.strip()

                if dic_ref_type == 'dicref_moro':
                    # HACK: Convert to a string - see also `dicref_moro`
                    # in the `Indicies` variable in `IndiceBuilder`
                    value = '%s.%s.%s' % (value,
                                          elem.get('m_vol', '0'),
                                          elem.get('m_page', '0'))

                if dic_ref_type == 'dicref_busy_people':
                    # HACK: normalise a trailing '.A' to '.0'
                    if value.endswith('.A'):
                        value = '%s.0' % (value[:-2])

                D.setdefault(dic_ref_type, []).append(value)

            elif tag == 'freq':
                # Record the Japanese frequency
                if elem.text:
                    D.setdefault('freq', []).append(elem.text.strip())

            elif tag == 'grade':
                # Record the Japanese grade
                if elem.text:
                    D.setdefault('grade', []).append(elem.text.strip())

            elif tag == 'meaning':
                # WARNING: Specific meanings might be grouped with specific
                # readings with rmgroup! I don't think Kanjidic
                # differentiates between them *yet* though
                if 'm_lang' in list(elem.keys()):
                    # Explicit language attribute: a non-default language
                    key = 'meaning_%s' % elem.get('m_lang')
                else:
                    # No m_lang attribute: the default (English) meaning
                    key = 'meaning'
                # The key is created even when the element has no text
                LValues = D.setdefault(key, [])
                if elem.text:
                    LValues.append(elem.text.strip())

            elif tag == 'nanori':
                # Record the Japanese name readings
                LValues = D.setdefault('reading_nanori', [])
                if elem.text:
                    LValues.append(elem.text.strip())

            elif tag == 'q_code':
                # Input codes, e.g. SKIP and Four Corners
                # I've removed SKIP for now for licensing reasons
                query_code_type = elem.get('qc_type')
                if query_code_type == 'skip':
                    continue
                LValues = D.setdefault('querycode_%s' % query_code_type, [])
                if elem.text:
                    LValues.append(elem.text.strip())

            elif tag == 'rad_name':
                # Record the radical's name (if the character used
                # as a radical) in Japanese
                # TODO: This should be a StringData!
                # NOTE: unlike the other fields the text is stored unstripped
                LValues = D.setdefault('rad_name', [])
                if elem.text:
                    LValues.append(elem.text)

            elif tag == 'rad_value':
                LValues = D.setdefault('rad_%s' % elem.get('rad_type'), [])
                if elem.text:
                    LValues.append(elem.text.strip())

            elif tag == 'reading':
                # The reading, e.g. pinyin/ja_on etc
                LValues = D.setdefault('reading_%s' % elem.get('r_type'), [])
                if elem.text:
                    LValues.append(elem.text.strip())

            elif tag == 'stroke_count':
                # The total Japanese stroke count
                if elem.text:
                    D.setdefault('stroke_count', []).append(elem.text.strip())

            elif tag == 'variant':
                # Variant forms of this character
                if elem.text:
                    key = 'crossref_%s' % elem.get('var_type')
                    D.setdefault(key, []).append(elem.text.strip())

            elif tag == 'jlpt':
                # JLPT level; an empty element is skipped instead of
                # crashing on int(None.strip())
                LValues = D.setdefault('jlpt', [])
                if elem.text and elem.text.strip():
                    LValues.append(int(elem.text.strip()))

            else:
                print(('WARNING:', tag, elem))

    # The last character has no following <literal> to trigger its yield,
    # so emit it explicitly once parsing has finished.
    if D:
        yield D