def init_cangjie(self):
    """Build the Cangjie engine described by the current settings.

    Reads the ``version`` integer setting and resolves it to the matching
    ``cangjie.versions.CANGJIE<n>`` attribute, combines the filter flags
    selected by the ``include-*`` boolean settings, and stores the
    resulting ``cangjie.Cangjie`` instance on ``self.cangjie``.
    """
    version_number = self.settings.get_int("version")
    engine_version = getattr(cangjie.versions, "CANGJIE%d" % version_number)

    flags = cangjie.filters

    # These three filters are always enabled, regardless of the settings.
    filters = flags.BIG5 | flags.HKSCS | flags.PUNCTUATION

    # Optional filters: each boolean setting switches on one or more flags.
    # The settings are consulted in exactly this order.
    optional_filters = (
        ("include-allzh", ("CHINESE",)),
        ("include-jp", ("KANJI", "HIRAGANA", "KATAKANA")),
        ("include-zhuyin", ("ZHUYIN",)),
        ("include-symbols", ("SYMBOLS",)),
    )
    for setting_key, flag_names in optional_filters:
        if not self.settings.get_boolean(setting_key):
            continue
        for flag_name in flag_names:
            # Looked up only when the setting is on.
            filters |= getattr(flags, flag_name)

    self.cangjie = cangjie.Cangjie(engine_version, filters)
"""
Reverse character lookup: find the Cangjie code for a given character.

Building the table takes roughly 3-10 minutes.
Uses Debian's pycangjie library:
https://salsa.debian.org/input-method-team/pycangjie
"""
import cangjie
import numpy as np

# Cangjie version 5 engine, restricted to the CHINESE character filter.
cj = cangjie.Cangjie(cangjie.versions.CANGJIE5, cangjie.filters.CHINESE)

# ASCII codes of 'a'..'z' (97..122). Each entry stands for a single letter;
# the codes are converted to letters to build the string that is looked up.
lookup_list = list(range(ord("a"), ord("z") + 1))

# Indices into lookup_list. The lookup string is built by converting each
# indexed code in lookup_list to its letter; that string is then used to
# find the matching characters.
combine_list = [0]

# Chinese characters span the Unicode code points 19968 -> 195103.
# Cangjie may not cover the entire range, but better be sure.
FIRST_CODE_POINT = 19968
LAST_CODE_POINT = 195103
# Both ends are inclusive, hence the "+ 1" (a plain difference is one short).
TABLE_SIZE = LAST_CODE_POINT - FIRST_CODE_POINT + 1

# NumPy arrays are used to save space. They are zero-initialised so that
# slots for characters Cangjie does not cover hold '' / 0 rather than
# whatever happened to be in the uninitialised memory.
lookup_table = np.zeros(TABLE_SIZE, dtype="<U5")  # codes of up to 5 letters
lookup_freqs = np.zeros(TABLE_SIZE, dtype=int)

character_count = 0
def setUp(self):
    """Create the Cangjie engine used by this object.

    Instantiates ``cangjie.Cangjie`` with the ``version`` and ``language``
    attributes of ``self`` and stores the result on ``self.cj``.
    """
    engine_factory = cangjie.Cangjie
    self.cj = engine_factory(self.version, self.language)