def getChars(freqFile, startNo, endNo):
    """Read rows [startNo:endNo) of a tab-separated frequency file.

    Keeps columns 1, 4 and 5 of each row; the reading column (index 4 in
    the file, position 1 in the result) is split on '/' and each reading
    is converted to numeric-tone Pinyin, joined with trailing spaces.

    :param freqFile: path of the UTF-8 tab-separated frequency list
    :param startNo: first row to process (inclusive)
    :param endNo: last row to process (exclusive)
    :return: list of [character, readings string, extra] lists
    """
    chars = []
    reader = unicode_csv_reader(codecs.open(freqFile, 'rb', "utf-8"),
        dialect='excel-tab')
    frequencyList = [x for x in reader] #read the whole list
    frequencyList = frequencyList[startNo:endNo]
    # Fix: create the factory once instead of once per row; it was
    # rebuilt inside the loop, which is expensive and never necessary.
    pinyin = ReadingFactory()
    for row in frequencyList:
        templist = list(row[i] for i in [1, 4, 5])
        readings = templist[1].split('/')
        readingString = ""
        for reading in readings:
            readingString += pinyin.convert(reading, 'Pinyin', 'Pinyin',
                sourceOptions={'toneMarkType': 'numbers',
                    'missingToneMark': 'fifth'}) + " "
        templist[1] = readingString
        chars.append(templist)
    return chars
def getReadingOperator(readingName, readingOptions=None):
    """Return the module-wide cached ReadingOperator.

    NOTE(review): the cache is unkeyed — the first call wins and later
    calls get the same operator even for a different readingName.

    :param readingName: reading name, e.g. 'Pinyin'
    :param readingOptions: optional dict of operator options (treated
        as empty when omitted)
    """
    global _readingOperator
    if not _readingOperator:
        readingFactory = ReadingFactory()
        # Fix: avoid the mutable default argument {}; None stands in.
        _readingOperator = readingFactory.createReadingOperator(readingName,
            **(readingOptions or {}))
    return _readingOperator
def getReadingOperator(readingName, readingOptions=None):
    """Return the module-wide cached ReadingOperator.

    NOTE(review): the cache is unkeyed — the first call wins and later
    calls get the same operator even for a different readingName.

    :param readingName: reading name, e.g. 'Pinyin'
    :param readingOptions: optional dict of operator options (treated
        as empty when omitted)
    """
    global _readingOperator
    if not _readingOperator:
        readingFactory = ReadingFactory()
        # Fix: avoid the mutable default argument {}; None stands in.
        _readingOperator = readingFactory.createReadingOperator(readingName,
            **(readingOptions or {}))
    return _readingOperator
def runTests(tests, databases, registerUnicode, iteration=10):
    """Time dictionary search requests against several database files.

    :param tests: iterable of test numbers, used as keys into
        ``databases`` and ``registerUnicode``
    :param databases: mapping of test number to SQLite database path
    :param registerUnicode: mapping of test number to the DBConnector
        ``registerUnicode`` setting
    :param iteration: repetitions passed to ``Timer.timeit`` per method
    :return: nested dict ``{testNo: {dictName: {method: seconds}}}``
    :raises ValueError: when a database contains no known dictionary
    """
    f = ReadingFactory()
    timing = {}
    for no in tests:
        print "Running test %d (reading from %s)..." % (no, databases[no])
        connection = {'sqlalchemy.url': 'sqlite:///%s' % databases[no],
            'attach': ['cjklib'], 'registerUnicode': registerUnicode[no]}
        db = dbconnector.getDBConnector(connection)
        # Only benchmark dictionaries that cjklib supports AND that
        # actually exist as tables in this database file.
        availableDicts = [dictClass.DICTIONARY_TABLE
            for dictClass in dictionary.BaseDictionary\
                .getAvailableDictionaries(db)]
        dictionaries = list(set(availableDicts)
            & set(db.engine.table_names(schema=db._mainSchema)))
        if not dictionaries:
            raise ValueError("No dictionaries found")
        print "Found dictionaries '%s'" % "', '".join(dictionaries)
        runTime = {}
        for dictName in dictionaries:
            dictClass = dictionary.BaseDictionary.getDictionaryClass(
                dictName)
            dictInstance = dictClass(dbConnectInst=db)
            opClass = (dictClass.READING
                and f.getReadingOperatorClass(dictClass.READING))
            # Let the reading operator guess per-request dialect options
            # so searches run the way an interactive client would.
            if hasattr(opClass, 'guessReadingDialect'):
                requestList = []
                for request in SEARCH_REQUESTS:
                    options = opClass.guessReadingDialect(request)
                    requestList.append((request, options))
            else:
                requestList = [(request, {}) for request in SEARCH_REQUESTS]
            # timeit can only reference importable names, so publish the
            # fixture as a synthetic module for the timed statement.
            mod = imp.new_module('timeit_runmod')
            mod.runRequest = runRequest
            mod.dictInstance = dictInstance
            mod.requestList = requestList
            sys.modules['timeit_runmod'] = mod
            methodTime = {}
            for method in ('getFor', 'getForHeadword', 'getForReading',
                'getForTranslation'):
                t = Timer("""timeit_runmod.runRequest(
                    timeit_runmod.dictInstance,
                    timeit_runmod.requestList,
                    method='%s')
                    """ % method,
                    "import timeit_runmod")
                methodTime[method] = t.timeit(iteration)
            runTime[dictName] = methodTime
        timing[no] = runTime
    return timing
def __init__(self, fromReading, toReading, variant=None, **options):
    """Create a transliterator wrapping a cjklib reading converter.

    The ICU id is built as ``from-to`` with an optional ``/variant``
    suffix before the parent class is initialised; extra options are
    passed straight to createReadingConverter.
    """
    identifier = '%s-%s' % (fromReading, toReading)
    if variant:
        identifier = identifier + '/' + variant
    self.id = identifier
    icu.Transliterator.__init__(self, self.id)
    factory = ReadingFactory()
    self._conv = factory.createReadingConverter(fromReading, toReading,
        **options)
class ReadingConverterTest(NeedsDatabaseTest):
    """
    Base class for testing of
    :class:`~cjklib.reading.converter.ReadingConverter` classes.
    """
    CONVERSION_DIRECTION = None
    """Tuple of reading names for conversion from reading A to reading B."""

    def setUp(self):
        NeedsDatabaseTest.setUp(self)
        self.fromReading, self.toReading = self.CONVERSION_DIRECTION
        # Pick the first converter class supporting this direction; the
        # for/else falls through to None when no class matches.
        for clss in self.getReadingConverterClasses().values():
            if self.CONVERSION_DIRECTION in clss.CONVERSION_DIRECTIONS:
                self.readingConverterClass = clss
                break
        else:
            self.readingConverterClass = None
        self.f = ReadingFactory(dbConnectInst=self.db)

    def shortDescription(self):
        """Return the test method's doc string, cleaned of markup and
        annotated with the conversion direction under test."""
        methodName = getattr(self, self.id().split('.')[-1])
        # get whole doc string and remove superfluous white spaces
        noWhitespaceDoc = re.sub('\s+', ' ', methodName.__doc__.strip())
        # remove markup for epytext format
        clearName = re.sub('[CLI]\{([^\}]*)}', r'\1', noWhitespaceDoc)
        # add information about conversion direction
        return clearName + ' (for %s to %s)' % self.CONVERSION_DIRECTION

    @staticmethod
    def getReadingConverterClasses():
        """
        Gets all classes from the reading module that implement
        :class:`~cjklib.reading.converter.ReadingConverter`.

        :rtype: dictionary of string class pairs
        :return: dictionary of all classes inheriting form
            :class:`~cjklib.reading.converter.ReadingConverter`
        """
        readingConverterClasses = {}
        # get all non-abstract classes that inherit from ReadingConverter
        readingConverterClasses = dict([(clss.__name__, clss) \
            for clss in converter.__dict__.values() \
            if type(clss) in [types.TypeType, types.ClassType] \
            and issubclass(clss, converter.ReadingConverter) \
            and clss.CONVERSION_DIRECTIONS])
        return readingConverterClasses

    def tearDown(self):
        # get rid of the possibly > 1000 instances
        self.f.clearCache()
def setUp(self):
    """Prepare the converter test fixture.

    Resolves the first converter class whose supported directions
    include CONVERSION_DIRECTION (None when no class matches) and binds
    a ReadingFactory to the test database.
    """
    NeedsDatabaseTest.setUp(self)
    self.fromReading, self.toReading = self.CONVERSION_DIRECTION
    candidates = (cls for cls in self.getReadingConverterClasses().values()
        if self.CONVERSION_DIRECTION in cls.CONVERSION_DIRECTIONS)
    self.readingConverterClass = next(candidates, None)
    self.f = ReadingFactory(dbConnectInst=self.db)
def setUp(self):
    """Create the reading factory and, when PyICU is available, a pair
    of ICU transliterators for numeric Pinyin.

    PyICU is optional: on ImportError no attributes are set and the
    dependent tests degrade to no-ops.
    """
    NeedsDatabaseTest.setUp(self)
    self.f = ReadingFactory(dbConnectInst=self.db)
    try:
        import PyICU
        forward = PyICU.Transliterator.createInstance(
            "Latin-NumericPinyin", PyICU.UTransDirection.UTRANS_FORWARD)
        self.toNumeric = forward
        self.fromNumeric = forward.createInverse()
    except ImportError:
        pass
class ReadingConversion(Base):
    """Converts the entries' reading string to the given target reading."""

    def __init__(self, toReading=None, targetOptions=None):
        """
        Constructs the conversion strategy.

        :type toReading: str
        :param toReading: target reading, if omitted, the dictionary's
            reading is assumed.
        :type targetOptions: dict
        :param targetOptions: target reading conversion options
        """
        Base.__init__(self)
        self.toReading = toReading
        if targetOptions:
            self.targetOptions = targetOptions
        else:
            self.targetOptions = {}

    def setDictionaryInstance(self, dictInstance):
        """Bind the strategy to a dictionary and verify the conversion
        from its reading to the target reading is supported.

        :raises ValueError: if the dictionary lacks reading metadata or
            the conversion is unsupported.
        """
        super(ReadingConversion, self).setDictionaryInstance(dictInstance)
        if (not hasattr(self._dictInstance, 'READING')
            or not hasattr(self._dictInstance, 'READING_OPTIONS')):
            raise ValueError('Incompatible dictionary')
        self.fromReading = self._dictInstance.READING
        self.sourceOptions = self._dictInstance.READING_OPTIONS
        self._readingFactory = ReadingFactory(
            dbConnectInst=self._dictInstance.db)
        toReading = self.toReading or self.fromReading
        if not self._readingFactory.isReadingConversionSupported(
            self.fromReading, toReading):
            raise ValueError("Conversion from '%s' to '%s' not supported"
                % (self.fromReading, toReading))

    def format(self, string):
        """Convert *string* to the target reading; returns None when the
        string cannot be decomposed, composed or converted."""
        toReading = self.toReading or self.fromReading
        try:
            return self._readingFactory.convert(
                string, self.fromReading, toReading,
                sourceOptions=self.sourceOptions,
                targetOptions=self.targetOptions)
        except (exception.DecompositionError,
            exception.CompositionError,
            exception.ConversionError):
            return None
def testEveryConverterHasConsistencyTest(self):
    """
    Check if every reading converter has a consistency test case.
    """
    testClasses = self.getReadingConverterConsistencyTestClasses()
    testClassReadingNames = [clss.CONVERSION_DIRECTION for clss \
        in testClasses]
    self.f = ReadingFactory(dbConnectInst=self.db)
    for clss in self.f.getReadingConverterClasses():
        for direction in clss.CONVERSION_DIRECTIONS:
            # Fix: the failure message lacked a separating space
            # ("...to Pinyinhas no...") and named the wrong test class
            # (operator instead of converter consistency test).
            self.assert_(direction in testClassReadingNames,
                "Conversion from %s to %s " % direction \
                    + "has no ReadingConverterConsistencyTest")
def _decomposeAndRemovePinyinTones(string, type='diacritics'):
    """Split a Pinyin string into syllables with tone marks removed.

    Decomposes *string* into reading entities, converts each one to
    tone-mark-free lowercase Pinyin (with ü rewritten as v) and drops
    the separator entities (space and apostrophe).  Returns None for
    None input; byte strings are decoded as UTF-8 first.
    """
    if string is None:
        return None
    if not isinstance(string, unicode):
        string = unicode(string, 'utf-8')
    from cjklib.reading import ReadingFactory
    factory = ReadingFactory()
    result = []
    for entity in factory.decompose(string, 'Pinyin'):
        plain = factory.convert(entity, 'Pinyin', 'Pinyin',
            sourceOptions={'toneMarkType': type},
            targetOptions={'toneMarkType': 'none'})
        plain = plain.lower().replace(u'ü', u'v')
        if plain != ' ' and plain != "'":
            result.append(plain)
    return result
class ReadingConversion(Base):
    """Converts the entries' reading string to the given target reading."""

    def __init__(self, toReading=None, targetOptions=None):
        """
        Constructs the conversion strategy.

        :type toReading: str
        :param toReading: target reading, if omitted, the dictionary's
            reading is assumed.
        :type targetOptions: dict
        :param targetOptions: target reading conversion options
        """
        Base.__init__(self)
        self.toReading = toReading
        if targetOptions:
            self.targetOptions = targetOptions
        else:
            self.targetOptions = {}

    def setDictionaryInstance(self, dictInstance):
        """Bind the strategy to a dictionary and verify the conversion
        from its reading to the target reading is supported.

        :raises ValueError: if the dictionary lacks reading metadata or
            the conversion is unsupported.
        """
        super(ReadingConversion, self).setDictionaryInstance(
            dictInstance)
        if (not hasattr(self._dictInstance, 'READING')
            or not hasattr(self._dictInstance, 'READING_OPTIONS')):
            raise ValueError('Incompatible dictionary')
        self.fromReading = self._dictInstance.READING
        self.sourceOptions = self._dictInstance.READING_OPTIONS
        self._readingFactory = ReadingFactory(
            dbConnectInst=self._dictInstance.db)
        toReading = self.toReading or self.fromReading
        if not self._readingFactory.isReadingConversionSupported(
            self.fromReading, toReading):
            raise ValueError("Conversion from '%s' to '%s' not supported"
                % (self.fromReading, toReading))

    def format(self, string):
        """Convert *string* to the target reading; on failure the input
        is returned unchanged rather than None."""
        toReading = self.toReading or self.fromReading
        try:
            return self._readingFactory.convert(string,
                self.fromReading, toReading,
                sourceOptions=self.sourceOptions,
                targetOptions=self.targetOptions)
        except (exception.DecompositionError,
            exception.CompositionError,
            exception.ConversionError):
            # wighack: deliberately fall back to the unconverted string
            # so callers always get something displayable.
            return string
class ChineseLessonsComCantonesePronunciation(GlobbingPronunciationBuilder):
    """
    Builds an index on pronunciation files for Cantonese provided by
    chinese-lessions.com.
    """
    PROVIDES = "Pronunciation_CantoneseYale"
    DEPENDS = ['CantoneseYaleSyllables']
    BASE_DIRECTORY_NAME = "chineselessionscom_yue"

    # File-name suffix -> Cantonese Yale tone name.
    TONE_ABBREV = {'HT': '1stToneLevel', 'HF': '1stToneFalling',
        'MR': '2ndTone', 'MT': '3rdTone', 'LF': '4thTone',
        'LR': '5thTone', 'LT': '6thTone'}

    def __init__(self, **options):
        super(ChineseLessonsComCantonesePronunciation, self).__init__(
            **options)
        self.readingFactory = ReadingFactory()

    def getReadingFromFileName(self, fileName):
        """Map a file name like ``jyutHT.mp3`` to a tonal Cantonese Yale
        entity; returns None for unrecognised names or syllables."""
        fileRoot, _ = os.path.splitext(fileName)
        matchObj = re.match('([a-z]+)(HT|HF|MR|MT|LF|LR|LT)$', fileRoot)
        if matchObj:
            # Fix: match.groups() takes a *default* value for groups that
            # did not participate, not group indices. groups([1, 2]) only
            # worked by accident because both groups always match here.
            plainSyllable, toneMarker = matchObj.groups()
            toneNumber = self.TONE_ABBREV[toneMarker]
            try:
                return self.readingFactory.getTonalEntity(plainSyllable,
                    toneNumber, 'CantoneseYale')
            except exception.UnsupportedError:
                pass
            except exception.ConversionError:
                pass
def fix_pinyin(self, pinyin):
    """Convert accented Pinyin to numeric tones (0 for the neutral tone).

    Tries to locate cjklib first inside TTEMPÉ's chinese-support Anki
    add-on directory, then system-wide.  Returns *pinyin* unchanged when
    cjklib cannot be imported at all.
    """
    if not self.have_tried_cjklib_hack:
        try:
            # If this works, the whole shebang is run as an Anki2
            # add-on. If not, we will still look for a system-wide
            # cjklib, but obviously not for another add-on.
            from aqt.utils import isWin
        except:
            pass
        else:
            from aqt import mw
            addon_dir = mw.pm.addonFolder()
            if isWin:
                # The isWin bit is copied from TTEMPÉ's code.
                addon_dir = addon_dir.encode(sys.getfilesystemencoding())
            sys.path.append(os.path.join(addon_dir, "chinese"))
            # Fix: the flag was misspelled "have_tried_cjk_hack", so the
            # guard above never tripped and sys.path grew on every call.
            self.have_tried_cjklib_hack = True
    if not self.reading_factory:
        try:
            from cjklib.reading import ReadingFactory
        except ImportError:
            return pinyin
        else:
            self.reading_factory = ReadingFactory()
    return self.reading_factory.convert(
        pinyin, 'Pinyin', 'Pinyin', targetOptions={
            'toneMarkType': 'numbers'}).replace('5', '0')
def __init__(self, configfile): super(BKRS2DB, self).__init__() #statprof.start() self.get_config(configfile) self.comma_symbols = [u',', u'﹐', ','] self.BUFFER_SIZE = 10000 self.buffer_index = 0 self.read_fab = ReadingFactory() self.cedict = CEDICT() self.cjk = characterlookup.CharacterLookup('T') self.pinyinOp = self.read_fab.createReadingOperator('Pinyin') self.charInfo = cjknife.CharacterInfo() self.last_error = {'description':'', 'match':'', 'not_match': ''} self.bad_word_index = 0 self.additional_reading = {} self.hanzi_stat = {} self.hanzi_freq = {} self.hanzi_pron_var = {} self.errors_description = { 'pinyin_not_match':'Не совпадает', 'no_pinyin':'Нет чтения', 'pinyin_have_tag_symbol':'В пиньине теги', 'pinyin_have_bad_symbol':'В пиньине плохие символы', 'pinyin_have_rus_letter':'В пиньине русские буквы', 'pinyin_have_number_symbol':'В пиньине цифры', 'word_have_alpha_symbol':'В слове alfa символы' } self.log_file = open(self.params['log_file'], 'w', 1000) if self.params['write_to_pleco_db']: self.pleco = Pleco(self.params['output_pleco_database_file'], self) self.bad_hanzi_list = False
def setDictionaryInstance(self, dictInstance):
    """Bind the strategy to a dictionary and validate that its reading
    can be converted to the requested target reading.

    :raises ValueError: if the dictionary carries no reading metadata
        or the conversion is unsupported.
    """
    super(ReadingConversion, self).setDictionaryInstance(dictInstance)
    dictionaryInst = self._dictInstance
    hasReadingInfo = (hasattr(dictionaryInst, 'READING')
        and hasattr(dictionaryInst, 'READING_OPTIONS'))
    if not hasReadingInfo:
        raise ValueError('Incompatible dictionary')
    self.fromReading = dictionaryInst.READING
    self.sourceOptions = dictionaryInst.READING_OPTIONS
    self._readingFactory = ReadingFactory(dbConnectInst=dictionaryInst.db)
    toReading = self.toReading or self.fromReading
    supported = self._readingFactory.isReadingConversionSupported(
        self.fromReading, toReading)
    if not supported:
        raise ValueError("Conversion from '%s' to '%s' not supported"
            % (self.fromReading, toReading))
def __init__(self, fromReading, toReading, variant=None, **options):
    """Initialise the ICU transliterator for a cjklib reading conversion.

    :param fromReading: source reading name
    :param toReading: target reading name
    :param variant: optional id suffix to distinguish registered variants
    :param options: passed through to createReadingConverter
    """
    # ICU transliterator ids take the form "Source-Target[/Variant]".
    self.id = '%s-%s' % (fromReading, toReading)
    if variant:
        self.id += '/' + variant
    icu.Transliterator.__init__(self, self.id)
    self._conv = ReadingFactory().createReadingConverter(fromReading,
        toReading, **options)
def open (self, dbname):
    """Open the database.

    Creates the dictionary instance (with a unified-headword entry
    factory) and, when the dictionary declares a reading whose operator
    can guess dialects, remembers that operator class for later use.

    :param dbname: dictionary/database name
    :return: False when the dictionary cannot be created, True otherwise
    """
    self.dbname = dbname
    # _dictionaryName may already be set by a subclass; default to dbname.
    if not hasattr(self, '_dictionaryName'):
        self._dictionaryName = dbname
    try:
        self._dictInst = getDictionary(self._dictionaryName,
            entryFactory=entry.UnifiedHeadword())
    except ValueError as e:
        if debug:
            print(e, file=sys.stderr)
        return False
    if self._dictInst.READING:
        f = ReadingFactory()
        opClass = f.getReadingOperatorClass(self._dictInst.READING)
        if hasattr(opClass, 'guessReadingDialect'):
            self._opClass = opClass
    return True
class PinyinICUTest(NeedsDatabaseTest, unittest.TestCase):
    """Test Pinyin tonemark conversion on ICU transformation rule."""
    CONVERSION_DIRECTION = ('Pinyin', 'Pinyin')

    def setUp(self):
        NeedsDatabaseTest.setUp(self)
        self.f = ReadingFactory(dbConnectInst=self.db)
        try:
            import PyICU
            self.toNumeric = PyICU.Transliterator.createInstance(
                "Latin-NumericPinyin", PyICU.UTransDirection.UTRANS_FORWARD)
            self.fromNumeric = self.toNumeric.createInverse()
        except ImportError:
            # PyICU is optional; the test below degrades to a no-op.
            pass

    def testToneMarkPlacement(self):
        """Test Pinyin tonemark conversion on ICU transformation rule."""
        # Skip silently when PyICU could not be imported in setUp.
        if not hasattr(self, 'toNumeric'):
            return
        # Diacritic -> numeric direction. Entities the ICU rule does not
        # handle (syllabic nasals hng/ng with tone marks) are excluded.
        for readingEntity in self.f.getReadingEntities('Pinyin'):
            if readingEntity in (u'hn\u0304g', u'h\u0144g', u'h\u0148g',
                u'h\u01f9g', u'n\u0304g', u'\u0144g', u'\u0148g',
                u'\u01f9g'):
                continue
            targetEntity = self.f.convert(readingEntity, 'Pinyin',
                'Pinyin', targetOptions={'toneMarkType': 'numbers',
                    'missingToneMark': 'fifth'})
            self.assertEquals(targetEntity,
                self.toNumeric.transliterate(readingEntity))
        # Numeric -> diacritic direction with analogous exclusions.
        for readingEntity in self.f.getReadingEntities('Pinyin',
            toneMarkType='numbers', missingToneMark='fifth'):
            if readingEntity in ('hng1', 'hng2', 'hng3', 'hng4', 'ng1',
                'ng2', 'ng3', 'ng4', u'ê1', u'ê2', u'ê3', u'ê4'):
                continue
            targetEntity = self.f.convert(readingEntity, 'Pinyin',
                'Pinyin', sourceOptions={'toneMarkType': 'numbers',
                    'missingToneMark': 'fifth'})
            self.assertEquals(targetEntity,
                self.fromNumeric.transliterate(readingEntity))
def getChars(freqFile, startNo, endNo):
    """Read rows [startNo:endNo) of a tab-separated frequency file.

    Keeps columns 1, 4 and 5 of each row; the reading column (index 4 in
    the file, position 1 in the result) is split on '/' and each reading
    is converted to numeric-tone Pinyin, joined with trailing spaces.

    :param freqFile: path of the UTF-8 tab-separated frequency list
    :param startNo: first row to process (inclusive)
    :param endNo: last row to process (exclusive)
    :return: list of [character, readings string, extra] lists
    """
    chars = []
    reader = unicode_csv_reader(codecs.open(freqFile, 'rb', "utf-8"),
        dialect='excel-tab')
    frequencyList = [x for x in reader] #read the whole list
    frequencyList = frequencyList[startNo:endNo]
    # Fix: create the factory once instead of once per row; it was
    # rebuilt inside the loop, which is expensive and never necessary.
    pinyin = ReadingFactory()
    for row in frequencyList:
        templist = list(row[i] for i in [1, 4, 5])
        readings = templist[1].split('/')
        readingString = ""
        for reading in readings:
            readingString += pinyin.convert(reading, 'Pinyin', 'Pinyin',
                sourceOptions={'toneMarkType': 'numbers',
                    'missingToneMark': 'fifth'}) + " "
        templist[1] = readingString
        chars.append(templist)
    return chars
def fix_pinyin(self, pinyin):
    """Convert accented Pinyin to numeric tones (0 for the neutral tone).

    Tries to locate cjklib first inside TTEMPÉ's chinese-support Anki
    add-on directory, then system-wide.  Returns *pinyin* unchanged when
    cjklib cannot be imported at all.
    """
    if not self.have_tried_cjklib_hack:
        try:
            # If this works, the whole shebang is run as an Anki2
            # add-on. If not, we will still look for a system-wide
            # cjklib, but obviously not for another add-on.
            from aqt.utils import isWin
        except:
            pass
        else:
            from aqt import mw
            addon_dir = mw.pm.addonFolder()
            if isWin:
                # The isWin bit is copied from TTEMPÉ's code.
                addon_dir = addon_dir.encode(sys.getfilesystemencoding())
            sys.path.append(os.path.join(addon_dir, "chinese"))
            # Fix: the flag was misspelled "have_tried_cjk_hack", so the
            # guard above never tripped and sys.path grew on every call.
            self.have_tried_cjklib_hack = True
    if not self.reading_factory:
        try:
            from cjklib.reading import ReadingFactory
        except ImportError:
            return pinyin
        else:
            self.reading_factory = ReadingFactory()
    return self.reading_factory.convert(pinyin, 'Pinyin', 'Pinyin',
        targetOptions={
            'toneMarkType': 'numbers'
        }).replace('5', '0')
class ReadingTransliterator(icu.Transliterator):
    """ICU Transliterator backed by a cjklib ReadingConverter.

    Instances register under the id ``from-to[/variant]`` so the
    conversion is usable from ICU's transliteration framework.
    """

    def __init__(self, fromReading, toReading, variant=None, **options):
        # ICU transliterator ids take the form "Source-Target[/Variant]".
        self.id = '%s-%s' % (fromReading, toReading)
        if variant:
            self.id += '/' + variant
        icu.Transliterator.__init__(self, self.id)
        self._conv = ReadingFactory().createReadingConverter(
            fromReading, toReading, **options)

    def handleTransliterate(self, text, position, complete):
        """Convert the pending [start, limit) slice of *text* in place.

        The converted text may differ in length from the input, so
        limit and contextLimit are shifted by the difference and start
        is advanced to limit (leaving nothing pending).
        """
        substring = str(text[position.start:position.limit])
        converted = self._conv.convert(substring)
        text[position.start:position.limit] = converted
        lenDiff = len(substring) - len(converted)
        position.limit -= lenDiff
        position.contextLimit -= lenDiff
        position.start = position.limit

    @staticmethod
    def register(fromReading, toReading, variant=None,
            registerInverse=False, **options):
        """Create and register a transliterator instance with ICU.

        With registerInverse set, a second instance with source and
        target readings (and their option dicts) swapped is registered
        as well.  Returns the id of the forward transliterator.
        """
        trans = ReadingTransliterator(fromReading, toReading,
            variant=variant, **options)
        icu.Transliterator.registerInstance(trans)
        if registerInverse:
            # Swap source/target option dicts for the inverse direction.
            inverseOptions = options.copy()
            inverseOptions['targetOptions'] = options.get('sourceOptions', {})
            inverseOptions['sourceOptions'] = options.get('targetOptions', {})
            invTrans = ReadingTransliterator(toReading, fromReading,
                variant=variant, **inverseOptions)
            icu.Transliterator.registerInstance(invTrans)
        return trans.id
def setDictionaryInstance(self, dictInstance):
    """Bind the strategy to a dictionary and validate that its reading
    can be converted to the requested target reading.

    :raises ValueError: if the dictionary carries no reading metadata
        or the conversion is unsupported.
    """
    super(ReadingConversion, self).setDictionaryInstance(dictInstance)
    dictionaryInst = self._dictInstance
    hasReadingInfo = (hasattr(dictionaryInst, 'READING')
        and hasattr(dictionaryInst, 'READING_OPTIONS'))
    if not hasReadingInfo:
        raise ValueError('Incompatible dictionary')
    self.fromReading = dictionaryInst.READING
    self.sourceOptions = dictionaryInst.READING_OPTIONS
    self._readingFactory = ReadingFactory(dbConnectInst=dictionaryInst.db)
    toReading = self.toReading or self.fromReading
    supported = self._readingFactory.isReadingConversionSupported(
        self.fromReading, toReading)
    if not supported:
        raise ValueError("Conversion from '%s' to '%s' not supported"
            % (self.fromReading, toReading))
class ReadingConverterTestCaseCheck(NeedsDatabaseTest, unittest.TestCase):
    """
    Checks if every :class:`~cjklib.reading.converter.ReadingConverter`
    has its own
    :class:`~cjklib.test.readingconverter.ReadingConverterConsistencyTest`.
    """
    def testEveryConverterHasConsistencyTest(self):
        """
        Check if every reading has a test case.
        """
        testClasses = self.getReadingConverterConsistencyTestClasses()
        testClassReadingNames = [clss.CONVERSION_DIRECTION for clss \
            in testClasses]
        self.f = ReadingFactory(dbConnectInst=self.db)
        for clss in self.f.getReadingConverterClasses():
            for direction in clss.CONVERSION_DIRECTIONS:
                # Fix: the failure message lacked a separating space
                # ("...to Pinyinhas no...") and named the wrong test
                # class (operator instead of converter consistency test).
                self.assert_(direction in testClassReadingNames,
                    "Conversion from %s to %s " % direction \
                        + "has no ReadingConverterConsistencyTest")

    @staticmethod
    def getReadingConverterConsistencyTestClasses():
        """
        Gets all classes implementing
        :class:`cjklib.test.readingconverter.ReadingConverterConsistencyTest`.

        :rtype: list
        :return: list of all classes inheriting form
            :class:`cjklib.test.readingconverter.ReadingConverterConsistencyTest`
        """
        # get all non-abstract classes that inherit from
        # ReadingConverterConsistencyTest
        testModule = __import__("cjklib.test.readingconverter")
        testClasses = [clss for clss \
            in testModule.test.readingconverter.__dict__.values() \
            if type(clss) in [types.TypeType, types.ClassType] \
            and issubclass(clss, ReadingConverterConsistencyTest) \
            and clss.CONVERSION_DIRECTION]
        return testClasses
class ReadingTransliterator(icu.Transliterator):
    """ICU Transliterator backed by a cjklib ReadingConverter.

    Instances register under the id ``from-to[/variant]`` so the
    conversion is usable from ICU's transliteration framework.
    """

    def __init__(self, fromReading, toReading, variant=None, **options):
        # ICU transliterator ids take the form "Source-Target[/Variant]".
        self.id = '%s-%s' % (fromReading, toReading)
        if variant:
            self.id += '/' + variant
        icu.Transliterator.__init__(self, self.id)
        self._conv = ReadingFactory().createReadingConverter(fromReading,
            toReading, **options)

    def handleTransliterate(self, text, position, complete):
        """Convert the pending [start, limit) slice of *text* in place.

        The converted text may differ in length from the input, so
        limit and contextLimit are shifted by the difference and start
        is advanced to limit (leaving nothing pending).
        """
        substring = unicode(text[position.start:position.limit])
        converted = self._conv.convert(substring)
        text[position.start:position.limit] = converted
        lenDiff = len(substring) - len(converted)
        position.limit -= lenDiff
        position.contextLimit -= lenDiff
        position.start = position.limit

    @staticmethod
    def register(fromReading, toReading, variant=None,
            registerInverse=False, **options):
        """Create and register a transliterator instance with ICU.

        With registerInverse set, a second instance with source and
        target readings (and their option dicts) swapped is registered
        as well.  Returns the id of the forward transliterator.
        """
        trans = ReadingTransliterator(fromReading, toReading,
            variant=variant, **options)
        icu.Transliterator.registerInstance(trans)
        if registerInverse:
            # Swap source/target option dicts for the inverse direction.
            inverseOptions = options.copy()
            inverseOptions['targetOptions'] = options.get('sourceOptions', {})
            inverseOptions['sourceOptions'] = options.get('targetOptions', {})
            invTrans = ReadingTransliterator(toReading, fromReading,
                variant=variant, **inverseOptions)
            icu.Transliterator.registerInstance(invTrans)
        return trans.id
class ChineseLessonsComMandarinPronunciation(GlobbingPronunciationBuilder):
    """
    Builds an index on pronunciation files for Mandarin provided by
    chinese-lessions.com.
    """
    PROVIDES = "Pronunciation_Pinyin"
    DEPENDS = ['PinyinSyllables']
    BASE_DIRECTORY_NAME = "chineselessonscom_cmn"

    def __init__(self, **options):
        super(ChineseLessonsComMandarinPronunciation, self).__init__(
            **options)
        self.readingFactory = ReadingFactory()

    def getReadingFromFileName(self, fileName):
        """Derive the Pinyin reading from a numeric-tone file name;
        returns None when the name is not a convertible syllable."""
        baseName, _ = os.path.splitext(fileName)
        try:
            return self.readingFactory.convert(baseName, 'Pinyin',
                'Pinyin', sourceOptions={'toneMarkType': 'numbers'})
        except (exception.UnsupportedError, exception.ConversionError):
            return None
class LeoDownloader(AudioDownloader):
    """Download audio from LEO"""
    def __init__(self):
        AudioDownloader.__init__(self)
        self.file_extension = u'.mp3'
        self.url = 'http://www.leo.org/dict/audio_{language}/{word}.mp3'
        # And, yes, they use ch for Chinese.
        # (I'm not sure if they really have anything for ru or it.)
        self.language_dict = {'de': 'de', 'en': 'en', 'es': 'es',
            'fr': 'fr', 'it': 'it', 'ru': 'ru', 'zh': 'ch'}
        # It kind of looks like they have Swiss pronunciations, but
        # they don't.
        self.chinese_code = 'ch'
        # We should keep a number of site icons handy, with the right
        # flag for the request.
        self.site_icon_dict = {}
        self.site_file_name_encoding = 'ISO-8859-1'
        self.icon_url_dict = {
            'de': 'http://dict.leo.org/favicon.ico',
            'en': 'http://dict.leo.org/favicon.ico',
            'es': 'http://dict.leo.org/favicon_es.ico',
            'fr': 'http://dict.leo.org/favicon_fr.ico',
            'it': 'http://dict.leo.org/favicon_it.ico',
            'ru': 'http://dict.leo.org/favicon_ru.ico',
            # When we use this dict, we have already munged the 'zh' to 'ch'
            'ch': 'http://dict.leo.org/favicon_ch.ico'}
        # As the name implies, a hack. Try to use the cjklib TTEMPÉ
        # brings along. A system-wide installed one should work as well.
        self.have_tried_cjklib_hack = False
        self.reading_factory = None

    def download_files(self, word, base, ruby, split):
        """
        Download a word from LEO

        We try to get pronunciations for the text for German, English,
        Spanish, French, Italian and Russian, and from the ruby for
        Chinese. There may not be any pronunciations available for
        Italian or Russian.
        """
        self.downloads_list = []
        # Fix the language. EAFP.
        self.language = self.language_dict[self.language[:2].lower()]
        # set_names also checks the language.
        self.set_names(word, base, ruby)
        # Only get the icon when we have a word
        # self.maybe_get_icon()
        self.get_flag_icon()
        # EAFP. self.query_url may return None...
        word_url = self.query_url(word, ruby)
        # ... then the get_data will blow up
        word_data = self.get_data_from_url(word_url)
        word_file_path, word_file_name = self.get_file_name()
        with open(word_file_path, 'wb') as word_file:
            word_file.write(word_data)
        # We have a file, but not much to say about it.
        self.downloads_list.append(
            (word_file_path, word_file_name, dict(Source='Leo')))

    def query_url(self, word, ruby):
        """Build query URL"""
        if self.chinese_code == self.language:
            word = self.fix_pinyin(ruby)
        return self.url.format(
            language=self.language, word=urllib.quote(word.encode(
                self.site_file_name_encoding)))

    def fix_pinyin(self, pinyin):
        """Convert accented Pinyin to numeric tones (0 for the neutral
        tone); returns *pinyin* unchanged when cjklib is unavailable."""
        if not self.have_tried_cjklib_hack:
            try:
                # If this works, the whole shebang is run as an Anki2
                # add-on. If not, we will still look for a system-wide
                # cjklib, but obviously not for another add-on.
                from aqt.utils import isWin
            except:
                pass
            else:
                from aqt import mw
                addon_dir = mw.pm.addonFolder()
                if isWin:
                    # The isWin bit is copied from TTEMPÉ's code.
                    addon_dir = addon_dir.encode(
                        sys.getfilesystemencoding())
                sys.path.append(os.path.join(addon_dir, "chinese"))
                # Fix: the flag was misspelled "have_tried_cjk_hack",
                # so the path hack re-ran and grew sys.path every call.
                self.have_tried_cjklib_hack = True
        if not self.reading_factory:
            try:
                from cjklib.reading import ReadingFactory
            except ImportError:
                return pinyin
            else:
                self.reading_factory = ReadingFactory()
        return self.reading_factory.convert(
            pinyin, 'Pinyin', 'Pinyin', targetOptions={
                'toneMarkType': 'numbers'}).replace('5', '0')

    def get_flag_icon(self):
        """
        Set self.site_icon to the right icon.

        We should use different icons, depending on the request
        language. We store these icons in self.site_icon_dict and use
        the AudioDownloader.maybe_get_icon() if we don't have it yet.
        """
        if not with_pyqt:
            return
        try:
            # If this works we already have it.
            self.site_icon = self.site_icon_dict[self.language]
        except KeyError:
            # We have to get it ourself. (We know it's just 16x16, so
            # no resize. And we know the address).
            self.site_icon_dict[self.language] = \
                QImage.fromData(self.get_data_from_url(
                    self.icon_url_dict[self.language]))
            self.site_icon = self.site_icon_dict[self.language]

    def set_names(self, text, base, ruby):
        """
        Set the display text and file base name variables.
        """
        if self.language == self.chinese_code:
            if not ruby:
                raise ValueError('Nothing to download')
            self.base_name = u"{0}_{1}".format(base, ruby)
            self.display_text = u"{1} ({0})".format(base, ruby)
        else:
            if not text:
                raise ValueError('Nothing to download')
            self.base_name = text
            self.display_text = text
def handle_noargs(self, **options):
    """Load the CEDICT dictionary file into Redis English-lookup keys.

    Each dictionary line looks like
    ``一中一台 [yi1 Zhong1 yi1 Tai2] /first meaning/second meaning/``.
    All existing ``EN:*`` keys are wiped first; then every cleaned,
    short-enough meaning is indexed under ``EN:<wordcount>W:<meaning>``
    mapping to the characters that carry it (JSON payload).
    """
    # EXAMPLE: 一中一台 [yi1 Zhong1 yi1 Tai2] /first meaning/second meaning/
    file = open(settings.DICT_FILE_LOCATION)
    r_server = _get_redis()
    # EMPTY ALL EN KEYS FROM THE DATABASE
    item_count = 0
    keys = r_server.keys('EN:*')
    for x in keys:
        r_server.delete(x)
        item_count += 1
    print "Deleted %s items" % item_count
    # NOW LETS START
    item_count = 0
    for line in file:
        if not line.startswith("#"):
            # GATHER ALL THE MAIN VARIABLES
            new = line.split()
            characters = new[1]
            numbered_pinyin = line[(line.index('[')+1):(line.index(']'))]
            f = ReadingFactory()
            tonal_pinyin = f.convert(numbered_pinyin, 'Pinyin', 'Pinyin',
                sourceOptions={'toneMarkType': 'numbers', 'yVowel': 'v',
                    'missingToneMark': 'fifth'})
            meanings = line[(line.index('/')+1):(line.rindex('/'))]
            # CREATE AN INDEX: What we'll do first is try to strip out
            # as much crap as possible from each definition, and as close as
            # possible find a single word that we can index on.
            for x in meanings.split('/'):
                ns = x # new_string
                # REMOVE ANYTHING BETWEEN BRACKETS
                try:
                    ns = ns.replace(
                        ns[(ns.index('(')+1):(ns.index(')'))], '')
                    ns = ns.replace('(', '').replace(')', '') #replace the brackets too
                except ValueError:
                    pass
                # REMOVE ANYTHING BETWEEN SQUARE BRACKETS
                try:
                    ns = ns.replace(
                        ns[(ns.index('[')+1):(ns.index(']'))], '')
                    ns = ns.replace('[', '').replace(']', '') #replace the brackets too
                except ValueError:
                    pass
                # IGNORE THE MEANING IF IT CONTAINS AN EXCLUDED PHRASE
                if len(filter(lambda y: y not in ns, EXCLUSIONS)) != len(EXCLUSIONS):
                    continue
                # IF THE MEANING IS NOW EMPTY, IGNORE IT
                ns = ns.strip()
                if ns == '':
                    continue
                # DEAL WITH INFINITIVE VERBS LIKE "TO DO" WITH 2 WORDS
                if len(ns.split(' ')) <= 3 and ns.startswith('to '):
                    ns = ns.split(' ', 1)[1]
                # REMOVE ITEMS LIKE "SEE XYZ"
                if ns.split(' ')[0] == 'see' and ns[-1] not in string.ascii_letters:
                    continue
                # THERE'S ALSO SOME ANNOYING "..." MARKS TOO
                if "..." in ns:
                    ns = ns.replace('...', '')
                # FOR NOW, JUST ADD ITEMS WITH 2 WORDs
                if len(ns.split(' ')) <= 3:
                    key = "EN:%sW:%s" % (len(ns.split(' ')), ns.lower())
                    print key
                    if r_server.exists(key):
                        values = json.loads(_search_redis(key))
                        values['characters'].append(characters)
                        r_server.set(key, json.dumps(values))
                    else:
                        values = {
                            'english': x,
                            'characters': [characters,],
                        }
                        r_server.set(key, json.dumps(values))
                    item_count += 1
                    print item_count
                    #if item_count > 20:
                    #    break
    print "%s English dictionary items added" % item_count
    file.close()
class NTrain(Tk.Tk):
    """Tkinter flashcard trainer for Chinese vocabulary.

    Loads a word list from an Excel file (columns used here: ``C`` Chinese
    word, ``E``/``E_long`` English glosses, ``Learned`` progress flag),
    quizzes the user Chinese->English or English->Chinese, and writes the
    updated ``Learned`` flags back to the same file.
    """

    def __init__(self, *args, **kwargs):
        """Build the settings screen (file picker, card count, direction)."""
        Tk.Tk.__init__(self, *args, **kwargs)
        self.title("Ntrain")
        # place window in the center
        self.eval('tk::PlaceWindow %s center' % self.winfo_pathname(self.winfo_id()))
        self._default_font = tkFont.nametofont("TkDefaultFont")
        self._default_font.configure(size=30)
        # define default dataset
        self._defaultfile = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'chinese100.xlsx')
        # load default filename into label
        basename = os.path.basename(self._defaultfile)
        self._filename_value = Tk.StringVar()
        self._sett_fn_label = Tk.Entry(textvariable=self._filename_value,
                                       font=self._default_font, width=12)
        self._filename_value.set(basename)
        self._sett_fn_label.grid(row=1, column=0, sticky=Tk.W)
        # button to browse for datafile
        self.browse = Tk.Button(self, text="Browse", command=self._get_file)
        self.browse.grid(row=1, column=1, sticky=Tk.W)
        # OK button to start game
        self._reset_button = Tk.Button(text="Reset", command=self._reset_list)
        self._reset_button.grid(row=1, column=2)
        # label
        self._sett_label = Tk.Label(text="Number of Cards:")
        self._sett_label.grid(row=2, column=0, sticky=Tk.E)
        # entry field for number of cards
        entryText = Tk.StringVar()
        self._sett_entry = Tk.Entry(textvariable=entryText,
                                    font=self._default_font, width=3)
        entryText.set("30")
        self._sett_entry.grid(row=2, column=1, sticky=Tk.W)
        self._sett_entry.focus_set()
        # reverse option
        self._radio_val = Tk.IntVar()
        self._radio1 = Tk.Radiobutton(text="Ch to E", variable=self._radio_val, value=1)
        self._radio1.grid(row=4, column=0)
        self._radio2 = Tk.Radiobutton(text="E to Ch", variable=self._radio_val, value=2)
        self._radio2.grid(row=4, column=1)
        self._radio_val.set(1)
        # OK button to start game
        self._sett_button = Tk.Button(text="OK", command=self._start_game)
        self._sett_button.grid(columnspan=3)
        # Bind return key to start game
        self.bind('<Return>', self._start_game)
        # pinyin helpers: Pinyin() for char->pinyin, ReadingFactory for
        # numeric-tone -> tone-mark conversion of typed answers
        self._p = Pinyin()
        self._f = ReadingFactory()

    def _get_file(self):
        """Let the user pick a data file and show its basename."""
        # open dialogue to chose datafile
        my_file = askopenfilename()
        # update label to show filename in gui
        self._filename_value.set(os.path.basename(my_file))

    def _start_game(self, *args):
        """Tear down the settings screen, load the data and start quizzing."""
        # get filname
        self._datafile = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), self._filename_value.get())
        # get number of cards
        self._n_cards = int(self._sett_entry.get())
        # remove previous gui components
        self._sett_fn_label.destroy()
        self.browse.destroy()
        self._reset_button.destroy()
        self._sett_label.destroy()
        self._sett_entry.destroy()
        self._sett_button.destroy()
        self._radio1.destroy()
        self._radio2.destroy()
        self._save_reminder = 0
        # load in data file
        # NOTE(review): bare except treats any read failure as "file not found"
        try:
            self._vocTot = pd.read_excel(self._datafile)
        except:
            tkMessageBox.showinfo("Error", "File not found!", icon='warning')
            self._restart()
        #pdb.set_trace()
        # get indices of all filled cards
        filled_idx = self._vocTot[self._vocTot['Learned'] == 0].index.tolist()
        # NOTE(review): drops the first not-yet-learned row; reason unclear
        # from here -- possibly a header/dummy row. Confirm.
        del filled_idx[0]
        # shuffle indices
        self._renew_index(filled_idx)
        # setup new gui
        self._setup_game_gui()
        # start with first question
        self._show_next_question()

    def _setup_game_gui(self):
        """Create the question/answer widgets for a running session."""
        # labels for chinese symbols
        self.C_labels = []
        # labels for questions
        self.Q_labels = []
        # label for correct solution
        self._sol_label_value = Tk.StringVar()
        self._sol_label = Tk.Label(textvariable=self._sol_label_value)
        self._sol_label.grid(row=3, column=2)
        # entry field for answer
        self._entry_value = Tk.StringVar()
        self._entry = Tk.Entry(textvariable=self._entry_value, font=self._default_font)
        self._entry.grid(row=4, column=2)
        self._entry.focus_set()
        self.bind('<Return>', self._check_answer)
        # Check button
        self._check_button = Tk.Button(text="Check", command=self._check_answer)
        self._check_button.grid(row=1, column=1, sticky=Tk.W)
        # Save button
        self._save_button = Tk.Button(text="Save", command=self._save)
        self._save_button.grid(row=2, column=1, sticky=Tk.W)
        # Next button
        self._next_button = Tk.Button(text="Next", command=self._show_next_question)
        self._next_button.grid(row=3, column=1, sticky=Tk.W)
        # New button
        self._new_button = Tk.Button(text="New", command=self._restart)
        self._new_button.grid(row=4, column=1, sticky=Tk.W)
        # translate field
        self._tr_value = Tk.StringVar()
        self._tr = Tk.Entry(textvariable=self._tr_value, font=self._default_font)
        self._tr.grid(row=5, column=2)
        self._tr_button = Tk.Button(text="E-C", command=self._translate)
        self._tr_button.grid(row=5, column=1, sticky=Tk.W)
        # initialize list of wrong cards
        self._wrong_indices = []
        # initialize current index
        self._no = 0

    def _renew_index(self, indices):
        """Shuffle *indices* in place and keep the first n_cards of them."""
        # TODO: catch too many cards chosen as input
        shuffle(indices)
        # take the first n cards
        self._indices = indices[0:self._n_cards]

    def _show_next_question(self):
        """Display the next card, one colored label per Chinese character."""
        try:
            # get the next index in the list
            self._no = self._indices.pop(0)
            # empty entry field
            self._entry_value.set("")
            # empty Q and C labels
            for i in self.C_labels:
                i.destroy()
            for i in self.Q_labels:
                i.destroy()
            i = 1
            self.C_labels = []
            self.Q_labels = []
            # loop over Chinese characters
            for char in self._vocTot.C[self._no]:
                my_pinyin = self._p.get_pinyin(char, ' ')
                self.C_labels.append(Tk.Label(text=char))
                self.C_labels[-1].grid(row=2, column=i + 1)
                # color-code each character by its tone number (1..4, else grey)
                to_tone = (to_tone_number(my_pinyin))
                if "1" in to_tone:
                    self.C_labels[-1].config(fg='red')
                elif "2" in to_tone:
                    self.C_labels[-1].config(fg='green')
                elif "3" in to_tone:
                    self.C_labels[-1].config(fg='blue')
                elif "4" in to_tone:
                    self.C_labels[-1].config(fg='purple')
                else:
                    self.C_labels[-1].config(fg='grey')
                if self._radio_val.get() == 1:
                    # Ch->E mode: show pinyin above each character
                    self.Q_labels.append(Tk.Label(text=my_pinyin))
                    self.Q_labels[-1].grid(row=1, column=i + 1)
                i += 1
            if self._radio_val.get() == 1:
                # Ch->E: expected answer is the English gloss
                self._curr_ans = self._vocTot.E[self._no].encode('utf-8')
            elif self._radio_val.get() == 2:
                # E->Ch: show the (long, if available) English prompt
                try:
                    my_english = self._vocTot.E_long[self._no].encode('utf-8')
                except:
                    my_english = self._vocTot.E[self._no].encode('utf-8')
                self.Q_labels.append(Tk.Label(text=my_english))
                self.Q_labels[-1].grid(row=1, column=2, columnspan=i - 1)
                self._curr_ans = self._p.get_pinyin(self._vocTot.C[self._no], ' ')
            self._entry.grid(row=4, column=2, columnspan=i - 1)
            # set real_correct to default value of yes
            self._real_correct = 1
        except IndexError:
            # start new round, when no card in list left
            self._new_round()

    def _check_answer(self, *args):
        """Compare the typed answer with the expected one and react."""
        # derive input
        answer = self._entry_value.get().strip().lower()
        # convert numbers, if provided, to pinjin tone mark
        tone = 0
        if any(char.isdigit() for char in answer):
            tone = 1
            answer = self._f.convert(answer, 'Pinyin', 'Pinyin',
                sourceOptions={'toneMarkType': 'numbers'}).encode('utf-8')
        # derive expected answer
        # ask for English word
        if self._radio_val.get() == 1:
            answer_to_check = self._curr_ans.encode('utf-8').lower()
        # ask for Chinese word
        elif self._radio_val.get() == 2:
            # pinyin with tone marks
            # NOTE(review): both branches below compute the identical
            # expression; the "without tone marks" case looks unimplemented
            if tone == 1:
                answer_to_check = self._p.get_pinyin(
                    self._vocTot.C[self._no], ' ').encode('utf-8').lower()
            # pinyin without tone marks
            else:
                answer_to_check = self._p.get_pinyin(
                    self._vocTot.C[self._no], ' ').encode('utf-8').lower()
        # check if answer is correct
        if answer == answer_to_check:
            # if correct: move to 'learned' columns
            # move columns if card was correct on first attempt
            if self._real_correct:
                self._vocTot.Learned[self._no] = 1
                self._save_reminder = 1
            self._sol_label_value.set("")
            # if correct, go on to next card
            self._show_next_question()
        else:
            # if wrong:
            self._real_correct = 0
            # store index in list of wrong cards
            self._wrong_indices.append(self._no)
            # display correct answer
            self._sol_label_value.set(self._curr_ans)
            self._sol_label.grid(row=3, column=2, columnspan=len(self.C_labels))
            # clear entry field
            self._entry_value.set("")

    def _new_round(self):
        """Requeue the wrongly answered cards, or finish if none are left."""
        # if wrong cards still left, start new round
        if self._wrong_indices:
            # empty all display fields
            self._sol_label_value.set("New round!")
            self._renew_index(self._wrong_indices)
            # clear list of wrong indices
            self._wrong_indices = []
            # start new round with the next question
            self._show_next_question()
        else:
            # if no wrong cards left, finish the session
            self._exit()

    def _reset_list(self):
        """Clear the ``Learned`` flags of the selected file on disk."""
        self._datafile = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), self._filename_value.get())
        my_file = pd.read_excel(self._datafile)
        my_file['Learned'] = 0
        writer = ExcelWriter(self._datafile)
        #pdb.set_trace()
        my_file.to_excel(writer, 'Sheet1', index=False)
        writer.save()

    def _exit(self):
        """Save progress and disable the session controls."""
        try:
            self._save()
        except:
            self._sol_label_value.set("Didn't work?!")
            return
        self._sol_label_value.set("")
        self._entry_value.set("")
        self._sol_label_value.set("Done!")
        self._check_button['state'] = 'disabled'
        self._save_button['state'] = 'disabled'
        self.unbind('<Return>')
        self._next_button['state'] = 'disabled'

    def _save(self, *args):
        """Write the vocabulary (with updated Learned flags) back to disk."""
        writer = ExcelWriter(self._datafile)
        self._vocTot.to_excel(writer, 'Sheet1', index=False)
        writer.save()
        self._sol_label_value.set("Saved!")
        self._sol_label.grid(row=3, column=2, columnspan=len(self.C_labels))
        self._save_reminder = 0

    def _restart(self):
        """Offer to save unsaved progress, then re-exec the whole program."""
        if self._save_reminder:
            result = tkMessageBox.askquestion("Warning", "Save before exiting?")
            if result == 'yes':
                try:
                    self._save()
                except:
                    self._sol_label_value.set("Didn't work?!")
                    return
        # replace the current process with a fresh interpreter invocation
        python = sys.executable
        os.execl(python, python, *sys.argv)

    def _translate(self):
        """Open Google Translate in a browser for the text in the field."""
        to_translate = self._tr_value.get()
        if isinstance(to_translate, unicode):
            # Todo: doesn't work
            url = 'https://translate.google.com/#zh-CN/en/' + to_translate
        else:
            to_translate = to_translate.replace(' ', '%20')
            url = 'https://translate.google.com/#en/zh-CN/' + to_translate
        webbrowser.open(url)
from cjklib.reading import ReadingFactory

# shared factory used by all conversion lambdas below
f = ReadingFactory()

# NOTE(review): bare list expression with no assignment -- has no effect at
# runtime; looks like a reference list of reading names. Confirm intent.
[
    'GR', 'Pinyin', 'WadeGiles', 'MandarinBraille', 'MandarinIPA',
    'ShanghaineseIPA',
    #'Hangul',
    #'Kana',
    'Hiragana', 'Katakana',
    'CantoneseYale', 'CantoneseIPA', 'Jyutping'
]

# Mapping of (source tag, target tag) -> conversion callable.
# NOTE(review): this dict literal is truncated in this excerpt (no closing
# brace visible); further entries presumably follow.
DConv = {
    # Mandarin conversions
    ('cmn_Latn|Gwoyeu Romatzyh', 'cmn_Latn|x-Pinyin'):
        lambda s: f.convert(s, 'GR', 'Pinyin'),
    ('cmn_Latn|Gwoyeu Romatzyh', 'cmn_Latn|Wade-Giles'):
        lambda s: f.convert(s, 'GR', 'WadeGiles'),
    ('cmn_Latn|Gwoyeu Romatzyh', 'cmn_Latn|Braille'):
        lambda s: f.convert(s, 'GR', 'MandarinBraille'),
    ('cmn_Latn|Gwoyeu Romatzyh', 'cmn_Latn|Alternative IPA'):
        lambda s: f.convert(s, 'GR', 'MandarinIPA'),
    ('cmn_Latn|Numeric Pinyin', 'cmn_Latn|x-Pinyin'):
        lambda s: f.convert(s, 'Pinyin', 'Pinyin',
                            sourceOptions={'toneMarkType': 'numbers'}),
    ('cmn_Latn|Numeric Pinyin', 'cmn_Latn|Gwoyeu Romatzyh'):
        lambda s: f.convert(s, 'Pinyin', 'GR',
                            sourceOptions={'toneMarkType': 'numbers'}),
    ('cmn_Latn|Numeric Pinyin', 'cmn_Latn|Wade-Giles'):
        lambda s: f.convert(s, 'Pinyin', 'WadeGiles',
                            sourceOptions={'toneMarkType': 'numbers'}),
def __init__(self, *args, **kwargs):
    """Build the settings screen: file picker, card count, quiz direction.

    Initializes the Tk root window, centers it, sets a large default font,
    and creates the widgets needed before a session starts.
    """
    Tk.Tk.__init__(self, *args, **kwargs)
    self.title("Ntrain")
    # place window in the center
    self.eval('tk::PlaceWindow %s center' % self.winfo_pathname(self.winfo_id()))
    self._default_font = tkFont.nametofont("TkDefaultFont")
    self._default_font.configure(size=30)
    # define default dataset
    self._defaultfile = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), 'chinese100.xlsx')
    # load default filename into label
    basename = os.path.basename(self._defaultfile)
    self._filename_value = Tk.StringVar()
    self._sett_fn_label = Tk.Entry(textvariable=self._filename_value,
                                   font=self._default_font, width=12)
    self._filename_value.set(basename)
    self._sett_fn_label.grid(row=1, column=0, sticky=Tk.W)
    # button to browse for datafile
    self.browse = Tk.Button(self, text="Browse", command=self._get_file)
    self.browse.grid(row=1, column=1, sticky=Tk.W)
    # OK button to start game
    self._reset_button = Tk.Button(text="Reset", command=self._reset_list)
    self._reset_button.grid(row=1, column=2)
    # label
    self._sett_label = Tk.Label(text="Number of Cards:")
    self._sett_label.grid(row=2, column=0, sticky=Tk.E)
    # entry field for number of cards
    entryText = Tk.StringVar()
    self._sett_entry = Tk.Entry(textvariable=entryText,
                                font=self._default_font, width=3)
    entryText.set("30")
    self._sett_entry.grid(row=2, column=1, sticky=Tk.W)
    self._sett_entry.focus_set()
    # reverse option
    self._radio_val = Tk.IntVar()
    self._radio1 = Tk.Radiobutton(text="Ch to E", variable=self._radio_val, value=1)
    self._radio1.grid(row=4, column=0)
    self._radio2 = Tk.Radiobutton(text="E to Ch", variable=self._radio_val, value=2)
    self._radio2.grid(row=4, column=1)
    self._radio_val.set(1)
    # OK button to start game
    self._sett_button = Tk.Button(text="OK", command=self._start_game)
    self._sett_button.grid(columnspan=3)
    # Bind return key to start game
    self.bind('<Return>', self._start_game)
    # pinyin helpers used throughout the session
    self._p = Pinyin()
    self._f = ReadingFactory()
Radical 9 9 4EBA man rén Radical 30 30 53E3 mouth kǒu Radical 61 61 5FC3 heart xīn Radical 3 3 4E36 dot zhù Radical 4 4 4E3F slash piě Radical 5 5 4E59 second, fishing hook yǐ Radical 6 6 4E85 hook jué Radical 7 7 4E8C two èr Radical 8 8 4EA0 lid, head tóu Radical 10 10 513F legs ér Radical 11 11 5165 enter rù Radical 12 12 516B eight bā Radical 140 140 8278 grass cǎo Radical 24 24 5341 ten shí Radical 13 13 5182 wide jiōng Radical 14 14 5196 cover mī Radical 15 15 51AB ice bīng """ from cjklib.reading import ReadingFactory f = ReadingFactory() for line in entries.split('\n'): if not line.strip(): continue _, radicalIdx, _, meaning, pinyin = line.strip('\t').split('\t') pinyinNumbers = f.convert(pinyin, 'Pinyin', 'Pinyin', targetOptions={'toneMarkType': 'numbers'}) print '%(idx)d,"%(pinyin)s","%(meaning)s"' \ % {'meaning': meaning, 'idx': int(radicalIdx), 'pinyin': pinyinNumbers}
class CharacterLookupReadingMethodsTest(CharacterLookupTest, unittest.TestCase):
    """
    Runs consistency checks on the reading methods of the
    :class:`~cjklib.characterlookup.CharacterLookup` class.

    .. todo::
        * Impl: include script table from Unicode 5.2.0 to get character
          ranges for Hangul and Kana
    """
    # per-reading dialect option dicts to additionally test (empty by default)
    DIALECTS = {}
    # fallback entity lists for readings without a getReadingEntities method
    SPECIAL_ENTITY_LIST = {}

    def setUp(self):
        # shared fixture, then a ReadingFactory on the same DB connection
        CharacterLookupTest.setUp(self)
        self.f = ReadingFactory(dbConnectInst=self.db)

    def testReadingMappingAvailability(self):
        """
        Test if the readings under
        ``CharacterLookup.CHARARACTER_READING_MAPPING`` are available for
        conversion.
        """
        # mock to simulate availability of all tables in
        # characterLookup.CHARARACTER_READING_MAPPING
        tables = [table for table, _ \
            in self.characterLookup.CHARARACTER_READING_MAPPING.values()]
        self.characterLookup.db.engine = EngineMock(
            self.characterLookup.db.engine, mockTables=tables)

        for reading in self.characterLookup.CHARARACTER_READING_MAPPING:
            # only if table exists
            table, _ = self.characterLookup.CHARARACTER_READING_MAPPING[reading]
            self.assert_(
                self.characterLookup.hasMappingForReadingToCharacter(reading))
            self.assert_(
                self.characterLookup.hasMappingForCharacterToReading(reading))

        # test proper checking for all known readings
        for reading in self.f.getSupportedReadings():
            # the has* methods must return a proper boolean, never raise
            self.assert_(
                self.characterLookup.hasMappingForReadingToCharacter(reading) \
                in [True, False])
            self.assert_(
                self.characterLookup.hasMappingForCharacterToReading(reading) \
                in [True, False])

    @attr('slow')
    def testGetCharactersForReadingAcceptsAllEntities(self):
        """Test if ``getCharactersForReading`` accepts all reading entities."""
        for reading in self.f.getSupportedReadings():
            if not self.characterLookup.hasMappingForReadingToCharacter(
                reading):
                continue
            # always test the default dialect, plus any configured extras
            dialects = [{}]
            if reading in self.DIALECTS:
                dialects.extend(self.DIALECTS[reading])
            for dialect in dialects:
                # prefer the operator's own entity list; fall back to the
                # class-level SPECIAL_ENTITY_LIST, else skip this reading
                if hasattr(self.f.getReadingOperatorClass(reading),
                    'getReadingEntities'):
                    entities = self.f.getReadingEntities(reading, **dialect)
                elif reading in self.SPECIAL_ENTITY_LIST:
                    entities = self.SPECIAL_ENTITY_LIST[reading]
                else:
                    continue
                for entity in entities:
                    try:
                        results = self.characterLookup.getCharactersForReading(
                            entity, reading, **dialect)
                        self.assertEquals(type(results), type([]),
                            "Method getCharactersForReading() doesn't return" \
                            + " a list for entity %s " % repr(entity) \
                            + ' (reading %s, dialect %s)' % (reading, dialect))
                        for entry in results:
                            self.assertEquals(len(entry), 1,
                                "Entry %s in result for %s has length != 1" \
                                % (repr(entry), repr(entity)) \
                                + ' (reading %s, dialect %s)' \
                                % (reading, dialect))
                    # entities unsupported or unconvertible by the backend
                    # are acceptable here -- the test only checks acceptance
                    except exception.UnsupportedError:
                        pass
                    except exception.ConversionError:
                        pass
def setUp(self):
    # Run the shared CharacterLookup fixture first so self.db (the test
    # database connection) is available...
    CharacterLookupTest.setUp(self)
    # ...then bind a ReadingFactory to that same connection for the tests.
    self.f = ReadingFactory(dbConnectInst=self.db)
def runTests(tests, databases, registerUnicode, iteration=10):
    """Benchmark the dictionary search methods for each test database.

    For each test number a connection to the corresponding SQLite database
    is opened, the dictionaries present in it are discovered, and the four
    lookup methods (``getFor``, ``getForHeadword``, ``getForReading``,
    ``getForTranslation``) are timed over the module's SEARCH_REQUESTS.

    :param tests: iterable of test numbers, keys into *databases* and
        *registerUnicode*
    :param databases: mapping of test number to SQLite database file path
    :param registerUnicode: mapping of test number to the connector's
        ``registerUnicode`` flag
    :param iteration: number of timeit iterations per method (default 10)
    :return: nested dict ``{test no: {dict name: {method name: seconds}}}``
    :raises ValueError: if a database contains no known dictionary tables
    """
    # Local import: the previously used ``imp.new_module`` comes from the
    # ``imp`` module, deprecated since Python 3.4 and removed in 3.12;
    # ``types.ModuleType`` creates the same kind of fresh module object.
    import types

    f = ReadingFactory()
    timing = {}
    for no in tests:
        print("Running test %d (reading from %s)..." % (no, databases[no]))
        connection = {
            'sqlalchemy.url': 'sqlite:///%s' % databases[no],
            'attach': ['cjklib'],
            'registerUnicode': registerUnicode[no]
        }
        db = dbconnector.getDBConnector(connection)

        # dictionaries known to cjklib that actually exist in this database
        availableDicts = [dictClass.DICTIONARY_TABLE
                          for dictClass in dictionary.BaseDictionary\
                              .getAvailableDictionaries(db)]
        dictionaries = list(
            set(availableDicts)
            & set(db.engine.table_names(schema=db._mainSchema)))
        if not dictionaries:
            raise ValueError("No dictionaries found")
        print("Found dictionaries '%s'" % "', '".join(dictionaries))

        runTime = {}
        for dictName in dictionaries:
            dictClass = dictionary.BaseDictionary.getDictionaryClass(dictName)
            dictInstance = dictClass(dbConnectInst=db)

            # guess per-request reading dialect options where supported
            opClass = (dictClass.READING
                       and f.getReadingOperatorClass(dictClass.READING))
            if hasattr(opClass, 'guessReadingDialect'):
                requestList = []
                for request in SEARCH_REQUESTS:
                    options = opClass.guessReadingDialect(request)
                    requestList.append((request, options))
            else:
                requestList = [(request, {}) for request in SEARCH_REQUESTS]

            # expose the fixtures through a throwaway module so the timeit
            # statement can reach them by name via ``import timeit_runmod``
            mod = types.ModuleType('timeit_runmod')
            mod.runRequest = runRequest
            mod.dictInstance = dictInstance
            mod.requestList = requestList
            sys.modules['timeit_runmod'] = mod

            methodTime = {}
            for method in ('getFor', 'getForHeadword', 'getForReading',
                           'getForTranslation'):
                t = Timer(
                    """timeit_runmod.runRequest(
                        timeit_runmod.dictInstance,
                        timeit_runmod.requestList,
                        method='%s')
                    """ % method, "import timeit_runmod")
                methodTime[method] = t.timeit(iteration)
            runTime[dictName] = methodTime
        timing[no] = runTime
    return timing
class CharacterLookupReadingMethodsTest(CharacterLookupTest, unittest.TestCase):
    """
    Runs consistency checks on the reading methods of the
    :class:`~cjklib.characterlookup.CharacterLookup` class.

    .. todo::
        * Impl: include script table from Unicode 5.2.0 to get character
          ranges for Hangul and Kana
    """
    # per-reading dialect option dicts to additionally test (empty by default)
    DIALECTS = {}
    # fallback entity lists for readings without a getReadingEntities method
    SPECIAL_ENTITY_LIST = {}

    def setUp(self):
        # shared fixture, then a ReadingFactory on the same DB connection
        CharacterLookupTest.setUp(self)
        self.f = ReadingFactory(dbConnectInst=self.db)

    def testReadingMappingAvailability(self):
        """
        Test if the readings under
        ``CharacterLookup.CHARARACTER_READING_MAPPING`` are available for
        conversion.
        """
        # mock to simulate availability of all tables in
        # characterLookup.CHARARACTER_READING_MAPPING
        tables = [table for table, _ \
            in list(self.characterLookup.CHARARACTER_READING_MAPPING.values())]
        self.characterLookup.db.engine = EngineMock(
            self.characterLookup.db.engine, mockTables=tables)

        for reading in self.characterLookup.CHARARACTER_READING_MAPPING:
            # only if table exists
            table, _ = self.characterLookup.CHARARACTER_READING_MAPPING[
                reading]
            self.assertTrue(
                self.characterLookup.hasMappingForReadingToCharacter(reading))
            self.assertTrue(
                self.characterLookup.hasMappingForCharacterToReading(reading))

        # test proper checking for all known readings
        for reading in self.f.getSupportedReadings():
            # the has* methods must return a proper boolean, never raise
            self.assertTrue(
                self.characterLookup.hasMappingForReadingToCharacter(reading) \
                in [True, False])
            self.assertTrue(
                self.characterLookup.hasMappingForCharacterToReading(reading) \
                in [True, False])

    @attr('slow')
    def testGetCharactersForReadingAcceptsAllEntities(self):
        """Test if ``getCharactersForReading`` accepts all reading entities."""
        for reading in self.f.getSupportedReadings():
            if not self.characterLookup.hasMappingForReadingToCharacter(
                reading):
                continue
            # always test the default dialect, plus any configured extras
            dialects = [{}]
            if reading in self.DIALECTS:
                dialects.extend(self.DIALECTS[reading])
            for dialect in dialects:
                # prefer the operator's own entity list; fall back to the
                # class-level SPECIAL_ENTITY_LIST, else skip this reading
                if hasattr(self.f.getReadingOperatorClass(reading),
                    'getReadingEntities'):
                    entities = self.f.getReadingEntities(reading, **dialect)
                elif reading in self.SPECIAL_ENTITY_LIST:
                    entities = self.SPECIAL_ENTITY_LIST[reading]
                else:
                    continue
                for entity in entities:
                    try:
                        results = self.characterLookup.getCharactersForReading(
                            entity, reading, **dialect)
                        self.assertEqual(type(results), type([]),
                            "Method getCharactersForReading() doesn't return" \
                            + " a list for entity %s " % repr(entity) \
                            + ' (reading %s, dialect %s)' % (reading, dialect))
                        for entry in results:
                            self.assertEqual(len(entry), 1,
                                "Entry %s in result for %s has length != 1" \
                                % (repr(entry), repr(entity)) \
                                + ' (reading %s, dialect %s)' \
                                % (reading, dialect))
                    # entities unsupported or unconvertible by the backend
                    # are acceptable here -- the test only checks acceptance
                    except exception.UnsupportedError:
                        pass
                    except exception.ConversionError:
                        pass
class BKRS2DB(object): """Class to convert BKRS.info dictionary into Pleco database format""" def __init__(self, configfile): super(BKRS2DB, self).__init__() #statprof.start() self.get_config(configfile) self.comma_symbols = [u',', u'﹐', ','] self.BUFFER_SIZE = 10000 self.buffer_index = 0 self.read_fab = ReadingFactory() self.cedict = CEDICT() self.cjk = characterlookup.CharacterLookup('T') self.pinyinOp = self.read_fab.createReadingOperator('Pinyin') self.charInfo = cjknife.CharacterInfo() self.last_error = {'description':'', 'match':'', 'not_match': ''} self.bad_word_index = 0 self.additional_reading = {} self.hanzi_stat = {} self.hanzi_freq = {} self.hanzi_pron_var = {} self.errors_description = { 'pinyin_not_match':'Не совпадает', 'no_pinyin':'Нет чтения', 'pinyin_have_tag_symbol':'В пиньине теги', 'pinyin_have_bad_symbol':'В пиньине плохие символы', 'pinyin_have_rus_letter':'В пиньине русские буквы', 'pinyin_have_number_symbol':'В пиньине цифры', 'word_have_alpha_symbol':'В слове alfa символы' } self.log_file = open(self.params['log_file'], 'w', 1000) if self.params['write_to_pleco_db']: self.pleco = Pleco(self.params['output_pleco_database_file'], self) self.bad_hanzi_list = False def export(self): if self.params['write_to_db']: self.conn = sqlite3.connect(self.params['output_database_file']) self.cursor = self.conn.cursor() self.bad_words_file = open(self.params['bad_words_file'], 'w', 1000) self.bad_words_list = open(self.params['bad_words_list'], 'w', 100) self.bad_hanzi_list = open(self.params['bad_hanzi_list'], 'w', 100) self.start_bad_words_file() self.log('Start of export. 
Input: '+self.params['input_bkrs_file']+', output: '+self.params['output_pleco_database_file']) self.start_time = time.time() if self.params['write_to_db']: self.create_db() self.dic = open(self.params['input_bkrs_file'], mode='r') line_type = '' word = '' pronounce = '' translate = '' word_index = 0 good_words = 0 have_no_rus_translate = 0 bad_word_not_found_pron_variant = 0 pinyin_have_number_symbol = 0 self.ambiguous_decomposition = 0 pinyin_have_tag_symbol = 0 num_pinyin_have_tone_mark = 0 no_pron_symbols_in_pinyin = 0 self.load_character_frequency() if self.params['additional_pronounces_file']: self.load_additional_pronounces() self.flog('Start work with BKRS data file...') for line in self.dic: if line == '\n': line_type = 'word' else: if not line.startswith('#'): if line_type == 'word': word_index = word_index+1 if self.params['show_progress']: self.show_progress(word_index, self.params['to_word_number']) word = (line[:-1]).strip().decode('utf-8') self.stat_words_hanzi(word) #word = self.join_nonprintable_hanzi(word) # 鱼岁 = 鱥 line_type = 'pronounce' elif line_type == 'pronounce': pronounce = (line[1:-1]).strip().decode('utf-8') line_type = 'translate' elif line_type == 'translate': if word_index <= self.params['from_word_number']: continue if self.params['to_word_number'] > 0: if word_index >= self.params['to_word_number']: break translate = (line[1:-1]).strip().decode('utf-8') translate_with_tags = translate translate_pleco = self.pleco.remove_html_tags(translate) word_info = ' line #'+str(word_index*4+1)+' word #'+str(word_index)+' word: '+word+' pinyin: '+pronounce pronounce = self.filter_pinyin(pronounce) if self.have_rus_letters(pronounce): self.log('Warning: pinyin have russian letters'+word_info) self.log_bad_word(word, pronounce, 'pinyin_have_rus_letter', translate_with_tags, word_index) self.bad_words_list.write(word.encode('utf-8')+'\t'+pronounce.encode('utf-8')+'\n') continue if self.have_number_symbol(pronounce) and not 
self.have_number_symbol(word): pinyin_have_number_symbol += 1 self.log('Pinyin have tone number '+word_info) self.log_bad_word(word, pronounce, 'pinyin_have_number_symbol', translate_with_tags, word_index) self.bad_words_list.write(word.encode('utf-8')+'\t'+pronounce.encode('utf-8')+'\n') continue if self.have_tag_symbol(pronounce): pinyin_have_tag_symbol +=1 self.log('Warning: pinyin have tag symbols '+word_info) self.log_bad_word(word, pronounce, 'pinyin_have_tag_symbol', translate_with_tags, word_index) self.bad_words_list.write(word.encode('utf-8')+'\t'+pronounce.encode('utf-8')+'\n') continue if not self.have_rus_letters(translate_pleco): have_no_rus_translate += 1 continue if self.have_pron_symbol(pronounce): ob_pronounce = self.convert_full_pinyin(word, pronounce) pronounce_numeric_tone = self.get_string_pron(ob_pronounce) if not pronounce_numeric_tone: self.log('Error not found pronounce variant'+word_info) bad_word_not_found_pron_variant += 1 if self.last_error['description'] != 'HANZI_WITH_NO_PRON': if self.translate_have_rus(translate_with_tags): if self.have_lat_letters_or_numbers(word): self.log_bad_word(word, pronounce, 'word_have_alpha_symbol', translate_with_tags, word_index) else: self.log_bad_word(word, pronounce, 'pinyin_not_match', translate_with_tags, word_index) self.bad_words_list.write(word.encode('utf-8')+'\t'+pronounce.encode('utf-8')+'\n') continue else: #if self.translate_have_rus(translate_with_tags): # self.log_bad_word(word, pronounce, 'no_pinyin', translate_with_tags, word_index) no_pron_symbols_in_pinyin += 1 continue if self.pinyin_have_bad_symbol(pronounce): self.log_bad_word(word, pronounce, 'pinyin_have_bad_symbol', translate_with_tags, word_index) trad_word = self.get_trad(word) if self.params['write_to_pleco_db']: self.pleco.write_db(word, trad_word, pronounce_numeric_tone, translate_pleco) self.pleco.create_db_word_index(ob_pronounce, len(word)) if self.params['write_to_db']: freq = self.get_word_freq(word) 
self.write_db(trad_word, word, pronounce_numeric_tone, translate, freq) self.clear_last_error() good_words += 1 self.flog('OK.. ###################################################################################') self.flog('Count of words:\t\t\t\t\t'+str(word_index)) self.flog('Good words:\t\t\t\t\t\t'+str(good_words)+'\t\t('+str(round(float(good_words)*100/word_index,2))+'%)') self.flog('Have no rus translate:\t\t\t'+str(have_no_rus_translate)+'\t\t('+str(round(float(have_no_rus_translate)*100/word_index,2))+'%)') self.flog('Not found pronounce variant: \t'+str(bad_word_not_found_pron_variant)+'\t\t('+str(round(float(bad_word_not_found_pron_variant)*100/word_index,2))+'%)') self.flog('Numeric pinyin have tone mark:\t'+str(num_pinyin_have_tone_mark)+'\t\t('+str(round(float(num_pinyin_have_tone_mark)*100/word_index,2))+'%)') self.flog('Pinyin field have tone number:\t'+str(pinyin_have_number_symbol)+'\t\t('+str(round(float(pinyin_have_number_symbol)*100/word_index,2))+'%)') self.flog('Pinyin pinyin have tag symbol:\t'+str(pinyin_have_tag_symbol)+'\t\t('+str(round(float(pinyin_have_tag_symbol)*100/word_index,2))+'%)') self.flog('Pinyin have no pron symbols: \t'+str(no_pron_symbols_in_pinyin)+'\t\t('+str(round(float(no_pron_symbols_in_pinyin)*100/word_index,2))+'%)') self.log_hanzi_stat() if self.params['write_to_pleco_db']: self.pleco.create_db_index() self.pleco.conn.commit() self.pleco.conn.close() if self.params['write_to_db']: self.create_db_index() self.conn.commit() self.conn.close() self.dic.close() self.end_time = time.time() self.flog('End of export. 
Total time: '+str(round(self.end_time - self.start_time ,2))+' sec') self.end_bad_words_file() self.bad_words_list.close() self.bad_hanzi_list.close() self.bad_words_file.close() #statprof.stop() #statprof.display() def __del__(self): self.log_file.close() def get_config(self, configfile): self.config = ConfigParser.ConfigParser() configPath = os.path.dirname(__file__)+'/'+configfile self.config.read(configPath) self.params = {} self.params['write_to_db'] = self.config.getboolean('Main', 'write_to_db') self.params['write_to_pleco_db'] = self.config.getboolean('Main', 'write_to_pleco_db') self.params['show_progress'] = self.config.getboolean('Main', 'show_progress') self.params['approx_count_of_words'] = self.config.getint('Main', 'approx_count_of_words') self.params['from_word_number'] = self.config.getint('Main', 'from_word_number') self.params['to_word_number'] = self.config.getint('Main', 'to_word_number') self.params['log_console'] = False self.params['input_bkrs_file'] = self.config.get('Input files', 'bkrs_db') self.params['additional_pronounces_file'] = self.config.get('Input files', 'additional_pronounces') self.params['char_freq_file'] = self.config.get('Input files', 'char_freq') self.params['log_template'] = self.config.get('Input files', 'log_template') self.params['output_pleco_database_file'] = self.config.get('Output files', 'pleco_db') self.params['output_database_file'] = self.config.get('Output files', 'sqlite_db') self.params['log_file'] = self.config.get('Output files', 'log_file') self.params['bad_words_file'] = self.config.get('Output files', 'bad_words_html') self.params['bad_words_list'] = self.config.get('Output files', 'bad_words_list') self.params['bad_hanzi_list'] = self.config.get('Output files', 'bad_hanzi_list') self.params['frequency_file'] = self.config.get('Output files', 'frequency') def log_hanzi_stat(self): frequency_file = open(self.params['frequency_file'], 'w', 100) hanzilist = [] for key, val in self.hanzi_stat.items(): 
            # NOTE(review): this excerpt starts mid-method — the enclosing
            # statistics-dump method's header (and the loop defining
            # `hanzilist`/`val`, plus `frequency_file`) sits above this chunk.
            hanzilist.append(val)
        uniquehanzi = len(hanzilist)
        allhanzi = 0
        # Total occurrence count across all distinct hanzi.
        for h in hanzilist:
            allhanzi += h['count']
        self.log('Hanzi statistic ################################################################################')
        self.log('Total hanzi: '+str(allhanzi))
        self.log('Unique hanzi: '+str(uniquehanzi))
        self.log('Top 100 error hanzi ############################################################################')
        # Log the 100 hanzi with the most conversion errors.
        hanzilist.sort(key=lambda x: x['error'], reverse = True)
        i = 0
        for hanzi in hanzilist:
            i += 1
            self.log('Hanzi: '+hanzi['hanzi']+' \t Count: '+str(hanzi['count'])+'\t\tError: '+str(hanzi['error']), with_time = False)
            if i>100:
                break
        self.log('Hanzi frequency ##################################################################################')
        # Dump the full frequency table, most frequent first, as UTF-8 TSV.
        hanzilist.sort(key=lambda x: x['count'], reverse = True)
        for hanzi in hanzilist:
            frequency_file.write(hanzi['hanzi'].encode('utf-8')+'\t'+str(hanzi['count'])+'\n')
        frequency_file.close()

    def get_string_pron(self, ob_pron):
        """Flatten a pronunciation object into a display string.

        ob_pron is a list of pronunciations, each pronunciation being a list
        of (hanzi, num_pinyin, sep) tuples.  The numbered pinyin syllables of
        each pronunciation are joined with their separators, and multiple
        pronunciations are joined with ', '.  Returns '' for a falsy input.
        """
        if not ob_pron:
            return ''
        list_pron = []
        for pron in ob_pron:
            str_pron = ''
            for hanzi, num_pinyin, sep in pron:
                str_pron = str_pron+num_pinyin+sep
            list_pron.append(str_pron)
        num_pron = ', '.join(list_pron)
        return num_pron.strip()

    def convert_full_pinyin(self, hanziword, pinyin):
        """Convert every comma-separated pinyin variant of a word.

        Returns a list of successfully converted pronunciation objects (see
        convert_pinyin), or False when hanziword contains no convertible
        characters (last_error is set to 'NOT_CHINESE_CHARS' in that case).
        """
        clean_hanzi = self.filter_hanzi(hanziword)
        pinyin = self.filter_pinyin(pinyin)
        # If the hanzi side uses commas, commas in the pinyin are separators
        # inside one pronunciation, not alternative pronunciations.
        if self.hanziword_have_comma(hanziword):
            pinyin = self.replace_comma(pinyin, rep = ' ')
        pinyins = pinyin.split(',')
        if len(clean_hanzi) == 0:
            self.log('Error Hanzi word length is zero! Hanzi: '+hanziword+' clean Hanzi: '+clean_hanzi)
            self.last_error['description'] = 'NOT_CHINESE_CHARS'
            return False
        pinyins_good_results = []
        for atom_pinyin in pinyins:
            # Try longest-match-first, then shortest-first as a fallback.
            pron = self.convert_pinyin(clean_hanzi, atom_pinyin, reverse_sort = True)
            if not pron:
                pron = self.convert_pinyin(clean_hanzi, atom_pinyin, reverse_sort = False)
            if pron:
                pinyins_good_results.append(pron)
        return pinyins_good_results

    def convert_pinyin(self, clean_hanzi, pinyin, reverse_sort = True):
        """Greedily match pinyin syllables to the characters of clean_hanzi.

        Consumes `pinyin` from the left, one character of clean_hanzi at a
        time, accepting the first candidate reading that is a prefix of the
        remaining pinyin.  Returns a list of (hanzi, num_pinyin, sep) tuples,
        or False when a character has no reading or no reading matches
        (last_error and the error statistics are updated as a side effect).
        """
        old_pinyin = pinyin
        ob_pronounce = []
        for hanzi in clean_hanzi:
            all_pron_variants = self.get_pron_variants(hanzi)
            all_pron_variants_mixed = self.get_pron_variants_mixed(hanzi, reverse_sort)
            not_found_pron = True
            if not all_pron_variants:
                self.log('No fonded any pronounciation for hanzi: '+hanzi)
                self.last_error['description'] = 'HANZI_WITH_NO_PRON'
                self.last_error['hanzi'] = hanzi
                if self.bad_hanzi_list:
                    self.bad_hanzi_list.write(hanzi.encode('utf-8')+'\n')
                return False
            for pron_var in all_pron_variants_mixed:
                if pinyin.startswith(pron_var):
                    not_found_pron = False
                    # split_pinyin strips the matched syllable (and one
                    # separator) off the front of the remaining pinyin.
                    pinyin_splited = self.split_pinyin(hanzi, pron_var, pinyin)
                    pinyin = pinyin_splited['pinyin']
                    ob_pronounce.append((hanzi, pinyin_splited['num_pron'], pinyin_splited['sep']))
                    break
            if not_found_pron:
                self.stat_add_hanzi_error(hanzi)
                matched_str = ' '.join('['+h+':'+p+']' for h,p,s in ob_pronounce)
                self.log('Not found pron for hanzi: '+hanzi+' ['+ ' '.join(s for s in all_pron_variants)+'] P1:'+old_pinyin+' P2:'+pinyin+' '+matched_str)
                self.last_error['match'] = matched_str
                self.last_error['not_match'] = hanzi+' ['+ ' '.join(s for s in all_pron_variants)+']'
                return False
        return ob_pronounce

    def split_pinyin(self, hanzi, pron_var, pinyin):
        """Remove the matched syllable pron_var from the front of pinyin.

        Returns a dict with the numeric-tone form of the syllable
        ('num_pron'), the separator that followed it ('sep', possibly ''),
        and the rest of the pinyin string ('pinyin').
        """
        all_separators = [u' ', u'’']
        pinyin = pinyin.replace(pron_var, '', 1)
        separator = ''
        for sep in all_separators:
            if pinyin.startswith(sep):
                pinyin = pinyin.lstrip(sep)
                separator = sep
                break
        # Latin/Greek "hanzi" with no tone mark keep their literal reading;
        # everything else is converted to numbered-tone pinyin.
        if not self.have_tone_mark(pron_var) and re.match(u'^[a-zA-Zα-ωΑ-Ω]$',hanzi):
            num_pron = pron_var
        else:
            num_pron = self.get_numeric_tone(pron_var)
        return {'num_pron':num_pron,'sep':separator, 'pinyin':pinyin}

    def get_pron_variants_mixed(self, hanzi, reverse_sort):
        """Return candidate readings for hanzi, tone-expanded and sorted.

        Digits keep their plain readings; for real characters every reading
        is expanded with all tones plus a toneless form (see
        get_with_mixed_tones), sorted by length per reverse_sort.
        """
        all_pron_variants = self.get_pron_variants(hanzi)
        if re.match('^[0-9]$',hanzi):
            pron_variants = all_pron_variants
        else:
            pron_variants = self.get_with_mixed_tones(all_pron_variants, reverse_sort)
        return pron_variants

    def get_pron_variants(self, hanzi):
        """Return the known readings for hanzi (cjklib + additional file).

        Results are deduplicated case-insensitively and memoized in
        self.hanzi_pron_var.
        """
        if hanzi in self.hanzi_pron_var:
            return self.hanzi_pron_var[hanzi]
        pron_variants = []
        try:
            pron_variants = pron_variants + self.cjk.getReadingForCharacter(hanzi, 'Pinyin')
        # NOTE(review): bare except deliberately kept byte-identical here; it
        # swallows any cjklib lookup failure and falls back to the
        # additional-reading table.  Consider narrowing it.
        except:
            self.log('Error: getReadingForCharacter. Hanzi: '+hanzi)
        if hanzi in self.additional_reading:
            pron_variants = pron_variants + self.additional_reading[hanzi]
        unique_pron_vars = unique_list(pron_variants, lambda x: x.lower().strip())
        self.hanzi_pron_var[hanzi] = unique_pron_vars
        return unique_pron_vars

    def stat_add_hanzi(self, hanzi):
        """Count one occurrence of hanzi in the statistics table."""
        if hanzi in self.hanzi_stat:
            self.hanzi_stat[hanzi]['count'] += 1
        else:
            self.hanzi_stat[hanzi] = {'hanzi': hanzi, 'count':1, 'error':0}

    def stat_add_hanzi_error(self, hanzi):
        """Count one conversion error for hanzi in the statistics table."""
        if hanzi in self.hanzi_stat:
            self.hanzi_stat[hanzi]['error'] += 1
        else:
            self.hanzi_stat[hanzi] = {'hanzi': hanzi, 'count':1, 'error':1}

    def stat_words_hanzi(self, word):
        """Count every character of word in the statistics table."""
        for hanzi in word:
            self.stat_add_hanzi(hanzi)

    def get_with_mixed_tones(self, pron_var_list, reverse_sort = True):
        """Expand readings with all tone variants plus a toneless form.

        The result is deduplicated and sorted by length (longest first when
        reverse_sort is True) so that greedy prefix matching prefers longer
        syllables.
        """
        mixed_pron_variants = pron_var_list
        for pron_var in pron_var_list:
            none_tone_pron = self.get_without_tone_mark(pron_var)
            alltones = self.get_all_tones(none_tone_pron)
            alltones.append(none_tone_pron)
            mixed_pron_variants = mixed_pron_variants+alltones
        mixed_pron_variants = unique_list(mixed_pron_variants)
        mixed_pron_variants.sort(key=len, reverse = reverse_sort)
        return mixed_pron_variants

    def load_additional_pronounces(self):
        """Load extra hanzi→readings mappings from the configured TSV files.

        File format (UTF-8): 'hanzi[,hanzi...]<TAB>reading[,reading...]' per
        line.  Readings are merged (case-insensitively deduplicated) into
        self.additional_reading.
        """
        self.flog('Start loading additional reading database...')
        files = self.params['additional_pronounces_file'].split(',')
        for file_name in files:
            addreadfile = open(file_name, mode = 'r')
            words = 0
            for line in addreadfile:
                # NOTE(review): the original replaced a single non-printable
                # character here; it is invisible in this copy of the source
                # — verify the literal against the repository file.
                line = line.replace('','') # replace one not printable symbol
                uline = line.strip().decode('utf-8')
                charhanzilist = uline.split('\t')[0].strip().split(',')
                readings = uline.split('\t')[1].split(',')
                for charhanzi in charhanzilist:
                    if charhanzi in self.additional_reading:
                        self.additional_reading[charhanzi] = unique_list(readings + self.additional_reading[charhanzi], lambda x: x.lower())
                    else:
                        self.additional_reading[charhanzi] = readings
                words += 1
            addreadfile.close()
            self.flog('Additional reading database loaded from '+file_name+'. Count of hieroglyph: '+str(words))

    def load_character_frequency(self):
        """Load the per-character frequency table ('hanzi<TAB>freq' lines).

        Non-positive frequencies are clamped to 1; lines without a tab are
        skipped.  Populates self.hanzi_freq.
        """
        self.flog('Start loading character frequency...')
        freqfile = open(self.params['char_freq_file'], mode = 'r')
        words = 0
        for line in freqfile:
            uline = (line[:-1]).strip().decode('utf-8')
            if '\t' in uline:
                charhanzi = uline.split('\t')[0].strip()
                freq = int(uline.split('\t')[1])
                if freq <= 0:
                    freq = 1
                self.hanzi_freq[charhanzi] = freq
                words += 1
        self.flog('Characters frequency loaded. Hanzi count: '+str(words))

    def get_hanzi_freq(self, hanzi):
        """Return the frequency of hanzi, defaulting to 1 when unknown."""
        try:
            freq = self.hanzi_freq[hanzi]
        except KeyError:
            freq = 1
        return freq

    def get_word_freq(self, word):
        """Return the mean character frequency of word (0 for empty word)."""
        freq = 0
        length = len(word)
        if not length:
            return 0
        for hanzi in word:
            freq += self.get_hanzi_freq(hanzi)
        freq = int(freq/length)
        return freq

    def filter_hanzi(self, hanziword):
        """Strip everything except CJK, Latin, Greek and digit characters.

        Commas are removed from the result as well (see replace_comma).
        """
        # Unicode blocks for Chinese, Japanese and Korean:
        #{InCJK_Compatibility}: U+3300–U+33FF
        #{InCJK_Unified_Ideographs_Extension_A}: U+3400–U+4DBF
        #{InCJK_Unified_Ideographs}: U+4E00–U+9FFF
        #{InCJK_Compatibility_Ideographs}: U+F900–U+FAFF
        #{InCJK_Compatibility_Forms}: U+FE30–U+FE4F
        clean_word = ''
        for char in re.findall(ur'[0-9a-zA-Zα-ωΑ-Ω\u3300-\u33FF\u3400-\u4DBF\u4e00-\u9fff\uF900-\uFAFF\uFE30-\uFE4F]+', hanziword):
            clean_word = clean_word+char
        clean_word = self.replace_comma(clean_word, rep = '')
        return clean_word
def setUp(self): CharacterLookupTest.setUp(self) self.f = ReadingFactory(dbConnectInst=self.db)
class LeoDownloader(AudioDownloader):
    """Download audio pronunciations from LEO (dict.leo.org)."""
    def __init__(self):
        AudioDownloader.__init__(self)
        self.file_extension = u'.mp3'
        self.url = 'http://www.leo.org/dict/audio_{language}/{word}.mp3'
        # Map two-letter ISO codes to LEO's codes. And, yes, they use ch
        # for Chinese. (I'm not sure if they really have anything for ru
        # or it.)
        self.language_dict = {
            'de': 'de', 'en': 'en', 'es': 'es', 'fr': 'fr', 'it': 'it',
            'ru': 'ru', 'zh': 'ch'}
        # It kind of looks like they have Swiss pronunciations, but they
        # don't.
        self.chinese_code = 'ch'
        # We should keep a number of site icons handy, with the right
        # flag for the request.  Cached per LEO language code.
        self.site_icon_dict = {}
        self.site_file_name_encoding = 'ISO-8859-1'
        self.icon_url_dict = {
            'de': 'http://dict.leo.org/favicon.ico',
            'en': 'http://dict.leo.org/favicon.ico',
            'es': 'http://dict.leo.org/favicon_es.ico',
            'fr': 'http://dict.leo.org/favicon_fr.ico',
            'it': 'http://dict.leo.org/favicon_it.ico',
            'ru': 'http://dict.leo.org/favicon_ru.ico',
            # When we use this dict, we have already munged the 'zh' to 'ch'
            'ch': 'http://dict.leo.org/favicon_ch.ico'}
        # As the name implies, a hack. Try to use the cjklib TTEMPÉ
        # brings along. A system-wide installed one should work as
        # well.
        self.have_tried_cjklib_hack = False
        self.reading_factory = None

    def download_files(self, word, base, ruby, split):
        """
        Download a word from LEO

        We try to get pronunciations for the text for German, English,
        Spanish, French, Italian and Russian, and from the ruby for
        Chinese.  There may not be any pronunciations available for
        Italian or Russian.
        """
        self.downloads_list = []
        # Fix the language. EAFP.
        self.language = self.language_dict[self.language[:2].lower()]
        # set_names also checks the language.
        self.set_names(word, base, ruby)
        if self.chinese_code == self.language and not split:
            return
        # Only get the icon when we have a word
        # self.maybe_get_icon()
        self.get_flag_icon()
        # EAFP. self.query_url may return None...
        word_url = self.query_url(word, ruby)
        # ... then the get_data will blow up
        word_data = self.get_data_from_url(word_url)
        word_file_path, word_file_name = self.get_file_name()
        with open(word_file_path, 'wb') as word_file:
            word_file.write(word_data)
        # We have a file, but not much to say about it.
        self.downloads_list.append(
            (word_file_path, word_file_name, dict(Source='Leo')))

    def query_url(self, word, ruby):
        """Build query URL"""
        # For Chinese the audio file is named after the numbered pinyin
        # taken from the ruby, not after the characters.
        if self.chinese_code == self.language:
            word = self.fix_pinyin(ruby)
        return self.url.format(
            language=self.language,
            word=urllib.quote(word.encode(self.site_file_name_encoding)))

    def fix_pinyin(self, pinyin):
        """Return pinyin with numeric tone marks ('5' mapped to '0').

        Falls back to returning the input unchanged when cjklib cannot be
        imported.
        """
        # Hacks. It is overkill to ship cjklib with this add-on. But
        # to get the tone numbers as numbers, we should use it. My
        # hope (guess) is that the typical user that will want Chinese
        # pronunciations will also have TTEMPÉ's (version of mine)
        # chinese-support-plugin installed. So try to use that and
        # don't complain if it doesn't work.
        if not self.have_tried_cjklib_hack:
            try:
                # If this works, the whole shebang is run as an Anki2
                # add-on. If not, we will still look for a system-wide
                # cjklib, but obviously not for another add-on.
                from aqt.utils import isWin
            except ImportError:
                pass
            else:
                from aqt import mw
                addon_dir = mw.pm.addonFolder()
                if isWin:
                    # The isWin bit is copied from TTEMPÉ's code.
                    addon_dir = addon_dir.encode(sys.getfilesystemencoding())
                sys.path.append(os.path.join(addon_dir, "chinese"))
            # BUG FIX: the original assigned self.have_tried_cjk_hack
            # (missing 'lib'), so the guard above never saw the flag and
            # this whole path re-ran on every call.
            self.have_tried_cjklib_hack = True
        if not self.reading_factory:
            try:
                from cjklib.reading import ReadingFactory
            except ImportError:
                # No cjklib anywhere: hand back the pinyin unmodified.
                return pinyin
            else:
                self.reading_factory = ReadingFactory()
        return self.reading_factory.convert(
            pinyin, 'Pinyin', 'Pinyin',
            targetOptions={'toneMarkType': 'numbers'}).replace('5', '0')

    def get_flag_icon(self):
        """
        Set self.site_icon to the right icon.

        We should use different icons, depending on the request
        language.  We store these icons in self.site_icon_dict and use
        the AudioDownloader.maybe_get_icon() if we don't have it yet.
        """
        if not with_pyqt:
            return
        try:
            # If this works we already have it.
            self.site_icon = self.site_icon_dict[self.language]
        except KeyError:
            # We have to get it ourselves. (We know it's just 16x16, so
            # no resize. And we know the address).
            self.site_icon_dict[self.language] = \
                QImage.fromData(self.get_data_from_url(
                    self.icon_url_dict[self.language]))
            self.site_icon = self.site_icon_dict[self.language]

    def set_names(self, text, base, ruby):
        """
        Set the display text and file base name variables.
        """
        if self.language == self.chinese_code:
            if not ruby:
                raise ValueError('Nothing to download')
            self.base_name = u"{0}_{1}".format(base, ruby)
            self.display_text = u"{1} ({0})".format(base, ruby)
        else:
            if not text:
                raise ValueError('Nothing to download')
            self.base_name = text
            self.display_text = text
def __init__(self, **options): super(ChineseLessonsComMandarinPronunciation, self).__init__(**options) self.readingFactory = ReadingFactory()
def handle_noargs(self, **options): # 一事無成 一事无成 [yi1 shi4 wu2 cheng2] /to have achieved nothing/to be a total failure/to get nowhere/ # EMPTY ALL ZH + PY KEYS self._del_keys('ZH:*') self._del_keys('PY:*') # NOW LETS START file = open(settings.DICT_FILE_LOCATION) item_count = 0 for line in file: if line.startswith("#"): pass else: # OPEN REDIS CONNECTION NOW r_server = _get_redis() # GATHER ALL THE MAIN VARIABLES new = line.split() numbered_pinyin = line[(line.index('[')+1):(line.index(']'))] f = ReadingFactory() tonal_pinyin = f.convert(numbered_pinyin, 'Pinyin', 'Pinyin', sourceOptions={'toneMarkType': 'numbers', 'yVowel': 'v', 'missingToneMark': 'fifth'}) meanings = line[(line.index('/')+1):(line.rindex('/'))] characters = new[1] # REMOVE ALL THE UGLY CHARACTERS if ',' in characters: characters = characters.replace(',', '') # GET AND CLEAN THE MEASURE WORD mws = None if "CL:" in meanings: new_meanings = meanings.split('/') for idx, val in enumerate(new_meanings): if "CL:" in val: mws = [] for x in val.replace('CL:', '').split(','): x = x[:(x.index('['))] if '|' in x: x = x[(x.index('|')+1):] # ADD THE MEAASURE WORDS ENTRY # ---------------------------- mws_key = settings.MEASURE_WORD_KEY % x if r_server.exists(mws_key): values = json.loads(_search_redis(mws_key)) values['chars'].append(characters) else: values = {'chars': [characters,]} r_server.set(mws_key, json.dumps(values)) mws.append(x) new_meanings.pop(idx) meanings = "/".join(new_meanings) char_key = settings.CHINESE_WORD_KEY % ((len((characters))/3), characters) # CREATE THE PRONUNCIATION/MEANING PAIR pair = {} pair['pinyin'] = tonal_pinyin pair['pinyin_numbered'] = _normalize_pinyin(numbered_pinyin) pair['meaning'] = meanings pair['measure_words'] = mws # ADD THE PINYIN ENTRY # -------------------- py_key = settings.PINYIN_WORD_KEY % _pinyin_to_ascii(numbered_pinyin) if r_server.exists(py_key): values = json.loads(_search_redis(py_key)) if smart_unicode(characters) not in values: 
values.append(characters) else: values = [characters,] r_server.set(py_key, json.dumps(values)) # ADD THE CHINESE CHARACTER ENTRY # ------------------------------- if r_server.exists(char_key): values = json.loads(_search_redis(char_key)) values['meanings'].append(pair) else: values = { 'chars': characters, 'meanings': [pair,], } r_server.set(char_key, json.dumps(values)) item_count += 1 print item_count print "%s Chinese items added" % item_count file.close()
def main(): language, output_encoding = locale.getdefaultlocale() if len(sys.argv) == 2: modus = sys.argv[1] if modus not in modi: print "invalid modus, choose one out of: " + ", ".join(modi.keys()) sys.exit(1) else: print "give a modus, choose one out of: " + ", ".join(modi.keys()) sys.exit(1) fromReading, toReading, entryFunc, readingOpt = modi[modus] initialRules = INITIAL_RULES[(fromReading, toReading)] finialRules = FINAL_RULES[(fromReading, toReading)] extraSyllables = EXTRA_SYLLABLES[(fromReading, toReading)] # entry set global entrySet entrySet = set() # build table and use scheme with almost perfect grouping according to # pronunciation, then use headers to get the initial's and final's # pronunciation. op = ReadingFactory().createReadingOperator(fromReading, **readingOpt) # get splitted syllables, finals in first row, initials in first column for syllable in op.getReadingEntities(): initial, final = op.getOnsetRhyme(syllable) # only apply rules if syllable isn't given an extra mapping in # EXTRA_SYLLABLES if not syllable in extraSyllables: # check if we have rules if initialRules[initial] != None and finialRules[final] != None: # check for ambiguous mappings if type(initialRules[initial]) == type({}): initialFeatures = initialRules[initial].keys() else: initialFeatures = [None] if type(finialRules[final]) == type({}): finalFeatures = finialRules[final].keys() else: finalFeatures = [None] # go through all mappings for initialFeature in initialFeatures: for finalFeature in finalFeatures: if initialFeature: targetInitial \ = initialRules[initial][initialFeature] else: targetInitial = initialRules[initial] if finalFeature: targetFinal = finialRules[final][finalFeature] else: targetFinal = finialRules[final] entry = entryFunc(syllable, targetInitial, targetFinal, initialFeature, finalFeature) if entry != None: entrySet.add(entry) else: print >> sys.stderr, ("missing rule(s) for syllable '" \ + syllable + "' with initial/final '" + initial + "'/'" \ + final + 
"'").encode(output_encoding) # print extra syllables for syllable in extraSyllables: if extraSyllables[syllable]: initialRule, finalRule = extraSyllables[syllable] # check for ambiguous mappings if type(initialRule) == type({}): initialFeatures = initialRule.keys() else: initialFeatures = [None] if type(finalRule) == type({}): finalFeatures = finalRule.keys() else: finalFeatures = [None] # go through all mappings for initialFeature in initialFeatures: for finalFeature in finalFeatures: if initialFeature: targetInitial = initialRule[initialFeature] else: targetInitial = initialRule if finalFeature: targetFinal = finalRule[finalFeature] else: targetFinal = finalRule entry = entryFunc(syllable, targetInitial, targetFinal, initialFeature, finalFeature) if entry != None: entrySet.add(entry) notIncludedSyllables = [syllable for syllable in extraSyllables \ if not extraSyllables[syllable]] if notIncludedSyllables: print >> sys.stderr, ("Syllables not included in table: '" \ + "', '".join(sorted(notIncludedSyllables)) + "'")\ .encode(output_encoding) entryList = list(entrySet) entryList.sort() print "\n".join(entryList).encode(output_encoding)