def convertToKana(self):
    """Convert the romaji currently in the lookup box to kana in place,
    then refresh the lookup results when the text has become pure hiragana.
    """
    inputLen = len(self.lookup.text())
    if inputLen > 0:
        if scripts.script_type(self.lookup.text()) == scripts.Script.Kanji:
            pass  #TODO: ...
        #if re.search('n{1}', self.lookup.text()[ inputLen - 2: ]) is None:
        #NB: yes, regexp would be better, yet I failed miserably at it
        # Hold off converting while the tail could still be a pending
        # 'n'/'ny' romaji sequence.
        if self.lookup.text()[ inputLen - 1 ] != u'n' and self.lookup.text()[ inputLen - 2:] != u'ny':
            converted = romkan(self.lookup.text())
            #NB: does not convert naninuneno, somehow (purpotedly, 'n' normalization is to blame)
            self.lookup.setText(converted)
            #self.testConvert.setText(converted)
        # A doubled 'nn' is normalised first, then converted.
        if self.lookup.text()[ inputLen - 2:] == u'nn':
            converted = romkan(normalize_double_n(self.lookup.text()))
            self.lookup.setText(converted)
        #print self.lookup.text()
        #scripts.script_type(cluster) == scripts.Script.Kanji:
        # Single-script text: look it up when it is hiragana, otherwise
        # clear the result table.
        # NOTE(review): reconstructed from a collapsed source line; the
        # else may instead pair with the outer `if` — confirm against VCS.
        if len(scripts.script_boundaries(self.lookup.text())) == 1:
            if scripts.script_type(self.lookup.text()) == scripts.Script.Hiragana:
                self.updateLookupResults(self.lookup.text())
            else:
                self.lookupResults.clearContents()
                self.lookupResults.setRowCount(0)
def _parse_line(self, line):
    """Parses a single line in the kanjdic file, returning an entry.

    Line layout: the kanji itself, its JIS code in hex, then a mix of
    brace-delimited glosses, kana readings and single-letter-prefixed
    codes (remapped via `remappings`).
    """
    segment_pattern = re.compile('[^ {]+|{.*?}', re.UNICODE)
    segments = segment_pattern.findall(line.strip())
    segments.reverse()
    kanji = segments.pop()
    jis_code = int(segments.pop(), 16)
    info = {
        'kanji': kanji,
        'gloss': [],
        'on_readings': [],
        'kun_readings': [],
        'jis_code': jis_code,
    }
    while segments:
        s = segments.pop()
        if s.startswith('{'):
            # Glosses are brace-delimited; strip the braces.
            info['gloss'].append(s[1:-1])
        elif (scripts.script_type(s) != scripts.Script.Ascii
                or s.startswith('-')):
            # It must be a reading; a leading '-' marks a suffix reading.
            char = s[0]
            if char == '-':
                char = s[1]
            if scripts.script_type(char) == scripts.Script.Katakana:
                info['on_readings'].append(s)
            elif scripts.script_type(char) == scripts.Script.Hiragana:
                info['kun_readings'].append(s)
            else:
                raise Exception("Unknown segment %s" % s)
        elif s in ('T1', 'T2'):
            continue
        else:
            # Handle various codes: single-letter prefix plus a value that
            # is kept as a string when it is not a valid integer.
            code = s[0]
            remainder = s[1:]
            try:
                remainder = int(remainder)
            except ValueError:
                # Was a bare `except:` -- only int() can raise here, and a
                # bare except would also swallow KeyboardInterrupt etc.
                pass
            info.setdefault(remappings.get(code, code), []).append(remainder)
    info['stroke_count'] = info['stroke_count'][0]
    if 'frequency' in info:
        info['frequency'] = info['frequency'][0]
    info['skip_code'] = tuple(
        int(i) for i in info['skip_code'][0].split('-'))
    return KanjidicEntry(**info)
def _parse_line(self, line):
    """Parses a single line in the kanjdic file, returning an entry.

    A line contains the kanji, its JIS code in hex, and then a sequence of
    readings, {gloss} segments and letter-prefixed codes.
    """
    segment_pattern = re.compile('[^ {]+|{.*?}', re.UNICODE)
    segments = segment_pattern.findall(line.strip())
    segments.reverse()
    kanji = segments.pop()
    jis_code = int(segments.pop(), 16)
    info = {
        'kanji': kanji,
        'gloss': [],
        'on_readings': [],
        'kun_readings': [],
        'jis_code': jis_code,
    }
    while segments:
        s = segments.pop()
        if s.startswith('{'):
            # Brace-delimited gloss.
            info['gloss'].append(s[1:-1])
        elif (scripts.script_type(s) != scripts.Script.Ascii
                or s.startswith('-')):
            # It must be a reading; katakana readings are on, hiragana kun.
            char = s[0]
            if char == '-':
                char = s[1]
            if scripts.script_type(char) == scripts.Script.Katakana:
                info['on_readings'].append(s)
            elif scripts.script_type(char) == scripts.Script.Hiragana:
                info['kun_readings'].append(s)
            else:
                raise Exception("Unknown segment %s" % s)
        elif s in ('T1', 'T2'):
            continue
        else:
            # Handle various codes.
            code = s[0]
            remainder = s[1:]
            try:
                remainder = int(remainder)
            except ValueError:
                # Narrowed from a bare except: only int() can fail here,
                # and non-numeric remainders are intentionally kept as str.
                pass
            info.setdefault(remappings.get(code, code), []).append(
                remainder)
    info['stroke_count'] = info['stroke_count'][0]
    if 'frequency' in info:
        info['frequency'] = info['frequency'][0]
    info['skip_code'] = tuple(int(i)
                              for i in info['skip_code'][0].split('-'))
    return KanjidicEntry(**info)
def sample_seq_n_uniform(self, condition_segments, n, exclude_set=None):
    """Sample (at least) n segment sequences for the given condition
    segments.  Each kanji segment is replaced with candidates drawn in
    random database order (``order_by('?')``); other segments pass through
    verbatim.  Results whose flattened form is in *exclude_set* are
    skipped, and each accepted result is added to the exclusion set so it
    is not produced twice.
    """
    # XXX Note: potential for infinite recursion if not enough candidates
    # are available. Much less likely to hit this case than non-uniform
    # sampling.
    exclude_set = set(exclude_set or [])
    results = []
    kanji_script = scripts.Script.Kanji
    while len(results) < n:
        result_seg_sets = []
        for segment in condition_segments:
            if scripts.script_type(segment) == kanji_script:
                # Up to n random replacement symbols for this segment.
                result_seg_sets.append(
                    [o['symbol'] for o in self.density.filter(
                        condition=segment).order_by('?').values(
                        'symbol')[:n]]
                )
            else:
                # Non-kanji segments are fixed; repeat for zipping below.
                result_seg_sets.append([segment] * n)
        for result_segs in zip(*result_seg_sets):
            flat_result = ''.join(result_segs)
            if flat_result not in exclude_set:
                results.append(result_segs)
                exclude_set.add(flat_result)
    return results
def canonical_segment_forms(segment, left_context=True, right_context=True):
    """
    When given a single segment, determine all possible canonical forms
    for that segment, assuming that both sequential voicing and sound
    euphony were possible (i.e. that the segment had both left and right
    context).
    """
    table = kana_table.KanaTable.get_cached()
    variants = set([segment])

    segment_script = scripts.script_type(segment)
    sokuon = _sokuon_map.get(segment_script, None)
    onbin = _onbin_map.get(segment_script, None)
    if sokuon is None:
        raise ValueError('Unsupported script type. '
                'Segments must be hiragana or katakana')

    if right_context and len(segment) > 1 and segment.endswith(sokuon):
        # A trailing sokuon may have come from euphony: restore each
        # possible onbin ending.
        for ending in onbin:
            variants.add(segment[:-1] + ending)

    if left_context and table.is_voiced(segment[0]):
        # A voiced first kana may be sequential voicing: add the devoiced
        # counterpart of every variant gathered so far.
        devoiced = [from_voiced[v[0]] + v[1:] for v in variants]
        variants.update(devoiced)

    return variants
def parseWordToKanji(self):
    """Build an HTML summary for every distinct kanji in the current
    word-info text: the kanji, its kun/on readings and glosses from
    self.kdict.  Returns the HTML string without a trailing line break.
    """
    clusters = scripts.script_boundaries(self.itemsMenu.wordInfo.text())
    components = u''
    kanjiList = []
    for cluster in clusters:
        if scripts.script_type(cluster) != scripts.Script.Kanji:
            continue
        for kanji in cluster:
            if kanji in kanjiList:
                continue  # each kanji is summarised only once
            kanjiList.append(kanji)
            try:
                lookup = self.kdict[kanji]
                kun = lookup.kun_readings
                on = lookup.on_readings
                gloss = lookup.gloss
                components += '<b>(' + kanji + ')</b>\t'
                #components += '(' + kanji + ')\t'
                if len(kun) > 0:
                    components += '<b>kun:</b>' + ', '.join(kun) + '\t'
                if len(on) > 0:
                    components += '<b>on:</b>' + ', '.join(on) + '<br/>'
                if len(gloss) > 0:
                    components += "<font style='font-family: Calibri; font-size: 11pt'>" + ", ".join(gloss) + "</font><br/>"
            except Exception:
                # Dictionary miss: fall back to the bare kanji.
                components += kanji + '<br/>'
    # Strip one trailing '<br/>' tag.  The original rstrip('<br/>')
    # stripped the *character set* {<,b,r,/,>} and could also eat the
    # closing '>' of a trailing '</font>'.
    if components.endswith('<br/>'):
        components = components[:-len('<br/>')]
    return components
def sample_seq_n_uniform(self, condition_segments, n, exclude_set=None):
    """Sample (at least) n segment sequences for *condition_segments*,
    replacing each kanji segment with symbols drawn in random database
    order; non-kanji segments pass through unchanged.  Sequences whose
    flattened string is in *exclude_set* are skipped, and accepted ones
    are added to the set to prevent duplicates.
    """
    # XXX Note: potential for infinite recursion if not enough candidates
    # are available. Much less likely to hit this case than non-uniform
    # sampling.
    exclude_set = set(exclude_set or [])
    results = []
    kanji_script = scripts.Script.Kanji
    while len(results) < n:
        result_seg_sets = []
        for segment in condition_segments:
            if scripts.script_type(segment) == kanji_script:
                # Up to n random replacement symbols for this segment.
                result_seg_sets.append([
                    o['symbol']
                    for o in self.density.filter(condition=segment).
                    order_by('?').values('symbol')[:n]
                ])
            else:
                # Fixed segment, repeated so zip() pairs it with each draw.
                result_seg_sets.append([segment] * n)
        for result_segs in zip(*result_seg_sets):
            flat_result = ''.join(result_segs)
            if flat_result not in exclude_set:
                results.append(result_segs)
                exclude_set.add(flat_result)
    return results
def lookup(self, query):
    """Look up *query* in edict, honouring configuration.

    When ignore_kana is on, queries written entirely in a single kana
    script are skipped.  When ignore_duplicates is on, a word already in
    self.stats yields None.  Failed lookups are recorded in self.missed.
    Returns the edict entry, or None.
    """
    found = None
    if self.config.ignore_kana():
        # Was `is 1` / `is scripts.Script.X`: identity comparison only
        # works by accident (small-int/singleton caching); use equality.
        if len(scripts.script_types(query)) == 1:
            if scripts.script_type(query) in (scripts.Script.Hiragana,
                                              scripts.Script.Katakana):
                return found
    try:
        found = self.edict[query]
        if self.config.ignore_duplicates():
            if found.word in self.stats:
                found = None
            else:
                self.stats.append(found.word)
    except KeyError:
        if query not in self.missed:
            self.missed.append(query)
    finally:
        # NOTE(review): returning from `finally` suppresses any exception
        # other than KeyError raised above; kept to preserve behaviour.
        return found
def onbin_variants(kana_segment):
    """Determine the sound euphony variants of a kana segment."""
    variants = set([kana_segment])
    if len(kana_segment) > 1:
        # Multi-kana segments also admit the sokuon-final euphonic form.
        segment_script = scripts.script_type(kana_segment)
        sokuon = _sokuon_map[segment_script]
        variants.add(kana_segment[:-1] + sokuon)
    return variants
def lookup(self, query):
    """Look up *query* in edict.

    Pure single-script kana queries are skipped when ignore_kana is set;
    duplicate words return None when ignore_duplicates is set.  Misses are
    recorded in self.missed.  Returns the edict entry or None.
    """
    found = None
    if self.config.ignore_kana():
        # Fixed `is 1` -> `== 1` and `is Script.X` -> equality/membership:
        # `is` tests identity and only matched by CPython caching accident.
        if len(scripts.script_types(query)) == 1:
            if scripts.script_type(query) in (scripts.Script.Hiragana,
                                              scripts.Script.Katakana):
                return found
    try:
        found = self.edict[query]
        if self.config.ignore_duplicates():
            if found.word in self.stats:
                found = None
            else:
                self.stats.append(found.word)
    except KeyError:
        if query not in self.missed:
            self.missed.append(query)
    finally:
        # NOTE(review): the `return` in finally swallows any non-KeyError
        # exception from the try block; preserved as-is.
        return found
def sample_seq_n(self, condition_segments, n, exclude_set=None):
    """Sample n sequences for the given condition segments.

    Kanji segments are replaced by their conditional probability
    distributions; every other segment is kept verbatim.  Delegates the
    actual sampling to SeqDist.sample_n.
    """
    component_dists = []
    for seg in condition_segments:
        if scripts.script_type(seg) != scripts.Script.Kanji:
            component_dists.append(seg)
            continue
        query_set = self.density.filter(condition=seg)
        component_dists.append(ProbDist.from_query_set(query_set))
    return SeqDist(*component_dists).sample_n(n, exclude_set)
def sample_seq_n(self, condition_segments, n, exclude_set=None):
    """Sample n sequences over *condition_segments*: kanji segments become
    conditional probability distributions, others stay literal, and the
    joint SeqDist performs the sampling.
    """
    kanji = scripts.Script.Kanji
    dists = [
        ProbDist.from_query_set(self.density.filter(condition=segment))
        if scripts.script_type(segment) == kanji else segment
        for segment in condition_segments
    ]
    return SeqDist(*dists).sample_n(n, exclude_set)
def _get_kanji():
    """Fetches our canonical list of kanji to work with.

    The set is read once from settings.STROKE_SOURCE and memoised on the
    function object; later calls return the cached set.
    """
    if not hasattr(_get_kanji, '_cached'):
        kanji_set = set()
        with codecs.open(settings.STROKE_SOURCE, 'r', 'utf8') as istream:
            for line in istream:
                kanji, rest = line.split()
                # check for a kanji or hanzi; our Chinese data extends into
                # the E000-F8FF private use block, so an "Unknown" script is
                # ok too.  Raise explicitly: the original `assert` is
                # silently stripped under `python -O`.
                if not (len(kanji) == 1 and scripts.script_type(kanji) in
                        (scripts.Script.Kanji, scripts.Script.Unknown)):
                    raise ValueError('unexpected kanji entry: %r' % kanji)
                kanji_set.add(kanji)
        _get_kanji._cached = kanji_set
    return _get_kanji._cached
def process_response(self, request, response):
    """Middleware hook: wrap every Japanese-script run in successful HTML
    responses with a lang="ja" span so browsers pick Japanese fonts.
    Non-HTML, non-200 and Japanese-free responses pass through untouched.
    """
    if response.status_code != 200:
        return response
    if not response.get('Content-Type', '').startswith('text/html'):
        return response

    content = response.content.decode('utf8')
    present = scripts.script_types(content)
    if not present.intersection(self.japanese_scripts):
        # No Japanese on this page at all.
        return response

    annotated = []
    for run in scripts.script_boundaries(content):
        if scripts.script_type(run) in self.japanese_scripts:
            annotated.append('<span lang="ja" xml:lang="ja">%s</span>' % run)
        else:
            annotated.append(run)
    response.content = u''.join(annotated).encode('utf8')
    return response
def expand_long_vowels(kana_string):
    """
    Expands whatever long vowels are possible to expand.

    >>> a = expand_long_vowels(u'すー')
    >>> b = u'すう'
    >>> a == b
    True
    """
    # Converters back out of hiragana, keyed by the segment's original
    # script; any other script is copied through untouched.
    script_converters = {
        scripts.Script.Hiragana: lambda x: x,
        scripts.Script.Katakana: scripts.to_katakana
    }
    table = kana_table.KanaTable.get_cached()
    out_string = ''
    for segment in scripts.script_boundaries(kana_string):
        if len(segment):
            char_type = scripts.script_type(segment)
            if char_type not in script_converters:
                # Not kana: pass through unchanged.
                out_string += segment
                continue
            reverse_operation = script_converters[char_type]
            # Normalise to hiragana for the replacement pass.
            segment = scripts.to_hiragana(segment)
        else:
            continue
        for m in _long_finder.finditer(segment):
            # Replace each long-vowel mark with the vowel line of the
            # preceding kana.  Replacement is one char for one char, so
            # match offsets from the iterated string remain valid even
            # though `segment` is rebound each pass.
            # NOTE(review): a mark at position 0 would read segment[-1];
            # presumably script_boundaries never yields such a segment --
            # confirm.
            i = m.start()
            vowel = table.to_vowel_line(segment[i - 1])
            segment = segment[:i] + vowel + segment[i + 1:]
        out_string += reverse_operation(segment)
    return out_string
def expand_long_vowels(kana_string):
    """
    Expands whatever long vowels are possible to expand.

    >>> a = expand_long_vowels(u'すー')
    >>> b = u'すう'
    >>> a == b
    True
    """
    converters = {
        scripts.Script.Hiragana: lambda s: s,
        scripts.Script.Katakana: scripts.to_katakana,
    }
    table = kana_table.KanaTable.get_cached()
    pieces = []
    for segment in scripts.script_boundaries(kana_string):
        if not len(segment):
            continue
        restore = converters.get(scripts.script_type(segment))
        if restore is None:
            # Not kana -- copy through untouched.
            pieces.append(segment)
            continue
        # Normalise to hiragana, rewrite, then convert back if needed.
        working = scripts.to_hiragana(segment)
        # Each replacement swaps exactly one character, so the match
        # offsets collected up front stay valid while we rewrite.
        positions = [m.start() for m in _long_finder.finditer(working)]
        for pos in positions:
            vowel = table.to_vowel_line(working[pos - 1])
            working = working[:pos] + vowel + working[pos + 1:]
        pieces.append(restore(working))
    return ''.join(pieces)
def sift_nonj_characters(data, plain):
    """Remove from *data* every ASCII script run found in *plain*.

    Returns the filtered copy of *data*; neither argument is mutated.
    """
    parts = scripts.script_boundaries(plain)
    for part in parts:
        # Was `is scripts.Script.Ascii`: identity comparison only matched
        # by interning accident; equality is the correct test.
        if scripts.script_type(part) == scripts.Script.Ascii:
            data = data.replace(part, '')
    return data
def _is_kanji(kanji):
    """Return True when *kanji* is a single unicode kanji character."""
    if not isinstance(kanji, unicode):
        return False
    if len(kanji) != 1:
        return False
    return scripts.script_type(kanji) == scripts.Script.Kanji
def eventFilter(self, object, event):
    """Qt event filter for word labels.

    Hover enter/leave toggles highlighting and a quick radical summary;
    mouse buttons open popups: middle = similar-kanji groups, right =
    per-kanji readings/glosses, left = stroke-order animations plus a
    translation.  Always returns False so the event keeps propagating.

    NOTE(review): reconstructed from a collapsed source line -- diff
    against VCS history before merging.
    """
    if event.type() == QEvent.HoverLeave:
        # Restore label colour, hide every popup, re-dock the info window.
        object.setStyleSheet("QLabel { color: rgb(0, 0, 0); }")
        object.parent().info.hide()
        object.parent().allInfo.hide()
        object.parent().kanjiInfo.hide()
        object.parent().kanjiGroups.hide()
        desktop = QApplication.desktop().screenGeometry()
        object.parent().info.setGeometry(
            QRect(desktop.width() - H_INDENT - I_WIDTH - I_INDENT,
                  desktop.height() - V_INDENT, I_WIDTH, I_HEIGHT))
    if event.type() == QEvent.HoverEnter:
        object.setStyleSheet("QLabel { color: rgb(0, 5, 255); }")
        object.parent().info.item.setText(object.text())
        reading = object.parent().srs.getWordPronunciationFromExample(object.text())
        if reading != object.text():
            object.parent().info.reading.setText(reading)
        else:
            object.parent().info.reading.setText(u'')
        # parsing word: collect radicals for every kanji in the label
        script = scripts.script_boundaries(object.text())
        components = []
        for cluster in script:
            if scripts.script_type(cluster) == scripts.Script.Kanji:
                for kanji in cluster:
                    components = components + list(object.parent().rdk[kanji]) + list('\n')
        # setting radikals
        if len(components) > 0:
            components.pop()  # remove last '\n'
        object.parent().info.components.setText(' '.join(components))
        object.parent().info.show()
    if event.type() == QEvent.MouseButtonPress:
        # item context menu #
        if event.button() == Qt.MiddleButton:
            object.parent().info.hide()
            object.parent().allInfo.hide()
            object.parent().kanjiInfo.hide()
            script = scripts.script_boundaries(object.text())
            resulting_info = u''
            # kanji_groups = {}
            kanji_groups = OrderedDict()
            for cluster in script:
                if scripts.script_type(cluster) == scripts.Script.Kanji:
                    # for kanji in cluster[::-1]:
                    for kanji in cluster:
                        similar = object.parent().groups.findSimilarKanji(kanji)
                        try:
                            # Drop the kanji itself from its own group.
                            kanji_groups[kanji] = similar[:similar.index(kanji)] + similar[similar.index(kanji) + 1:]
                        except Exception:
                            kanji_groups[kanji] = object.parent().groups.findSimilarKanji(kanji)
                            log.debug(u'Not in group: ' + kanji)
            for kanji in kanji_groups:
                # for kanji in list(reversed(sorted(kanji_groups.keys()))):
                resulting_info += kanji + u' ~\t'
                for item in kanji_groups[kanji]:
                    lookup = object.parent().kjd[item]
                    resulting_info += " " + item + " <font style='font-family: Calibri; font-size: 12pt'>(" + lookup.gloss[0] + ")</font> "
                resulting_info += '<br/>'
            if resulting_info == u'':
                resulting_info = u'No such groups in Kanji.Odyssey!'
            object.parent().kanjiGroups.info.setText(resulting_info)
            object.parent().kanjiGroups.show()
        # kanji info #
        if event.button() == Qt.RightButton:
            object.parent().info.hide()
            object.parent().allInfo.hide()
            object.parent().kanjiGroups.hide()
            object.parent().kanjiInfo.info.setText(u'')
            script = scripts.script_boundaries(object.text())
            resulting_info = u''
            for cluster in script:
                if scripts.script_type(cluster) == scripts.Script.Kanji:
                    for kanji in cluster:
                        try:
                            lookup = object.parent().kjd[kanji]
                            kun = lookup.kun_readings
                            on = lookup.on_readings
                            gloss = lookup.gloss
                            resulting_info += "<font style='font-family: " + Fonts.HiragiNoMyoutyouProW3 + "; font-size: 16.5pt'>(" + kanji + ")</font>\t"
                            if len(kun) > 0:
                                resulting_info += '<b>kun: </b>' + ', '.join(kun) + '\t'
                            if len(on) > 0:
                                resulting_info += '<b>on:</b>' + ', '.join(on) + '<br/>'
                            if len(gloss) > 0:
                                resulting_info += "<font style='font-family: Calibri; font-size: 12pt'>" + ", ".join(gloss) + "</font><br/>"
                        except Exception:
                            # BUGFIX: was `components += ...`, a NameError --
                            # `components` is never defined on this path.
                            resulting_info += kanji + '<br/>'
            if resulting_info != '':
                if resulting_info.count('<br/>') > 7:
                    object.parent().kanjiInfo.setStyleSheet('QLabel { font-size: 13pt }')
                # BUGFIX: rstrip('<br/>') strips the character *set* and
                # could also eat the '>' closing a trailing '</font>'.
                if resulting_info.endswith('<br/>'):
                    resulting_info = resulting_info[:-len('<br/>')]
                object.parent().kanjiInfo.info.setText(resulting_info)
            else:
                object.parent().kanjiInfo.info.setText(u'No such kanji in kanjidic!')
            object.parent().kanjiInfo.show()
        # translation and strokes info #
        if event.button() == Qt.LeftButton:
            object.parent().kanjiInfo.hide()
            object.parent().info.hide()
            object.parent().kanjiGroups.hide()
            unfillLayout(object.parent().allInfo.layout)
            object.parent().allInfo.layout.setMargin(1)
            kanjiList = []
            script = scripts.script_boundaries(object.text())
            for cluster in script:
                if scripts.script_type(cluster) == scripts.Script.Kanji:
                    for kanji in cluster:
                        kanjiList.append(kanji)
            i = 0
            j = 0
            # kanji strokes: one animated gif per kanji with stroke data
            if len(kanjiList) > 0:
                infile = open(PATH_TO_RES + STROKES + KANJI_MANIFEST, 'r')
                text = infile.read()
                infile.close()
                for kanji in kanjiList:
                    if text.find(kanji.encode('utf-8').encode('hex')) != -1:
                        gif = QLabel()
                        gif.setSizePolicy(QSizePolicy.Expanding, QSizePolicy.Expanding)
                        gif.setAlignment(Qt.AlignCenter)
                        movie = QMovie(PATH_TO_RES + STROKES + kanji.encode('utf-8').encode('hex') + '.gif', QByteArray(), self)
                        movie.setCacheMode(QMovie.CacheAll)
                        movie.setSpeed(150)
                        gif.setMovie(movie)
                        object.parent().allInfo.layout.addWidget(gif, i, j)
                        j = j + 1
                        movie.start()
                i = i + 1
            # words translation
            translations = QLabel(u'')
            translations.setFont(QFont('Calibri', 11))
            translations.setWordWrap(True)
            translations.setAlignment(Qt.AlignCenter)
            try:
                # Preferred source: edict entry for the non-inflected form.
                search = object.parent().edict[object.parent().srs.getWordNonInflectedForm(object.text())]
                translationText = u''
                variants = search.senses_by_reading()[object.parent().srs.getWordPronounciation(object.parent().srs.getWordNonInflectedForm(object.text()))][:3]
                variants = filter(lambda e: e != '(P)', variants)
                translationText += '<b>' + object.parent().srs.getWordPronunciationFromExample(object.text()) + '</b>:\t' + ', '.join(variants)
                translations.setText(translationText.rstrip('\n'))
            except Exception:
                ### by reading
                search = object.parent().jmdict.lookupTranslationByReadingJoin(object.parent().srs.getWordPronounciation(object.parent().srs.getWordNonInflectedForm(object.text())), object.parent().options.getLookupLang())
                if len(search) > 0:
                    if len(search) > 5:
                        search = search[:5]
                    translations.setText('<b>' + object.parent().srs.getWordPronunciationFromExample(object.text()) + '</b>:\t' + ', '.join(search))
                ### by kanji
                else:
                    search = object.parent().jmdict.lookupItemByReading(object.parent().srs.getWordPronounciation(object.parent().srs.getWordNonInflectedForm(object.text())))
                    if len(search) > 0:
                        lookup = object.parent().jmdict.lookupItemTranslationJoin(search[0], object.parent().options.getLookupLang())
                        if len(lookup) > 5:
                            lookup = lookup[:5]
                        translations.setText('<b>' + object.parent().srs.getWordPronunciationFromExample(object.text()) + '</b>:\t' + ', '.join(lookup))
                ### nothing found
                if len(search) == 0:
                    translations.setText(u'Alas, no translation in edict or jmdict!')
            if i > 0:
                separator = QFrame()
                separator.setFrameShape(QFrame.HLine)
                separator.setFrameShadow(QFrame.Sunken)
                object.parent().allInfo.layout.addWidget(separator, i, 0, 1, j)
                i = i + 1
            object.parent().allInfo.layout.addWidget(translations, i, 0, 1, j)
            object.parent().allInfo.update()
            object.parent().allInfo.show()
        elif object.parent().allInfo.isVisible():
            # Any other button while the big popup is open: collapse it.
            object.parent().allInfo.hide()
            object.parent().info.show()
    return False
def addKanjiToStudy(self):
    """Add every kanji appearing in the current word-info text to the
    study database.
    """
    clusters = scripts.script_boundaries(self.itemsMenu.wordInfo.text())
    for cluster in clusters:
        if scripts.script_type(cluster) != scripts.Script.Kanji:
            continue
        for kanji in cluster:
            self.db.addKanjiToDb(kanji)
def test_script_type_empty(self):
    """The empty string belongs to no script: its type is Unknown."""
    self.assertEqual(scripts.script_type(''), scripts.Script.Unknown)