def translate(self, text):
    """Convert Japanese text into its katakana reading (Python 2).

    Walks the MeCab node list; runs of numeric nodes (POS sub-category
    u'数') are merged, converted via self._ja2int() and rendered by
    self._translate_int(); all other nodes go through
    self._translate_node().  Long-vowel and sokuon marks are expanded
    at the end by the re_long / re_ltu substitutions.

    :param text: unicode or UTF-8 encoded str.
    :return: unicode reading.
    """
    if isinstance(text, unicode):
        text = text.encode('utf-8')
    node = self._tagger.parseToNode(text)
    result = []
    while node:
        # stat >= 2 marks BOS/EOS sentinel nodes -- nothing to read.
        if node.stat >= 2:
            node = node.next
            continue
        surface = node.surface.decode(u'utf-8')
        yomi = surface
        features = node.feature.decode('utf-8').split(',')
        if node.stat == 0:
            # Known dictionary word: feature field 7 holds the kana
            # reading (IPAdic layout -- TODO confirm for other dicts).
            yomi = features[7]
        yomi = jcconv.hira2kata(yomi)
        if features[1] == u'数':
            # Merge the whole run of numeric nodes into one string.
            number = u''
            while node and node.stat < 2:
                surface = node.surface.decode(u'utf-8')
                features = node.feature.decode('utf-8').split(',')
                if features[1] != u'数':
                    break
                number += surface
                node = node.next
            number = self._ja2int(number)
            result.append(self._translate_int(number))
            # BUG FIX: `node` already points at the first non-numeric
            # node here; the old code advanced once more and silently
            # dropped that node.  Re-enter the loop without advancing.
            continue
        else:
            result.append(self._translate_node(surface, yomi, features))
        node = node.next
    # Expand long-vowel and sokuon (geminate) markers.
    text = u''.join(result)
    text = self.re_long.sub(u'\\1\\1', text)
    text = self.re_ltu.sub(u'\\1\\1', text)
    return text
def kana_minus_dakuten(char):
    """Strip a dakuten/handakuten from a single kana character.

    The lookup table is keyed on hiragana, so katakana input is mapped
    through its hiragana form and converted back.  Characters without a
    voiced counterpart are returned unchanged.
    """
    if not is_katakana(char):
        return __by_dakuten.get(char, char)
    hira_form = kata2hira(char)
    return hira2kata(__by_dakuten.get(hira_form, hira_form))
def inject_furigana(text):
    '''Returns 2-tuple of (text_with_furigana, furigana_positions).

    text_with_furigana marks each annotated word as |surface《reading》
    (Aozora Bunko style); furigana_positions lists (start, end, reading)
    tuples whose offsets index into the furigana-free text, with any
    trailing okurigana excluded from the annotated span.
    '''
    furigana_positions = []
    injected_text = []
    remaining_text = text
    current_offset = 0
    # NOTE(review): the empty parse() looks like the known mecab-python
    # workaround for corrupted node surfaces -- see link; confirm before
    # removing.
    # https://runble1.com/python-mecab-morphological-analysis/
    tagger.parse('')
    node = tagger.parseToNode(text.encode('utf8'))
    while node:
        surface = node.surface.decode('utf8')
        # Add any skipped text (characters MeCab did not tokenize).
        node_index_in_remaining_text = remaining_text.find(surface)
        injected_text.append(remaining_text[:node_index_in_remaining_text])
        remaining_text = (remaining_text[node_index_in_remaining_text + len(surface):])
        reading = _reading(node)
        # Skip text?  No reading, or the reading adds nothing because the
        # surface is already the same kana (hiragana or katakana).
        if (reading is None or reading == surface or jcconv.hira2kata(reading) == surface):
            injected_text.append(surface)
            current_offset += node_index_in_remaining_text + len(surface)
            node = node.next
            continue
        # Trim the common kana tail (okurigana) shared by surface and
        # reading so only the kanji stem is annotated; the tail is
        # re-emitted after the 《reading》.
        suffix = ''
        redundant_length = 0
        for surface_char, reading_char in zip(
            reversed(surface),
            reversed(reading),
        ):
            if surface_char != reading_char:
                break
            redundant_length += 1
        if redundant_length > 0:
            reading = reading[:-redundant_length]
            suffix = surface[-redundant_length:]
            surface = surface[:-redundant_length]
        injected_text.append(u'|{}《{}》{}'.format(surface, reading, suffix))
        current_offset += node_index_in_remaining_text
        furigana_positions.append(
            (current_offset, current_offset + len(surface), reading))
        current_offset += len(surface) + len(suffix)
        node = node.next
    # Flush whatever trailed the final MeCab node.
    injected_text.append(remaining_text)
    return (u''.join(injected_text), furigana_positions)
def char_to_base_vowel(char):
    """Return the base-vowel kana for *char*.

    The character is first stripped of any dakuten, then looked up in
    the __to_vowels table -- first as-is, then in its katakana form.

    :raises ValueError: if the character has no vowel mapping.
    """
    char = kana_minus_dakuten(char)
    translated = __to_vowels.get(char, False) or __to_vowels.get(
        hira2kata(char), False)
    if translated is False:
        # BUG FIX: was a bare Exception with no context.  ValueError is
        # a subclass of Exception, so existing handlers still match,
        # and the message now names the offending character.
        raise ValueError(u"Can't convert {!r} to a base vowel".format(char))
    return translated
def inject_furigana(text):
    '''Returns 2-tuple of (text_with_furigana, furigana_positions).

    text_with_furigana marks each annotated word as |surface《reading》
    (Aozora Bunko style); furigana_positions lists (start, end, reading)
    tuples whose offsets index into the furigana-free text, with any
    trailing okurigana excluded from the annotated span.
    '''
    furigana_positions = []
    injected_text = []
    remaining_text = text
    current_offset = 0
    # NOTE(review): the empty parse() looks like the known mecab-python
    # workaround for corrupted node surfaces -- see link; confirm before
    # removing.
    # https://runble1.com/python-mecab-morphological-analysis/
    tagger.parse('')
    node = tagger.parseToNode(text.encode('utf8'))
    while node:
        surface = node.surface.decode('utf8')
        # Add any skipped text (characters MeCab did not tokenize).
        node_index_in_remaining_text = remaining_text.find(surface)
        injected_text.append(remaining_text[:node_index_in_remaining_text])
        remaining_text = (
            remaining_text[node_index_in_remaining_text + len(surface):])
        reading = _reading(node)
        # Skip text?  No reading, or the reading adds nothing because the
        # surface is already the same kana (hiragana or katakana).
        if (reading is None or reading == surface or jcconv.hira2kata(reading) == surface):
            injected_text.append(surface)
            current_offset += node_index_in_remaining_text + len(surface)
            node = node.next
            continue
        # Trim the common kana tail (okurigana) shared by surface and
        # reading so only the kanji stem is annotated; the tail is
        # re-emitted after the 《reading》.
        suffix = ''
        redundant_length = 0
        for surface_char, reading_char in zip(
            reversed(surface),
            reversed(reading),
        ):
            if surface_char != reading_char:
                break
            redundant_length += 1
        if redundant_length > 0:
            reading = reading[:-redundant_length]
            suffix = surface[-redundant_length:]
            surface = surface[:-redundant_length]
        injected_text.append(u'|{}《{}》{}'.format(surface, reading, suffix))
        current_offset += node_index_in_remaining_text
        furigana_positions.append((current_offset, current_offset + len(surface), reading))
        current_offset += len(surface) + len(suffix)
        node = node.next
    # Flush whatever trailed the final MeCab node.
    injected_text.append(remaining_text)
    return (u''.join(injected_text), furigana_positions)
def char_to_base_vowel(char):
    """Map a kana character (minus any dakuten) to its base vowel kana."""
    stripped = kana_minus_dakuten(char)
    # Try the character as-is first; fall back to its katakana form.
    translated = __to_vowels.get(stripped, False)
    if not translated:
        translated = __to_vowels.get(hira2kata(stripped), False)
    if translated is False:
        raise Exception(u"Can't convert")
    return translated
def kana_plus_mini(char):
    """Yield *char* followed by its small-kana (mini) variants.

    Variants are looked up on the hiragana form; katakana input gets
    its variants converted back to katakana.
    """
    yield char
    katakana_input = is_katakana(char)
    lookup_key = kata2hira(char) if katakana_input else char
    for mini in __to_mini.get(lookup_key, ''):
        if katakana_input:
            yield hira2kata(mini)
        else:
            yield mini
def encode_katakana(text):
    """I don't think this quite works yet."""
    out = []
    for ch in text:
        if jcconv:
            # Best effort: hiragana -> katakana -> half-width katakana.
            # TODO: "the conversion may result in multiple characters"
            # When? What should we do about it?
            ch = jcconv.kata2half(jcconv.hira2kata(ch))
        if ch in TXT_ENC_KATAKANA_MAP:
            out.append(TXT_ENC_KATAKANA_MAP[ch])
        # Characters outside the map are silently dropped (deliberate
        # best-effort behavior of the original).
    return b"".join(out)
def lookupItemByReading(self, item):
    '''Look up words (kanji/kana) by their katakana or hiragana reading.'''
    results = []
    matches = self.db.r_ele.filter(self.db.r_ele.value == item).all()
    if matches:
        # Direct hit: collect the kanji spellings linked to each reading.
        for entry in matches:
            for word in self.db.k_ele.filter(self.db.k_ele.fk == entry.fk).all():
                results.append(word.value)
    else:
        # No hit -- retry with the katakana form and return the sibling
        # reading elements instead.
        matches = self.db.r_ele.filter(self.db.r_ele.value == hira2kata(item)).all()
        for entry in matches:
            for reading in self.db.r_ele.filter(self.db.r_ele.fk == entry.fk).all():
                results.append(reading.value)
    return results
def _yomi(self, text):
    """Return the katakana reading of *text* using MeCab (Python 2)."""
    if isinstance(text, unicode):
        text = text.encode("utf-8")
    parts = []
    node = self._tagger.parseToNode(text)
    while node:
        # stat >= 2 marks BOS/EOS sentinel nodes; skip them.
        if node.stat < 2:
            if node.stat == 0:
                # Known word: the reading lives in feature field 7.
                parts.append(node.feature.split(",")[7])
            else:
                # Unknown word: fall back to the surface form.
                parts.append(node.surface)
        node = node.next
    return jcconv.hira2kata("".join(parts).decode("utf-8"))
def looseLookupByReadingJoin(self, item, pre=False, post=True):
    '''Loose (SQL LIKE) lookup of kanji spellings by reading.

    Quite slow, but faster than without the join.

    :param item: reading to search for.
    :param pre: prepend a wildcard (suffix match); takes precedence.
    :param post: append a wildcard (prefix match).
    :return: de-duplicated list of kanji-element values.
    '''
    if pre:
        query = u'%' + item
    elif post:
        query = item + u'%'
    else:
        # BUG FIX: with pre=False and post=False the original raised
        # UnboundLocalError; fall back to an exact, wildcard-free match.
        query = item
    join = self.db.join(self.db.k_ele, self.db.r_ele,
                        self.db.k_ele.fk == self.db.r_ele.fk, isouter=True)
    table = self.db.with_labels(join)
    lookup = table.filter(table.r_ele_value.like(query)).all()
    result = []
    if len(lookup) > 0:
        for item in lookup:
            result.append(item.k_ele_value)
    else:
        # No hiragana hit -- retry with the katakana form of the pattern.
        lookup = table.filter(table.r_ele_value.like(hira2kata(query))).all()
        for item in lookup:
            result.append(item.k_ele_value)
    return removeDuplicates(result)
def encode_char(char):
    """ Encodes a single utf-8 character into a sequence of 
    esc-pos code page change instructions and character declarations """
    char_utf8 = char.encode('utf-8')
    encoded = ''
    encoding = self.encoding # we reuse the last encoding to prevent code page switches at every character
    encodings = {
            # TODO use ordering to prevent useless switches
            # TODO Support other encodings not natively supported by python ( Thai, Khazakh, Kanjis )
            'cp437': TXT_ENC_PC437,
            'cp850': TXT_ENC_PC850,
            'cp852': TXT_ENC_PC852,
            'cp857': TXT_ENC_PC857,
            'cp858': TXT_ENC_PC858,
            'cp860': TXT_ENC_PC860,
            'cp863': TXT_ENC_PC863,
            'cp865': TXT_ENC_PC865,
            'cp866': TXT_ENC_PC866,
            'cp862': TXT_ENC_PC862,
            'cp720': TXT_ENC_PC720,
            'iso8859_2': TXT_ENC_8859_2,
            'iso8859_7': TXT_ENC_8859_7,
            'iso8859_9': TXT_ENC_8859_9,
            'cp1254' : TXT_ENC_WPC1254,
            'cp1255' : TXT_ENC_WPC1255,
            'cp1256' : TXT_ENC_WPC1256,
            'cp1257' : TXT_ENC_WPC1257,
            'cp1258' : TXT_ENC_WPC1258,
            'katakana' : TXT_ENC_KATAKANA,
        }
    # Candidate code pages not yet tried for this character.
    remaining = copy.copy(encodings)

    if not encoding :
        encoding = 'cp437'

    while True: # Trying all encoding until one succeeds
        try:
            if encoding == 'katakana': # Japanese characters
                if jcconv:
                    # try to convert japanese text to a half-katakanas 
                    kata = jcconv.kata2half(jcconv.hira2kata(char_utf8))
                    if kata != char_utf8:
                        self.extra_chars += len(kata.decode('utf-8')) - 1
                        # the conversion may result in multiple characters
                        return encode_str(kata.decode('utf-8'))
                else:
                    kata = char_utf8

                if kata in TXT_ENC_KATAKANA_MAP:
                    encoded = TXT_ENC_KATAKANA_MAP[kata]
                    break
                else:
                    # Not representable as katakana either; try the next
                    # code page via the except handler below.
                    raise ValueError()
            else:
                encoded = char.encode(encoding)
                break

        except ValueError: #the encoding failed, select another one and retry
            if encoding in remaining:
                del remaining[encoding]
            if len(remaining) >= 1:
                # NOTE(review): dict.items()[0] only works on Python 2;
                # pick order is arbitrary.
                encoding = remaining.items()[0][0]
            else:
                encoding = 'cp437'
                encoded = '\xb1'    # could not encode, output error character
                break;

    if encoding != self.encoding:
        # if the encoding changed, remember it and prefix the character with
        # the esc-pos encoding change sequence
        self.encoding = encoding
        encoded = encodings[encoding] + encoded

    return encoded
def encode_char(char):
    """ Encodes a single utf-8 character into a sequence of 
    esc-pos code page change instructions and character declarations """
    char_utf8 = char.encode('utf-8')
    encoded = ''
    encoding = self.encoding # we reuse the last encoding to prevent code page switches at every character
    encodings = {
            # TODO use ordering to prevent useless switches
            # TODO Support other encodings not natively supported by python ( Thai, Khazakh, Kanjis )
            'cp437': TXT_ENC_PC437,
            'cp850': TXT_ENC_PC850,
            'cp852': TXT_ENC_PC852,
            'cp857': TXT_ENC_PC857,
            'cp858': TXT_ENC_PC858,
            'cp860': TXT_ENC_PC860,
            'cp863': TXT_ENC_PC863,
            'cp865': TXT_ENC_PC865,
            'cp1251': TXT_ENC_WPC1251,  # win-1251 covers more cyrillic symbols than cp866
            'cp866': TXT_ENC_PC866,
            'cp862': TXT_ENC_PC862,
            'cp720': TXT_ENC_PC720,
            'cp936': TXT_ENC_PC936,
            'iso8859_2': TXT_ENC_8859_2,
            'iso8859_7': TXT_ENC_8859_7,
            'iso8859_9': TXT_ENC_8859_9,
            'cp1254': TXT_ENC_WPC1254,
            'cp1255': TXT_ENC_WPC1255,
            'cp1256': TXT_ENC_WPC1256,
            'cp1257': TXT_ENC_WPC1257,
            'cp1258': TXT_ENC_WPC1258,
            'katakana': TXT_ENC_KATAKANA,
        }
    # Candidate code pages not yet tried for this character.
    remaining = copy.copy(encodings)

    if not encoding:
        encoding = 'cp437'

    while True: # Trying all encoding until one succeeds
        try:
            if encoding == 'katakana': # Japanese characters
                if jcconv:
                    # try to convert japanese text to a half-katakanas
                    kata = jcconv.kata2half(
                        jcconv.hira2kata(char_utf8))
                    if kata != char_utf8:
                        self.extra_chars += len(
                            kata.decode('utf-8')) - 1
                        # the conversion may result in multiple characters
                        return encode_str(kata.decode('utf-8'))
                else:
                    kata = char_utf8

                if kata in TXT_ENC_KATAKANA_MAP:
                    encoded = TXT_ENC_KATAKANA_MAP[kata]
                    break
                else:
                    # Not representable as katakana either; try the next
                    # code page via the except handler below.
                    raise ValueError()
            else:
                # First 127 symbols are covered by cp437.
                # Extended range is covered by different encodings.
                encoded = char.encode(encoding)
                if ord(encoded) <= 127:
                    encoding = 'cp437'
                break
        except (UnicodeEncodeError, UnicodeWarning, TypeError, ValueError):
            #the encoding failed, select another one and retry
            if encoding in remaining:
                del remaining[encoding]
            if len(remaining) >= 1:
                # Arbitrary next candidate (dict order).
                (encoding, _) = remaining.popitem()
            else:
                encoding = 'cp437'
                encoded = b'\xb1'    # could not encode, output error character
                break

    if encoding != self.encoding:
        # if the encoding changed, remember it and prefix the character with
        # the esc-pos encoding change sequence
        self.encoding = encoding
        encoded = bytes(encodings[encoding], 'utf-8') + encoded

    return encoded
def updateLookupResults(self, query):
    """Refresh the lookup-results table for *query*.

    Searches the dictionary selected in comboDictionary ('edict' or
    'jmdict'), honoring the loose / pre-post / inline UI switches, and
    repopulates self.lookupResults with the hits.
    """
    self.lookupResults.clearContents()
    self.lookupResults.setRowCount(0)
    results = []
    if self.comboDictionary.currentText() == 'edict':
        # Resolve readings to words, then pull edict entries for each.
        lookup = self.qdict.lookupItemByReading(query)
        for item in lookup:
            try:
                results.append(self.edict[item])
            except:
                # NOTE(review): bare except deliberately skips words
                # missing from edict; consider narrowing to KeyError.
                pass
    elif self.comboDictionary.currentText() == 'jmdict':
        if self.checkLoose.isChecked():
            if not self.switchPreOrPost.isChecked():
                # Prefix match; fall back to the katakana form on miss.
                results = list(self.qdict.dictionaryR[query + '.*'])
                if len(results) == 0:
                    results = list(self.qdict.dictionaryR[hira2kata(query)])
                #results = self.qdict.looseLookupByReadingJoin(query, self.switchPreOrPost.isChecked(), not self.switchPreOrPost.isChecked())[:limit]
            else:
                # Suffix match; fall back to the katakana form on miss.
                results = list(self.qdict.dictionaryR[u'.*' + query + '$'])
                if len(results) == 0:
                    results = list(self.qdict.dictionaryR[hira2kata(query)])
        else:
            # Exact match; fall back to the katakana form on miss.
            results = list(self.qdict.dictionaryR[query + u'$'])
            if len(results) == 0:
                results = list(self.qdict.dictionaryR[hira2kata(query)])
        #results = self.qdict.lookupItemByReading(query)
        #results = self.qdict.lookupTranslationByReadingJoin(query)
        #TODO: add language chooser
        #results = self.qdict.lookupAllByReading(query)
    i = 0
    if self.comboDictionary.currentText() == 'jmdict':
        if not self.checkInline.isChecked():
            # One row per entry; senses joined into a single cell.
            for item in sorted(results):
                for key in sorted(item.keys()):
                    if key != 'kana':
                        self.lookupResults.insertRow(i)
                        self.lookupResults.setItem(i, 0, QTableWidgetItem(u'x'))
                        # self.lookupResults.setItem(i, 1, QTableWidgetItem(key))
                        #self.lookupResults.setItem(i, 2, QTableWidgetItem(item['kana']))
                        #self.lookupResults.setItem(i, 3, QTableWidgetItem(', '.join(item[key])))
                        word = QTableWidgetItem(key); word.setFont(QFont(Fonts.TukusiMyoutyouProLB, 18))
                        self.lookupResults.setItem(i, 1, word)
                        kana = QTableWidgetItem(item['kana']); kana.setFont(QFont(Fonts.TukusiMyoutyouProLB, 14))
                        self.lookupResults.setItem(i, 2, kana)
                        senses = QTableWidgetItem(', '.join(item[key])); senses.setFont(QFont('Calibri', 11))
                        self.lookupResults.setItem(i, 3, senses)
                        i = i + 1
                        if i > self.itemsLimit.value() :
                            break
        else:
            # One row per individual sense.
            for item in results:
                for key in item:
                    if key != 'kana':
                        for sense in item[key]:
                            self.lookupResults.insertRow(i)
                            self.lookupResults.setItem(i, 0, QTableWidgetItem(u'x'))
                            #self.lookupResults.setItem(i, 1, QTableWidgetItem(key))
                            #self.lookupResults.setItem(i, 2, QTableWidgetItem(item['kana']))
                            #self.lookupResults.setItem(i, 3, QTableWidgetItem(sense))
                            word = QTableWidgetItem(key); word.setFont(QFont(Fonts.TukusiMyoutyouProLB, 18))
                            self.lookupResults.setItem(i, 1, word)
                            kana = QTableWidgetItem(item['kana']); kana.setFont(QFont(Fonts.TukusiMyoutyouProLB, 14))
                            self.lookupResults.setItem(i, 2, kana)
                            senses = QTableWidgetItem(sense); senses.setFont(QFont('Calibri', 11))
                            self.lookupResults.setItem(i, 3, senses)
                            i = i + 1
                            if i > self.itemsLimit.value() :
                                break
    elif self.comboDictionary.currentText() == 'edict':
        # edict entries expose .word and .senses attributes.
        for item in results:
            self.lookupResults.insertRow(i)
            self.lookupResults.setItem(i, 1, QTableWidgetItem(item.word))
            #self.lookupResults.setItem(i, 2, QTableWidgetItem(query))
            self.lookupResults.setItem(i, 3, QTableWidgetItem(', '.join(item.senses)))
            i = i + 1
            # self.lookupResults.setItem(i, 1, QTableWidgetItem(item['word']))
            # self.lookupResults.setItem(i, 2, QTableWidgetItem(item['kana']))
            # #self.lookupResults.setItem(i, 3, QTableWidgetItem(', '.join(item['sense'])))
            # self.lookupResults.setItem(i, 3, QTableWidgetItem(item['sense']))
    self.lookupResults.resizeColumnsToContents()
    self.lookupResults.resizeRowsToContents()