Example #1
    def translate(self, text):
        if isinstance(text, unicode):
            text = text.encode('utf-8')
        node = self._tagger.parseToNode(text)
        result = []
        while node:
            if node.stat >= 2:  # skip BOS/EOS nodes
                node = node.next
                continue
            surface = node.surface.decode(u'utf-8')
            yomi = surface
            features = node.feature.decode('utf-8').split(',')
            if node.stat == 0:
                yomi = features[7]
            yomi = jcconv.hira2kata(yomi)
            if features[1] == u'数':
                number = u''
                while node:  # stop if we run off the end of the node list
                    surface = node.surface.decode(u'utf-8')
                    features = node.feature.decode('utf-8').split(',')
                    if features[1] != u'数':
                        break
                    number += surface
                    node = node.next
                number = self._ja2int(number)
                result.append(self._translate_int(number))
            else:
                result.append(self._translate_node(surface, yomi, features))
                node = node.next

        # Convert long-vowel marks and sokuon (geminate consonants)
        text = u''.join(result)
        text = self.re_long.sub(u'\\1\\1', text)
        text = self.re_ltu.sub(u'\\1\\1', text)
        return text
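The re_long and re_ltu patterns are class attributes defined elsewhere in the project; a hedged guess at patterns that would make the final two substitutions behave as the comment describes (an assumption, not the project's actual regexes):

# -*- coding: utf-8 -*-
# Hedged sketch (Python 2): patterns that would reproduce the two substitutions.
import re

re_long = re.compile(u'([ァ-ヴ])ー')   # a kana followed by the long-vowel mark
re_ltu = re.compile(u'ッ([ァ-ヴ])')    # a sokuon followed by a kana

# re_long.sub(u'\\1\\1', u'ラーメン') -> u'ララメン'
# re_ltu.sub(u'\\1\\1', u'キット')   -> u'キトト'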
Example #2
def kana_minus_dakuten(char):
    if is_katakana(char):
        hira = kata2hira(char)
        hira = __by_dakuten.get(hira, hira)
        return hira2kata(hira)
    else:
        return __by_dakuten.get(char, char)
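The __by_dakuten table that this relies on is private to the module and not shown; a self-contained sketch of the same idea using Unicode normalization instead of a lookup table (an alternative approach, not the project's code):

# -*- coding: utf-8 -*-
# Hedged sketch: strip dakuten/handakuten from a kana via NFD decomposition.
import unicodedata

def kana_minus_dakuten_nfd(char):
    decomposed = unicodedata.normalize('NFD', char)
    # Drop the combining voiced / semi-voiced sound marks (U+3099, U+309A).
    base = u''.join(c for c in decomposed if c not in (u'\u3099', u'\u309a'))
    return unicodedata.normalize('NFC', base)

# kana_minus_dakuten_nfd(u'ガ') -> u'カ'
# kana_minus_dakuten_nfd(u'ぱ') -> u'は'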
Example #3
def kana_minus_dakuten(char):
    if is_katakana(char):
        hira = kata2hira(char)
        hira = __by_dakuten.get(hira, hira)
        return hira2kata(hira)
    else:
        return __by_dakuten.get(char, char)
Example #4
def inject_furigana(text):
    '''
    Returns 2-tuple of (text_with_furigana, furigana_positions).
    '''
    furigana_positions = []

    injected_text = []
    remaining_text = text
    current_offset = 0

    # https://runble1.com/python-mecab-morphological-analysis/
    tagger.parse('')

    node = tagger.parseToNode(text.encode('utf8'))
    while node:
        surface = node.surface.decode('utf8')

        # Add any skipped text.
        node_index_in_remaining_text = remaining_text.find(surface)
        injected_text.append(remaining_text[:node_index_in_remaining_text])
        remaining_text = (remaining_text[node_index_in_remaining_text +
                                         len(surface):])

        reading = _reading(node)

        # Skip text?
        if (reading is None or reading == surface
                or jcconv.hira2kata(reading) == surface):
            injected_text.append(surface)
            current_offset += node_index_in_remaining_text + len(surface)
            node = node.next
            continue

        # Trim trailing characters shared by the surface and its reading
        # (okurigana), so the furigana only covers the leading kanji.
        suffix = ''
        redundant_length = 0
        for surface_char, reading_char in zip(
                reversed(surface),
                reversed(reading),
        ):
            if surface_char != reading_char:
                break
            redundant_length += 1
        if redundant_length > 0:
            reading = reading[:-redundant_length]
            suffix = surface[-redundant_length:]
            surface = surface[:-redundant_length]

        injected_text.append(u'|{}《{}》{}'.format(surface, reading, suffix))

        current_offset += node_index_in_remaining_text
        furigana_positions.append(
            (current_offset, current_offset + len(surface), reading))
        current_offset += len(surface) + len(suffix)

        node = node.next

    injected_text.append(remaining_text)

    return (u''.join(injected_text), furigana_positions)
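A hedged usage note: tagger and _reading() are module-level state in the original project and are not shown here, and the exact readings depend on the MeCab dictionary, but with a typical IPADIC setup the call could behave roughly like this:

# text, positions = inject_furigana(u'漢字を読む')
# text      -> u'|漢字《かんじ》を|読《よ》む'
# positions -> [(0, 2, u'かんじ'), (3, 4, u'よ')]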
Example #5
def char_to_base_vowel(char):
    char = kana_minus_dakuten(char)
    translated = __to_vowels.get(char, False) or __to_vowels.get(hira2kata(char), False)

    if translated is False:
        raise Exception(u"Can't convert")

    return translated
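The __to_vowels table is not shown; the fallback lookup through hira2kata suggests it is keyed by katakana. A hypothetical excerpt (an assumption about its shape, not the real table):

# Hypothetical excerpt of the table char_to_base_vowel() consults.
__to_vowels = {
    u'ア': u'ア', u'カ': u'ア', u'サ': u'ア', u'タ': u'ア', u'ナ': u'ア',
    u'イ': u'イ', u'キ': u'イ', u'シ': u'イ', u'チ': u'イ', u'ニ': u'イ',
    u'ウ': u'ウ', u'ク': u'ウ', u'ス': u'ウ', u'ツ': u'ウ', u'ヌ': u'ウ',
}

# char_to_base_vowel(u'が') would then resolve が -> か -> カ -> ア.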
Example #6
def inject_furigana(text):
    '''
    Returns 2-tuple of (text_with_furigana, furigana_positions).
    '''
    furigana_positions = []

    injected_text = []
    remaining_text = text
    current_offset = 0

    # https://runble1.com/python-mecab-morphological-analysis/
    tagger.parse('')

    node = tagger.parseToNode(text.encode('utf8'))
    while node:
        surface = node.surface.decode('utf8')

        # Add any skipped text.
        node_index_in_remaining_text = remaining_text.find(surface)
        injected_text.append(remaining_text[:node_index_in_remaining_text])
        remaining_text = (
            remaining_text[node_index_in_remaining_text + len(surface):])

        reading = _reading(node)

        # Skip text?
        if (reading is None
                or reading == surface
                or jcconv.hira2kata(reading) == surface):
            injected_text.append(surface)
            current_offset += node_index_in_remaining_text + len(surface)
            node = node.next
            continue

        suffix = ''
        redundant_length = 0
        for surface_char, reading_char in zip(
            reversed(surface), reversed(reading),
        ):
            if surface_char != reading_char:
                break
            redundant_length += 1
        if redundant_length > 0:
            reading = reading[:-redundant_length]
            suffix = surface[-redundant_length:]
            surface = surface[:-redundant_length]

        injected_text.append(u'|{}《{}》{}'.format(surface, reading, suffix))

        current_offset += node_index_in_remaining_text
        furigana_positions.append((current_offset, current_offset + len(surface), reading))
        current_offset += len(surface) + len(suffix)

        node = node.next

    injected_text.append(remaining_text)

    return (u''.join(injected_text), furigana_positions)
Example #7
def char_to_base_vowel(char):
    char = kana_minus_dakuten(char)
    translated = __to_vowels.get(char, False) or __to_vowels.get(
        hira2kata(char), False)

    if translated is False:
        raise Exception(u"Can't convert")

    return translated
Example #8
def kana_plus_mini(char):
    yield char

    is_kata = is_katakana(char)
    if is_kata:
        char = kata2hira(char)

    for char in __to_mini.get(char, ''):
        yield hira2kata(char) if is_kata else char
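The __to_mini table maps a kana to the string of its small variants; a hypothetical excerpt and the behaviour it would give (an assumption about the table's contents):

# Hypothetical excerpt of the __to_mini table the generator walks.
__to_mini = {
    u'つ': u'っ', u'や': u'ゃ', u'ゆ': u'ゅ', u'よ': u'ょ',
    u'あ': u'ぁ', u'い': u'ぃ', u'う': u'ぅ', u'え': u'ぇ', u'お': u'ぉ',
}

# list(kana_plus_mini(u'ヤ')) -> [u'ヤ', u'ャ']: the character itself first,
# then each small form, converted back to katakana when the input was katakana.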
Example #9
def kana_plus_mini(char):
    yield char

    is_kata = is_katakana(char)
    if is_kata:
        char = kata2hira(char)

    for char in __to_mini.get(char, ''):
        yield hira2kata(char) if is_kata else char
Example #10
def encode_katakana(text):
    """I don't think this quite works yet."""
    encoded = []
    for char in text:
        if jcconv:
            # Try to convert Japanese text to half-width katakana.
            char = jcconv.kata2half(jcconv.hira2kata(char))
            # TODO: "the conversion may result in multiple characters"
            # When? What should we do about it?

        if char in TXT_ENC_KATAKANA_MAP:
            encoded.append(TXT_ENC_KATAKANA_MAP[char])
        else:
            # Characters with no half-width katakana mapping are dropped.
            pass
    return b"".join(encoded)
Example #11
File: db.py Project: Xifax/suzu
 def lookupItemByReading(self, item):
     '''Looks up words (kanji/kana) by katakana and hiragana reading'''
     lookup = self.db.r_ele.filter(self.db.r_ele.value==item).all()
     
     results = []
     if len(lookup) > 0:
         for item in lookup:
             words = self.db.k_ele.filter(self.db.k_ele.fk==item.fk).all()
             for word in words:
                 results.append(word.value)
     else:
         lookup = self.db.r_ele.filter(self.db.r_ele.value==hira2kata(item)).all()
         for item in lookup:
             words = self.db.r_ele.filter(self.db.r_ele.fk==item.fk).all()
             for reading in words:
                 results.append(reading.value)
     return results
Example #12
    def _yomi(self, text):
        if isinstance(text, unicode):
            text = text.encode("utf-8")
        node = self._tagger.parseToNode(text)
        yomi = ""
        while node:
            if node.stat >= 2:
                node = node.next
                continue
            if node.stat == 0:
                features = node.feature.split(",")
                yomi += features[7]
            else:
                yomi += node.surface

            node = node.next
        return jcconv.hira2kata(yomi.decode("utf-8"))
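For comparison, a minimal standalone sketch of the same reading extraction, assuming the IPADIC dictionary (whose eighth comma-separated feature field is the katakana reading); a sketch only, not the surrounding class:

# -*- coding: utf-8 -*-
# Minimal sketch (Python 2): pull a katakana reading out of MeCab nodes.
import MeCab

def katakana_reading(text):
    tagger = MeCab.Tagger()
    node = tagger.parseToNode(text.encode('utf-8'))
    reading = u''
    while node:
        if node.stat >= 2:          # skip BOS/EOS nodes
            node = node.next
            continue
        features = node.feature.decode('utf-8').split(',')
        if node.stat == 0 and len(features) > 7:
            reading += features[7]  # dictionary reading for known words
        else:
            # unknown word: fall back to the surface form
            reading += node.surface.decode('utf-8')
        node = node.next
    return reading

# print katakana_reading(u'漢字')  # -> カンジ with IPADIC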
Example #13
File: db.py Project: Xifax/suzu
 def looseLookupByReadingJoin(self, item, pre=False, post=True):
     '''Quite slow, but faster than without join'''
     if pre:
         query = u'%' + item
     elif post:
         query = item + u'%'
     else:
         query = item
     
     join = self.db.join(self.db.k_ele, self.db.r_ele, self.db.k_ele.fk==self.db.r_ele.fk, isouter=True)
     table = self.db.with_labels(join)
     lookup = table.filter(table.r_ele_value.like(query)).all()
     
     result = []
     if len(lookup) > 0:
         for item in lookup:
             result.append(item.k_ele_value)
     else:
         lookup = table.filter(table.r_ele_value.like(hira2kata(query))).all() 
         for item in lookup:
             result.append(item.k_ele_value)
         
     return removeDuplicates(result)
Example #14
        def encode_char(char):  
            """ 
            Encodes a single utf-8 character into a sequence of 
            esc-pos code page change instructions and character declarations 
            """ 
            char_utf8 = char.encode('utf-8')
            encoded  = ''
            encoding = self.encoding # we reuse the last encoding to prevent code page switches at every character
            encodings = {
                    # TODO use ordering to prevent useless switches
                    # TODO support encodings not natively handled by python (Thai, Kazakh, Kanji)
                    'cp437': TXT_ENC_PC437,
                    'cp850': TXT_ENC_PC850,
                    'cp852': TXT_ENC_PC852,
                    'cp857': TXT_ENC_PC857,
                    'cp858': TXT_ENC_PC858,
                    'cp860': TXT_ENC_PC860,
                    'cp863': TXT_ENC_PC863,
                    'cp865': TXT_ENC_PC865,
                    'cp866': TXT_ENC_PC866,
                    'cp862': TXT_ENC_PC862,
                    'cp720': TXT_ENC_PC720,
                    'iso8859_2': TXT_ENC_8859_2,
                    'iso8859_7': TXT_ENC_8859_7,
                    'iso8859_9': TXT_ENC_8859_9,
                    'cp1254'   : TXT_ENC_WPC1254,
                    'cp1255'   : TXT_ENC_WPC1255,
                    'cp1256'   : TXT_ENC_WPC1256,
                    'cp1257'   : TXT_ENC_WPC1257,
                    'cp1258'   : TXT_ENC_WPC1258,
                    'katakana' : TXT_ENC_KATAKANA,
            }
            remaining = copy.copy(encodings)

            if not encoding :
                encoding = 'cp437'

            while True: # try each encoding until one succeeds
                try:
                    if encoding == 'katakana': # Japanese characters
                        if jcconv:
                            # try to convert Japanese text to half-width katakana
                            kata = jcconv.kata2half(jcconv.hira2kata(char_utf8))
                            if kata != char_utf8:
                                self.extra_chars += len(kata.decode('utf-8')) - 1
                                # the conversion may result in multiple characters
                                return encode_str(kata.decode('utf-8')) 
                        else:
                             kata = char_utf8
                        
                        if kata in TXT_ENC_KATAKANA_MAP:
                            encoded = TXT_ENC_KATAKANA_MAP[kata]
                            break
                        else: 
                            raise ValueError()
                    else:
                        encoded = char.encode(encoding)
                        break

                except ValueError: # the encoding failed, select another one and retry
                    if encoding in remaining:
                        del remaining[encoding]
                    if len(remaining) >= 1:
                        encoding = remaining.items()[0][0]
                    else:
                        encoding = 'cp437'
                        encoded  = '\xb1'    # could not encode, output error character
                        break

            if encoding != self.encoding:
                # if the encoding changed, remember it and prefix the character with
                # the esc-pos encoding change sequence
                self.encoding = encoding
                encoded = encodings[encoding] + encoded

            return encoded
Example #15
        def encode_char(char):
            """ 
            Encodes a single utf-8 character into a sequence of 
            esc-pos code page change instructions and character declarations 
            """
            char_utf8 = char.encode('utf-8')
            encoded = ''
            encoding = self.encoding  # we reuse the last encoding to prevent code page switches at every character
            encodings = {
                # TODO use ordering to prevent useless switches
                # TODO support encodings not natively handled by python (Thai, Kazakh, Kanji)
                'cp437': TXT_ENC_PC437,
                'cp850': TXT_ENC_PC850,
                'cp852': TXT_ENC_PC852,
                'cp857': TXT_ENC_PC857,
                'cp858': TXT_ENC_PC858,
                'cp860': TXT_ENC_PC860,
                'cp863': TXT_ENC_PC863,
                'cp865': TXT_ENC_PC865,
                'cp1251': TXT_ENC_WPC1251,  # win-1251 covers more Cyrillic symbols than cp866
                'cp866': TXT_ENC_PC866,
                'cp862': TXT_ENC_PC862,
                'cp720': TXT_ENC_PC720,
                'cp936': TXT_ENC_PC936,
                'iso8859_2': TXT_ENC_8859_2,
                'iso8859_7': TXT_ENC_8859_7,
                'iso8859_9': TXT_ENC_8859_9,
                'cp1254': TXT_ENC_WPC1254,
                'cp1255': TXT_ENC_WPC1255,
                'cp1256': TXT_ENC_WPC1256,
                'cp1257': TXT_ENC_WPC1257,
                'cp1258': TXT_ENC_WPC1258,
                'katakana': TXT_ENC_KATAKANA,
            }
            remaining = copy.copy(encodings)

            if not encoding:
                encoding = 'cp437'

            while True:  # try each encoding until one succeeds
                try:
                    if encoding == 'katakana':  # Japanese characters
                        if jcconv:
                            # try to convert Japanese text to half-width katakana
                            kata = jcconv.kata2half(
                                jcconv.hira2kata(char_utf8))
                            if kata != char_utf8:
                                self.extra_chars += len(
                                    kata.decode('utf-8')) - 1
                                # the conversion may result in multiple characters
                                return encode_str(kata.decode('utf-8'))
                        else:
                            kata = char_utf8

                        if kata in TXT_ENC_KATAKANA_MAP:
                            encoded = TXT_ENC_KATAKANA_MAP[kata]
                            break
                        else:
                            raise ValueError()
                    else:
                        # First 127 symbols are covered by cp437.
                        # Extended range is covered by different encodings.
                        encoded = char.encode(encoding)
                        if ord(encoded) <= 127:
                            encoding = 'cp437'
                        break

                except (UnicodeEncodeError, UnicodeWarning, TypeError,
                        ValueError):
                    # the encoding failed, select another one and retry
                    if encoding in remaining:
                        del remaining[encoding]
                    if len(remaining) >= 1:
                        (encoding, _) = remaining.popitem()
                    else:
                        encoding = 'cp437'
                        encoded = b'\xb1'  # could not encode, output error character
                        break

            if encoding != self.encoding:
                # if the encoding changed, remember it and prefix the character with
                # the esc-pos encoding change sequence
                self.encoding = encoding
                encoded = bytes(encodings[encoding], 'utf-8') + encoded

            return encoded
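A simplified, self-contained sketch of the fallback loop at the heart of encode_char(): try the last-used code page first, then the remaining candidates, and emit an error character if nothing fits. Code-page escape sequences and the katakana path are omitted, and the candidate list here is illustrative:

def pick_encoding(char, preferred, candidates=('cp437', 'cp850', 'iso8859_2')):
    # Try the preferred (last used) encoding first to avoid needless switches.
    order = [preferred] + [c for c in candidates if c != preferred]
    for encoding in order:
        try:
            return encoding, char.encode(encoding)
        except (UnicodeEncodeError, LookupError):
            continue
    return 'cp437', b'\xb1'   # could not encode: output the error character

# pick_encoding(u'é', 'cp437') -> ('cp437', b'\x82')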
Example #16
    def updateLookupResults(self, query):
        self.lookupResults.clearContents()
        self.lookupResults.setRowCount(0)

        results = []
                
        if self.comboDictionary.currentText() == 'edict':
            lookup = self.qdict.lookupItemByReading(query)
            for item in lookup:
                try:
                    results.append(self.edict[item])
                except KeyError:
                    # Item has no edict entry; skip it.
                    pass
        elif self.comboDictionary.currentText() == 'jmdict':
            if self.checkLoose.isChecked():
                if not self.switchPreOrPost.isChecked():
                    results = list(self.qdict.dictionaryR[query + '.*'])        

                    if len(results) == 0: 
                        results = list(self.qdict.dictionaryR[hira2kata(query)])
                        
                    #results = self.qdict.looseLookupByReadingJoin(query, self.switchPreOrPost.isChecked(), not self.switchPreOrPost.isChecked())[:limit]
                else:
                    results = list(self.qdict.dictionaryR[u'.*' + query + '$'])
                    
                    if len(results) == 0: 
                        results = list(self.qdict.dictionaryR[hira2kata(query)])
            else:
                results = list(self.qdict.dictionaryR[query + u'$'])
                
                if len(results) == 0: 
                    results = list(self.qdict.dictionaryR[hira2kata(query)])

                #results = self.qdict.lookupItemByReading(query)
                #results = self.qdict.lookupTranslationByReadingJoin(query)  #TODO: add language chooser
                #results = self.qdict.lookupAllByReading(query)
                                       
        i = 0
        
        if self.comboDictionary.currentText() == 'jmdict':
                if not self.checkInline.isChecked():
                    for item in sorted(results):
                        for key in sorted(item.keys()):
                            if key != 'kana':
                                self.lookupResults.insertRow(i)
                                
                                self.lookupResults.setItem(i, 0, QTableWidgetItem(u'x'))
#                                self.lookupResults.setItem(i, 1, QTableWidgetItem(key))
                                #self.lookupResults.setItem(i, 2, QTableWidgetItem(item['kana']))
                                #self.lookupResults.setItem(i, 3, QTableWidgetItem(', '.join(item[key])))
                                
                                word = QTableWidgetItem(key)
                                word.setFont(QFont(Fonts.TukusiMyoutyouProLB, 18))
                                self.lookupResults.setItem(i, 1, word)

                                kana = QTableWidgetItem(item['kana'])
                                kana.setFont(QFont(Fonts.TukusiMyoutyouProLB, 14))
                                self.lookupResults.setItem(i, 2, kana)

                                senses = QTableWidgetItem(', '.join(item[key]))
                                senses.setFont(QFont('Calibri', 11))
                                self.lookupResults.setItem(i, 3, senses)

                                i = i + 1
                        if i > self.itemsLimit.value(): break
                else:
                    for item in results:
                        for key in item:
                            if key != 'kana':
                                for sense in item[key]:
                                    self.lookupResults.insertRow(i)
                                    
                                    self.lookupResults.setItem(i, 0, QTableWidgetItem(u'x'))
                                    #self.lookupResults.setItem(i, 1, QTableWidgetItem(key))
                                    #self.lookupResults.setItem(i, 2, QTableWidgetItem(item['kana']))
                                    #self.lookupResults.setItem(i, 3, QTableWidgetItem(sense))
                                    
                                    word = QTableWidgetItem(key)
                                    word.setFont(QFont(Fonts.TukusiMyoutyouProLB, 18))
                                    self.lookupResults.setItem(i, 1, word)

                                    kana = QTableWidgetItem(item['kana'])
                                    kana.setFont(QFont(Fonts.TukusiMyoutyouProLB, 14))
                                    self.lookupResults.setItem(i, 2, kana)

                                    senses = QTableWidgetItem(sense)
                                    senses.setFont(QFont('Calibri', 11))
                                    self.lookupResults.setItem(i, 3, senses)

                                i = i + 1
                        if i > self.itemsLimit.value(): break
        
        elif self.comboDictionary.currentText() == 'edict':
            for item in results:
                self.lookupResults.insertRow(i)
            
                self.lookupResults.setItem(i, 1, QTableWidgetItem(item.word))
                #self.lookupResults.setItem(i, 2, QTableWidgetItem(query))
                self.lookupResults.setItem(i, 3, QTableWidgetItem(', '.join(item.senses)))

                i = i + 1
                
#                self.lookupResults.setItem(i, 1, QTableWidgetItem(item['word']))
#                self.lookupResults.setItem(i, 2, QTableWidgetItem(item['kana']))
#                #self.lookupResults.setItem(i, 3, QTableWidgetItem(', '.join(item['sense'])))
#                self.lookupResults.setItem(i, 3, QTableWidgetItem(item['sense']))
                
        self.lookupResults.resizeColumnsToContents()
        self.lookupResults.resizeRowsToContents()