示例#1
0
    def prob(self, grapheme, reading, alt_reading):
        """
        Estimates P(r|k) by interpolating a raw estimate with a smoothed one:
        P(r|k) ~ (alpha)P_raw(r|k) + (1-alpha)P(r|r*)P(r*|k).

        The weight alpha is read from settings.ALTERNATION_ALPHA. If the
        grapheme and the alternate reading are equal once both are converted
        to hiragana, 1.0 is returned without consulting any distribution.
        """
        grapheme_kana = scripts.to_hiragana(grapheme)
        if grapheme_kana == scripts.to_hiragana(alt_reading):
            # Phonetic segment: nothing to estimate.
            return 1.0

        # Anything other than a pure kanji segment is unsupported here.
        assert scripts.script_types(grapheme) == {scripts.Script.Kanji}

        alpha = settings.ALTERNATION_ALPHA
        assert 0 <= alpha <= 1

        # A lookup that raises KeyError contributes a raw estimate of zero.
        try:
            raw_term = self.raw_freq_dist[grapheme].freq(alt_reading)
        except KeyError:
            raw_term = 0.0

        reading_term = self.normalized_freq_dist[grapheme].freq(reading)
        alternation_term = self.alternation_dist[reading].freq(alt_reading)

        smoothed_term = (1 - alpha) * reading_term * alternation_term
        return alpha * raw_term + smoothed_term
示例#2
0
    def test_fetch_scripts(self):
        """
        Check that the hiragana and katakana listings can be fetched, and
        that each listing converts into the other.
        """
        expected_hiragana = 'ぁあぃいぅうぇえぉおかがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでとどなにぬねのはばぱひびぴふぶぷへべぺほぼぽまみむめもゃやゅゆょよらりるれろゎわゐゑをんゔゕゖ'  # nopep8
        expected_katakana = 'ァアィイゥウェエォオカガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトドナニヌネノハバパヒビピフブプヘベペホボポマミムメモャヤュユョヨラリルレロヮワヰヱヲンヴヵヶ'  # nopep8

        # Fetching a script yields its full character listing.
        fetched_hiragana = scripts.get_script(Script.Hiragana)
        self.assertEqual(fetched_hiragana, expected_hiragana)
        fetched_katakana = scripts.get_script(Script.Katakana)
        self.assertEqual(fetched_katakana, expected_katakana)

        # Conversion maps each listing onto the other, in both directions.
        as_hiragana = scripts.to_hiragana(expected_katakana)
        self.assertEqual(as_hiragana, expected_hiragana)
        as_katakana = scripts.to_katakana(expected_hiragana)
        self.assertEqual(as_katakana, expected_katakana)
示例#3
0
    def __init__(self):
        """
        Populate the distribution from the file named by _edict_aligned_file.

        Every line of the file is parsed with Alignment.from_line, which
        yields (g, p) pairs. For each pair whose g contains kanji, the count
        of the hiragana-converted p is incremented under the condition g.

        The input stream is closed on every exit path, including when
        parsing or counting raises.
        """
        ConditionalFreqDist.__init__(self)

        kanji_script = scripts.Script.Kanji
        i_stream = sopen(_edict_aligned_file, 'r')
        # try/finally rather than a trailing close(): an exception while
        # reading a line must not leak the open file handle.
        try:
            for line in i_stream:
                alignment = Alignment.from_line(line)
                for (g, p) in alignment:
                    if scripts.contains_script(kanji_script, g):
                        self[g].inc(scripts.to_hiragana(p))
        finally:
            i_stream.close()
    def __init__(self):
        """
        Populate the distribution from the file named by _edict_aligned_file.

        Every line of the file is parsed with Alignment.from_line, which
        yields (g, p) pairs. For each pair whose g contains kanji, the count
        of the hiragana-converted p is incremented under the condition g.

        The input stream is closed on every exit path, including when
        parsing or counting raises.
        """
        ConditionalFreqDist.__init__(self)

        kanji_script = scripts.Script.Kanji
        i_stream = sopen(_edict_aligned_file, 'r')
        # try/finally rather than a trailing close(): an exception while
        # reading a line must not leak the open file handle.
        try:
            for line in i_stream:
                alignment = Alignment.from_line(line)
                for (g, p) in alignment:
                    if scripts.contains_script(kanji_script, g):
                        self[g].inc(scripts.to_hiragana(p))
        finally:
            i_stream.close()
示例#5
0
def expand_long_vowels(kana_string):
    """
    Expands whatever long vowels are possible to expand.

    The input is split with scripts.script_boundaries. Segments whose script
    type is neither hiragana nor katakana are copied to the output unchanged.
    Every other segment is converted to hiragana; then, for each match of
    _long_finder, the character at the start of the match is replaced by the
    vowel line of the character before it. Finally the segment is converted
    back to the script it started in.

    A match at the very start of a segment has no preceding character to take
    a vowel from, so it is left untouched.

        >>> a = expand_long_vowels(u'すー')
        >>> b = u'すう'
        >>> a == b
        True
    """
    script_converters = {
        scripts.Script.Hiragana: lambda x: x,
        scripts.Script.Katakana: scripts.to_katakana
    }

    table = kana_table.KanaTable.get_cached()

    pieces = []
    for segment in scripts.script_boundaries(kana_string):
        if not segment:
            continue

        char_type = scripts.script_type(segment)
        if char_type not in script_converters:
            # Not a kana segment: pass it through as is.
            pieces.append(segment)
            continue

        reverse_operation = script_converters[char_type]
        segment = scripts.to_hiragana(segment)

        # Each replacement swaps one character for the vowel, so the match
        # offsets found on the original segment are reused on the rebuilt one.
        # NOTE(review): this relies on to_vowel_line returning a single
        # character -- confirm against KanaTable.
        for m in _long_finder.finditer(segment):
            i = m.start()
            if i == 0:
                # segment[i - 1] would wrap around to the last character of
                # the segment, so there is nothing valid to expand from.
                continue
            vowel = table.to_vowel_line(segment[i - 1])
            segment = segment[:i] + vowel + segment[i + 1:]

        pieces.append(reverse_operation(segment))

    return ''.join(pieces)
示例#6
0
def expand_long_vowels(kana_string):
    """
    Expands whatever long vowels are possible to expand.

    The input is split with scripts.script_boundaries. Segments whose script
    type is neither hiragana nor katakana are copied to the output unchanged.
    Every other segment is converted to hiragana; then, for each match of
    _long_finder, the character at the start of the match is replaced by the
    vowel line of the character before it. Finally the segment is converted
    back to the script it started in.

    A match at the very start of a segment has no preceding character to take
    a vowel from, so it is left untouched.

        >>> a = expand_long_vowels(u'すー')
        >>> b = u'すう'
        >>> a == b
        True
    """
    script_converters = {scripts.Script.Hiragana: lambda x: x,
                         scripts.Script.Katakana: scripts.to_katakana}

    table = kana_table.KanaTable.get_cached()

    pieces = []
    for segment in scripts.script_boundaries(kana_string):
        if not segment:
            continue

        char_type = scripts.script_type(segment)
        if char_type not in script_converters:
            # Not a kana segment: pass it through as is.
            pieces.append(segment)
            continue

        reverse_operation = script_converters[char_type]
        segment = scripts.to_hiragana(segment)

        # Each replacement swaps one character for the vowel, so the match
        # offsets found on the original segment are reused on the rebuilt one.
        # NOTE(review): this relies on to_vowel_line returning a single
        # character -- confirm against KanaTable.
        for m in _long_finder.finditer(segment):
            i = m.start()
            if i == 0:
                # segment[i-1] would wrap around to the last character of
                # the segment, so there is nothing valid to expand from.
                continue
            vowel = table.to_vowel_line(segment[i-1])
            segment = segment[:i] + vowel + segment[i+1:]

        pieces.append(reverse_operation(segment))

    return ''.join(pieces)
示例#7
0
 def _clean_readings(reading_list):
     return set(
             scripts.to_hiragana(r.split('.')[0]) for r in reading_list
         )
示例#8
0
 def _clean_readings(reading_list):
     return set(scripts.to_hiragana(r.split('.')[0]) for r in reading_list)