Example #1
    def train_data():
        """Yield (tones, words) pairs sampled from roughly 1% of the corpus."""
        with open(DATA_PATH, 'r') as f:
            for line in f:
                # Randomly sample about 1% of the corpus lines.
                if random.random() > 0.01:
                    continue
                line = line.strip()
                words = [w for w in line.split('\t') if w.strip() != '']
                # Keep only entries whose pronunciation consists entirely
                # of characters covered by the known tone types.
                words = [
                    w for w in words
                    if all(
                        any(c in t for t in tone.tone_types.values())
                        for c in w.split()[1])
                ]
                if len(words) == 0:
                    continue

                t = []
                ws = []
                for w in words:
                    tones, kanas = tone.convert_tones(w.split()[1])
                    if len(tones) == 0:
                        continue
                    # Replace the final tone with the raw kana so the last
                    # syllable must be matched exactly.
                    tones[-1] = kanas[-1]
                    t.extend(tones)
                    ws.append(w.split()[0])
                if len(t) != 0:
                    yield t, ws
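A minimal usage sketch for this sampler, assuming train_data is accessible at module level and that DATA_PATH and the tone module are set up as the example requires; the sample size and names below are illustrative only.

# Hypothetical usage: collect a small sample of (tones, words) pairs.
pairs = []
for tones, words in train_data():
    pairs.append((tones, words))
    if len(pairs) >= 100:
        break
print(pairs[0])  # e.g. (['a', 'i', ..., 'カ'], ['surface1', ...])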
Example #2
def _create_tone_list():
    """Return tone to word-list dictionary and related statistics.

    Return:
        tone_list (Hash[String, List[String]]): tone to word-list dictionary.
        lcounter_2gram (LossyCounter): counter over POS 2-gram keys.
        word2pos (Hash[String, String]): word to POS dictionary.
    """
    def train_data():
        # Yield every non-empty tab-separated entry in the corpus.
        with open(DATA_PATH, 'r') as f:
            for line in f:
                line = line.strip()
                words = [w for w in line.split('\t') if w.strip() != '']
                yield from words

    def train_data_2gram():
        # Yield POS 2-gram keys ("pos1_pos2"), padding each line with
        # BOS/EOS sentinels so boundary transitions are counted too.
        g = graph.Graph()
        with open(DATA_PATH, 'r') as f:
            for line in f:
                line = line.strip()
                words = [w for w in line.split('\t') if w.strip() != '']
                words = ([' '.join([g.BOS.word] * 3)] + words
                         + [' '.join([g.EOS.word] * 3)])
                yield from [
                    words[i].split()[2] + '_' + words[i + 1].split()[2]
                    for i in range(len(words) - 1)
                ]

    # Lossy counting keeps memory bounded on a large corpus.
    lcounter = counter.LossyCounter(epsilon=1e-6)
    lcounter.count(train_data())
    lcounter_2gram = counter.LossyCounter(epsilon=1e-7)
    lcounter_2gram.count(train_data_2gram())
    print('1-gram items:', len(lcounter._items))
    print('2-gram items:', len(lcounter_2gram._items))

    tone_list = {}
    word2pos = {}
    count = 0
    for w in lcounter._items:
        chars = w.split()[1]
        tones, kana = tone.convert_tones(chars)

        # Skip entries whose pronunciation cannot be converted, or whose
        # tone and kana sequences disagree in length.
        if len(tones) == 0:
            count += 1
            continue
        if len(tones) != len(kana):
            count += 1
            continue

        word = w.split()[0]
        if word not in word2pos:
            word2pos[word] = w.split()[2]
            for t in _mix_tone_and_kana(tones, kana):
                tone_list.setdefault(t, []).append(word)
    print('Remove Count:', count)
    print('Total Count:', sum(len(ws) for ws in tone_list.values()))
    return tone_list, lcounter_2gram, word2pos
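A hedged sketch of calling the builder and inspecting its output, assuming counter, graph, tone, _mix_tone_and_kana, and DATA_PATH come from this repository.

# Hypothetical usage: build the dictionaries and peek at one tone key.
tone_list, lcounter_2gram, word2pos = _create_tone_list()
key = next(iter(tone_list))
print(key, tone_list[key][:5])      # a few words sharing this tone key
print(word2pos[tone_list[key][0]])  # POS recorded for one of those words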
Example #3
def _process_syntax(line):
    """Return kana, pronounce and POS chunks for an input line.

    Args:
        line (String): parse target strings

    Return:
        result (String): kana1 + ' ' + pronounce1 + ' ' + pos1 + '\t' +
            kana2 + ' ' + pronounce2 + ' ' + pos2 ... formatted string.
    """
    mecab.tagger = mecab.MeCab.Tagger("-d ./lib")
    sentence = mecab.parse(line)
    result = []
    ret_kana = ""
    ret_pronounce = ""
    ret_pos = ""
    for word in sentence.words:
        # Skip symbols entirely.
        if word.pos == '記号':
            continue

        if len(tone.convert_tones(word.pronounce)[0]) == 0:
            # Unconvertible pronunciation: flush the current chunk.
            result.append(ret_kana + " " + ret_pronounce + " " + ret_pos)
            ret_kana = ""
            ret_pronounce = ""
            ret_pos = ""
            continue
        elif ret_pos.startswith('動詞') and word.pos not in [
                '接続詞', '形容詞', '動詞', '名詞', '連体詞', '副詞'
        ]:
            # Merge verb-attached function words into the current chunk.
            ret_kana += word.surface
            ret_pronounce += word.pronounce
            if ret_pos == '':
                ret_pos = word.pos
        else:
            # Flush the current chunk and start a new one with this word.
            result.append(ret_kana + " " + ret_pronounce + " " + ret_pos)
            ret_kana = word.surface
            ret_pronounce = word.pronounce
            if word.pos in ('名詞', '助詞', '動詞'):
                ret_pos = word.pos + '-' + word.pos1
            else:
                ret_pos = word.pos
    result.append(ret_kana + " " + ret_pronounce + " " + ret_pos)
    return "\t".join(r for r in result if r.strip() != "")
Example #4
def get_match_word(yomi, tone_list):
    """Return words whose tones match the target yomi.

    Args:
        yomi (str): target word yomi.
        tone_list (Hash[String, List[String]]): tone dictionary.
    Return:
        words (List[String]): match word list.
    """
    tones, _ = tone.convert_tones(yomi)
    tones = "".join(tones)

    # Score every same-length tone key by its longest shared head or
    # tail with the target tones.
    distances = [(max(measure_tail_match_num(tones, t),
                      measure_initial_match_num(tones, t)), t)
                 for t in tone_list if len(tones) == len(t)]
    if len(distances) == 0:
        return []

    distance = sorted(distances, key=lambda x: x[0], reverse=True)[0]
    return tone_list[distance[1]]
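A usage sketch with a toy dictionary; keys are joined tone strings, matching the "".join(tones) comparison above, and measure_tail_match_num / measure_initial_match_num are assumed to count matching trailing and leading tones.

# Hypothetical usage: both toy keys match the query tones in length.
toy_tone_list = {'aaa': ['カタナ'], 'aia': ['アシタ']}
print(get_match_word('ハカタ', toy_tone_list))
# 'ハカタ' converts to tones 'aaa', so this should print ['カタナ']
# if the match helpers behave as assumed.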
Example #5
def get_match_word_with_searcher(yomi, tone_list, prefix_searcher):
    """Return tone-matched words using a common prefix searcher.

    Args:
        yomi (str): target word yomi.
        tone_list (Hash[String, List[String]]): tone dictionary.
        prefix_searcher (TrieBase): Trie prefix searcher class
    Return:
        words (List[String]): match word list.
    """
    tones, _ = tone.convert_tones(yomi)
    N = len(tones)
    result = []
    # Back off by trimming trailing tones until the trie finds a match.
    while len(tones) != 0:
        result = prefix_searcher.search(tones, max_len=N)
        if len(result) != 0:
            break
        tones = tones[:-1]

    if len(result) == 0:
        return []
    return tone_list[random.choice(result)]
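A sketch assuming prefix_searcher is a TrieBase built over the tone_list keys; its construction is repository-specific and omitted here.

# Hypothetical usage: the search backs off by trimming trailing tones.
words = get_match_word_with_searcher('カリフォルニア', tone_list, prefix_searcher)
print(words)  # word list for one randomly chosen matching tone key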
Example #6
def generate_rapv2(s, tone_list, prefix_searcher, learner, N=1):
    """Return generated rap.

    Args:
        s (String): target sentence.
        tone_list (Hash[Tuple[String], List[String]]): tone to word-list dictionary.
        prefix_searcher (TrieBase): Trie Prefix Searcher class
        learner (StructuredLearner): pre-trained structured learner
        N (Int): response numbers.
    Return:
        rap (List[String]): generated rap
    """
    # Build the target tone sequence, walking the words in reverse so the
    # sentence-final syllable keeps its exact kana.
    t = []
    is_last_tone = False
    for w in reversed(mecab.parse(s).words):
        tones, kana = tone.convert_tones(w.pronounce)
        if len(tones) == 0:
            continue
        if not is_last_tone:
            # Require an exact kana match on this word's final syllable.
            tones[-1] = kana[-1]
        else:
            is_last_tone = False
        # A one-syllable word that is not a noun, adjective or verb lifts
        # the exact-kana constraint from the word preceding it.
        if w.pos not in ('名詞', '形容詞', '動詞') and len(tones) == 1:
            is_last_tone = True
        t += reversed(tones)
    t = list(reversed(t))
    g = graph.Graph.construct_graph(prefix_searcher, tone_list, t)
    g.learner = learner
    try:
        if N != 1:
            paths = g.search_nbest_path(N)
        else:
            paths = [g.search_shortest_path()]
    except graph.SearchShortestPathError:
        return []
    return ["".join(p.word for p in path[:-1]) for path in paths]
Example #7
def testconvert_tones():
    eq_((['e', 'i', 'a', 'i'], ['セ', 'イ', 'タ', 'イ']),
        tone.convert_tones('セイタイ'))
    eq_((['a', 'u', 'xtu'], ['ヤ', 'ブ', 'ッ']), tone.convert_tones('ヤブッ'))
    eq_((['a', 'i', 'o', 'u', 'i', 'a'], ['カ', 'リ', 'フォ', 'ル', 'ニ', 'ア']),
        tone.convert_tones('カリフォルニア'))
    eq_((['o', 'u', 'e', 'u'], ['チョ', 'ウ', 'セ', 'ツ']),
        tone.convert_tones('チョウセツ'))
    eq_((['o', 'o', 'a', 'e', 'i', 'o', 'u'],
         ['ロ', 'ー', 'マ', 'テ', 'イ', 'コ', 'ク']),
        tone.convert_tones('ローマテイコク'))
    eq_(([], []), tone.convert_tones('、'))
    eq_((['a', 'n', 'a', 'n'], ['カ', 'ン', 'タ', 'ン']),
        tone.convert_tones('カンタン'))
    eq_((['a', 'a', 'a'], ['カ', 'ラ', 'ー']), tone.convert_tones('カラー'))
    eq_(([], []), tone.convert_tones('ー'))
    eq_((['a', 'n', 'a', 'i', 'a'], ['ヴァ', 'ン', 'パ', 'イ', 'ア']),
        tone.convert_tones('ヴァンパイア'))
    eq_((['o'], ['ョ']), tone.convert_tones('ョ'))
Example #8
def testconvert_tones_error_case():
    # Non-kana input cannot be converted and yields empty lists.
    eq_(([], []), tone.convert_tones('aaaa'))