def replace_alphabet_morphs(li, nabcc=False):
    """Concatenate runs of morphs written only in alphabet/ASCII symbols.

    Symbols that can appear inside a computer-braille substring are merged
    together with the neighbouring alphabetic morphs so the whole run is
    emitted as a single morph.  Returns a new list (morph objects are shared
    with *li*).

    input:
    B,B,記号,アルファベット,*,*,ビー,ビー,1/2,B
    asi,asi,名詞,一般,*,*,アシー,アシー,0/3,asi
    c,c,記号,アルファベット,*,*,シー,シー,1/2,c
    output:
    Basic,Basic,名詞,アルファベット,*,*,ビーアシーシー,ビーアシーシー,1/2,Basic
    """
    new_li = []
    alp_morphs = []  # current run of alphabet/symbol morphs being collected
    for pos in xrange(len(li)):
        mo = li[pos]
        if pos < len(li) - 1:
            next_mo = li[pos + 1]
        else:
            next_mo = None
        if is_alpha_or_single(mo.nhyouki):
            # alphabetic word (or single character)
            alp_morphs.append(mo)
        elif mo.nhyouki and mo.nhyouki in r',+@/#$%&*;<':
            # symbol that may start/continue a computer-braille run
            # NOTE(review): `in` on a string is a substring test; this
            # assumes nhyouki is a single character here — confirm upstream.
            alp_morphs.append(mo)
        elif mo.nhyouki == '\\':
            alp_morphs.append(mo)
        elif mo.nhyouki and mo.nhyouki[0] in r',+@/#$%&*;' and \
            RE_ASCII_SYMBOLS.match(mo.nhyouki):
            # multi-character ASCII-symbol token starting with one of these
            alp_morphs.append(mo)
        elif alp_morphs and mo.nhyouki in ',.' and \
            ((next_mo and next_mo.nhyouki == ' ') or \
            (next_mo and next_mo.hinshi1 in ('助詞', '助動詞')) or \
            (not next_mo)):
            # trailing comma/period stays in the run when followed by a
            # space, a particle/auxiliary verb, or end of input
            alp_morphs.append(mo)
        elif alp_morphs and mo.nhyouki == ' ' and \
            next_mo and is_alpha_or_single(next_mo.nhyouki):
            # a space between two alphabetic words stays inside the run
            alp_morphs.append(mo)
        elif alp_morphs and mo.nhyouki.isdigit():
            # digits extend an already-started run
            alp_morphs.append(mo)
        elif alp_morphs and mo.nhyouki in ',.:;!?@#\\$%&*|+-/=<>"\'^`_~{}[],':
            # ASCII punctuation extends an already-started run
            alp_morphs.append(mo)
        elif nabcc and mo.nhyouki in '”’‘_':
            # NABCC mode keeps these quote-like characters in the run
            alp_morphs.append(mo)
        elif not alp_morphs and mo.nhyouki in '[]':
            # brackets may open a new run
            alp_morphs.append(mo)
        else:
            if alp_morphs:
                # flush the collected run as one concatenated morph
                m = concatinate_morphs(alp_morphs)
                m.nhyouki = m.output = unicode_normalize(m.nhyouki)
                set_pos_of_alphabets(m)
                new_li.append(m)
                alp_morphs = []
            new_li.append(mo)
    if alp_morphs:
        # flush a run that reaches the end of the input list
        m = concatinate_morphs(alp_morphs)
        m.nhyouki = m.output = unicode_normalize(m.nhyouki)
        set_pos_of_alphabets(m)
        new_li.append(m)
    return new_li
def getKanaFromRoma(roma):
    """Best-effort romaji-to-katakana conversion using the romadic table.

    Returns the converted katakana string only when every remaining
    character is katakana; otherwise returns None.
    """
    kana = unicode_normalize(roma)
    if kana in ("youtube",):
        # words excluded from conversion
        return None
    for entry in romadic:
        kana = kana.replace(entry[0], entry[1])
    fully_kana = all(re.search("[ァ-ヾ]", ch) for ch in kana)
    return kana if fully_kana else None
def replace_morphs(li, dic):
    """Expand morphs by surface form using a replacement table.

    For each morph whose surface form (hyouki) is a key of *dic*, emit the
    replacement morphs described by the rule tuples in dic[hyouki]; all
    other morphs are passed through unchanged.  Returns a new list.

    Each rule tuple i holds:
      i[0] surface form (e.g. に), i[1] reading (e.g. ニ), i[2] accent
      (e.g. 0/1), i[3]-i[5] optional POS overrides (hinshi1..hinshi3).
    """
    new_li = []
    for mo in li:
        # direct dict membership; `in dic.keys()` built a list and scanned
        # it linearly for every morph in Python 2
        if mo.hyouki in dic:
            for i in dic[mo.hyouki]:
                m = copy.deepcopy(mo)
                m.hyouki = i[0]
                m.nhyouki = unicode_normalize(i[0])
                # POS fields are overridden only when the rule supplies them
                if i[3]:
                    m.hinshi1 = i[3]
                if i[4]:
                    m.hinshi2 = i[4]
                if i[5]:
                    m.hinshi3 = i[5]
                m.kana = i[0]
                m.output = m.yomi = unicode_normalize(i[1])
                m.accent = i[2]
                new_li.append(m)
        else:
            new_li.append(mo)
    return new_li
def replace_digit_morphs(li):
    """Concatenate runs of numeric morphs (kanji numerals and digits).

    input:
    十,名詞,数
    七,名詞,数
    output:
    十七,名詞,数

    input:
    二,名詞,数
    十,名詞,数
    五,名詞,数
    output:
    二十五,名詞,数

    input:
    三,名詞,数,*,*,*,*,三,サン,サン,0/2,C3
    兆,名詞,数,*,*,*,*,兆,チョウ,チョー,1/2,C3
    二,名詞,数,*,*,*,*,二,ニ,ニ,1/1,C3
    千,名詞,数,*,*,*,*,千,セン,セン,1/2,C3
    四,名詞,数,*,*,*,*,四,ヨン,ヨン,1/2,C1
    百,名詞,数,*,*,*,*,百,ヒャク,ヒャク,2/2,C3
    万,名詞,数,*,*,*,*,万,マン,マン,1/2,C3
    output:
    三,三,名詞,数,*,*,サン,サン,,サン,0
    兆,兆,名詞,数,*,*,チョー,チョー,,チョー,0
    二千四百,二千四百,名詞,数,*,*,ニセンヨンヒャク,ニセンヨンヒャク,,ニセンヨンヒャク,0
    万,万,名詞,数,*,*,マン,マン,,マン,0
    (correct: 3チョー 2400マン)
    """
    new_li = []
    num_morphs = []  # current run of number morphs awaiting concatenation
    for mo in li:
        if mo.hinshi2 == '数' and mo.hyouki == ',' and num_morphs:
            # comma inside a number: flush the digits, then emit the comma
            new_li.append(concatinate_morphs(num_morphs))
            m = copy.deepcopy(mo)
            m.yomi = m.output = ','
            new_li.append(concatinate_morphs([m]))
            num_morphs = []
        elif mo.hinshi2 == '数' and not mo.output.isdigit() and \
            not mo.hyouki in ('・', '万', '億', '兆', '京', '.'):
            # kanji numeral: join the current run (large-unit kanji such as
            # 万/億/兆/京 are kept as separate morphs)
            num_morphs.append(mo)
        elif mo.hinshi2 == '数' and mo.hyouki in '0123456789':
            # full-width Arabic digit: normalize and join the current run
            m = copy.deepcopy(mo)
            y = unicode_normalize(m.hyouki)
            m.output = m.hyouki = m.nhyouki = m.yomi = y
            num_morphs.append(m)
        else:
            # non-number morph: flush any pending run, then pass through
            if num_morphs:
                new_li.append(concatinate_morphs(num_morphs))
                num_morphs = []
            new_li.append(mo)
    if num_morphs:
        # flush a run that reaches the end of the input list
        new_li.append(concatinate_morphs(num_morphs))
    return new_li
def runTasks():
    """Run jtalkPrepare.convert over every test case; return the number of
    mismatches (0 means all cases passed)."""
    jtalkPrepare.setup()
    failures = 0
    for case in tests:
        converted = jtalkPrepare.convert(unicode_normalize(case[0]))
        if case[1] != converted:
            _print("expected:%s result:%s" % (case[1], converted))
            failures += 1
    return failures
def runTasks():
    """Run jtalkPrepare.convert over every test case, logging input,
    normalized form, actual and expected output on mismatch; return the
    number of mismatches."""
    jtalkPrepare.setup()
    failures = 0
    for case in tests:
        original = case[0]
        normalized = unicode_normalize(original)
        converted = jtalkPrepare.convert(normalized)
        if case[1] != converted:
            _print('input:%s normalized:%s result:%s expected:%s' % (original, normalized, converted, case[1]))
            failures += 1
    return failures
def mecab_to_morphs(mf):
    """Build a list of MecabMorph objects from raw MeCab analysis output.

    mf: MecabFeatures holding mf.size C feature strings (comma-separated
    fields, encoded with CODE).  Returns [] when mf or its fields are
    missing.
    """
    li = []
    if mf is None or mf.feature is None or mf.size is None:
        return li
    for i in xrange(0, mf.size):
        s = string_at(mf.feature[i])
        if s:
            s = s.decode(CODE, 'ignore')
            ar = s.split(",")
            mo = MecabMorph()
            mo.hyouki = ar[0]                      # surface form
            mo.nhyouki = unicode_normalize(ar[0])  # normalized surface form
            mo.hinshi1 = ar[1]                     # part of speech (major)
            mo.hinshi2 = ar[2]                     # part of speech (minor)
            if len(ar) > 3:
                mo.hinshi3 = ar[3]
                mo.hinshi4 = ar[4]
            if len(ar) > 5:
                mo.type1 = ar[5]
            if len(ar) > 6:
                mo.type2 = ar[6]
            if len(ar) > 7:
                mo.kihon = ar[7]                   # base form
            if len(ar) > 9:
                # normalization example: "(ニチ)" -> "(ニチ)"
                mo.kana = unicode_normalize(ar[8])
                # strip the prosody mark from the pronunciation, e.g.
                # ありがとうございますー,感動詞,*,*,*,*,*,ありがとうございますー,アリガトウゴザイマスー,アリガトーゴザイマス’ー,0/1,C0
                mo.yomi = unicode_normalize(ar[9]).replace("’", "")
                # NOTE(review): ar[10] is only guarded by len(ar) > 9 —
                # a record with exactly 10 fields would raise IndexError;
                # confirm MeCab output always has >= 11 fields here.
                mo.accent = ar[10]
            if len(ar) > 12:
                # use the braille notation from the extended MeCab
                # dictionary field when the dictionary provides one
                mo.output = unicode_normalize(ar[12])
            else:
                # presumably MecabMorph defaults yomi when len(ar) <= 9 —
                # TODO confirm
                mo.output = mo.yomi
            update_phonetic_symbols(mo)
            mo.sepflag = False
            li.append(mo)
    return li
def Mecab_get_reading(mf, CODE_=CODE):
    """Collect the reading and slash-separated braille fields from every
    MeCab feature in *mf*; return (reading, braille) with trailing
    separators stripped from the braille string."""
    reading_parts = []
    braille_parts = []
    for pos in xrange(mf.size):
        fields = Mecab_getFeature(mf, pos, CODE_=CODE_).split(',')
        rd = ''
        if len(fields) > 9:
            rd = fields[9].replace('\u3000', ' ')
        elif fields[0] != 'ー':
            rd = unicode_normalize(fields[0])
        reading_parts.append(rd)
        if len(fields) > 12:
            braille_parts.append(fields[12] + '/')
        else:
            braille_parts.append(rd + '/')
    return (''.join(reading_parts), ''.join(braille_parts).rstrip(' /'))
def replaceJapaneseFromSpeechSequence(speechSequence):
    """Normalize and replace Japanese text items in a speech sequence.

    Replaced Japanese text should not be spoken in character mode, so when
    character mode is currently active a CharacterModeCommand(False) is
    inserted before the replaced text and CharacterModeCommand(True) is
    restored after it.  Returns a new sequence; non-text items pass through.
    """
    a = []
    charmode = False
    for item in speechSequence:
        disableCharMode = False
        if isinstance(item, basestring):
            item = unicode_normalize(item)
            if isJapaneseLang(item):
                item = replaceJapanese(item)
                if charmode:
                    # replaced Japanese must not be spelled character by
                    # character; temporarily leave character mode below
                    disableCharMode = True
        elif isinstance(item, CharacterModeCommand):
            # track the current character-mode state.  The original code
            # assigned to an unused local (`cmstate = item.state`), which
            # left charmode permanently False and made every character-mode
            # branch below dead code.
            charmode = item.state
        if disableCharMode:
            a.append(CharacterModeCommand(False))
            a.append(item)
            if charmode:
                a.append(CharacterModeCommand(True))
        else:
            a.append(item)
    return a
def japanese_braille_separate(inbuf, logwrite, nabcc=False):
    """Convert Japanese text into a word-separated (wakachigaki) string
    ready for braille translation.

    inbuf:    input text
    logwrite: callable used for debug logging
    nabcc:    use NABCC (North American Braille Computer Code) conventions
    Returns (outbuf, inpos2): the converted text plus a sequence mapping
    each output position back to an input position.
    """
    text = inbuf
    # half-width katakana passes through unchanged
    if RE_HALF_KATAKANA.match(text):
        outbuf = text
        inpos2 = xrange(len(outbuf))
        return (outbuf, inpos2)
    # full-width alphanumerics/spaces: only Unicode-normalize
    if not nabcc and RE_MB_ALPHA_NUM_SPACE.match(text):
        outbuf = unicode_normalize(text)
        inpos2 = xrange(len(outbuf))
        return (outbuf, inpos2)
    # gaiji (foreign word) with an inner space: wrap in foreign quotation
    # marks, keeping trailing spaces outside the closing mark
    if not nabcc and is_gaiji(text) and ' ' in text.rstrip():
        rspaces = ''
        while text[-1] == ' ':
            rspaces += ' '
            text = text[:-1]
        outbuf = '⠦' + unicode_normalize(text) + '⠴' + rspaces
        inpos2 = [0] + range(len(outbuf))
        inpos2.append(inpos2[-1])
        return (outbuf, inpos2)
    # 'あ゛ー': Unicode normalization would insert a space, so fix beforehand
    text = text.replace('あ゛', 'あ')
    text = text.replace('ヱ゛', 'ヴェ')
    text = text.replace('ヲ゛', 'ヴォ')
    text = text.replace('ワ゛', 'ヴァ')
    # tab code
    text = text.replace('\t', TAB_CODE)
    # 'ふにゃ~'
    text = text.replace('ゃ~', 'ゃー')
    text = text2mecab(text)
    mf = MecabFeatures()
    Mecab_analysis(text, mf)
    Mecab_correctFeatures(mf)
    Mecab_print(mf, logwrite, output_header = False)
    li = mecab_to_morphs(mf)
    mf = None
    # morphs containing the tab code are treated as symbol morphs
    for mo in li:
        if TAB_CODE in mo.nhyouki:
            mo.hinshi1 = '記号'
            #mo.hinshi2 = '空白'
            mo.kana = mo.yomi = mo.output = mo.nhyouki
    for mo in li:
        if mo.hinshi1 == '空白':
            mo.output = ' '
        elif mo.hinshi2 == '数' and mo.nhyouki.isdigit():
            # digit numbers (not kanji characters)
            mo.output = mo.nhyouki
    li = replace_morphs(li, CONNECTED_MORPHS)
    # strip a long-vowel mark duplicated from the following ー morph:
    # before:
    # たー,たー,助動詞,*,*,*,*,*,たー,ター,ター,1/2,ター,0
    # ー,ー,名詞,一般,*,*,*,*,*,,,,,0
    # after:
    # た,た,助動詞,*,*,*,*,*,た,タ,タ,1/2,タ,0
    # ー,ー,名詞,一般,*,*,*,*,*,,,,ー,0
    # before: 3ー,名詞,数,*,*,*,*,3ー,サンー,サンー,1/3,C0
    # after: 3,名詞,数,*,*,*,*,3,サン,サン,1/3,C0
    for pos in xrange(len(li) - 1):
        mo = li[pos]
        mo2 = li[pos + 1]
        if 'ー' in mo.hyouki and mo2.hyouki == 'ー':
            mo.hyouki = mo.kihon = mo.hyouki.replace('ー','')
            mo.nhyouki = unicode_normalize(mo.hyouki)
            mo.kana = mo.kana.replace('ー','')
            mo.yomi = mo.yomi.replace('ー','')
            if mo.hinshi2 == '数':
                mo.output = mo.nhyouki
            else:
                mo.output = mo.yomi
    # u-onbin (euphonic change) of verbs:
    # before:
    # 思う,思う,動詞,自立,*,*,五段・ワ行ウ音便,連用タ接続,思う,オモウ,オモウ,2/3,オモウ,0
    # て,て,助詞,接続助詞,*,*,*,*,て,テ,テ,0/1,テ,0
    # after:
    # 思う,思う,動詞,自立,*,*,五段・ワ行ウ音便,連用タ接続,思う,オモウ,オモウ,2/3,オモー,0
    # て,て,助詞,接続助詞,*,*,*,*,て,テ,テ,0/1,テ,0
    for pos in xrange(len(li) - 1):
        mo = li[pos]
        mo2 = li[pos + 1]
        if mo.hinshi1 == '動詞' and mo.hyouki != '言う' and len(mo.yomi) > 1 and mo.yomi[-1] == 'ウ' and mo2.yomi[:1] in ('タ', 'テ'):
            mo.output = mo.yomi[:-1] + 'ー'
    li = replace_digit_morphs(li)
    li = rewrite_number(li, logwrite)
    # auxiliary verb う is read as a long vowel:
    # before: う,う,助動詞,*,*,*,ウ,ウ,0/1,ウ,0
    # after: う,う,助動詞,*,*,*,ウ,ウ,0/1,ー,0
    for mo in li:
        if mo.hyouki == 'う' and mo.hinshi1 == '助動詞':
            mo.output = 'ー'
    # alphabet morphs output their spelling, not their reading:
    # before: a,a,記号,アルファベット,*,*,エイ,エイ,1/2,エイ,0
    # after: a,a,記号,アルファベット,*,*,エイ,エイ,1/2,a,0
    for mo in li:
        if mo.hinshi2 == 'アルファベット':
            mo.output = mo.nhyouki
    li = replace_alphabet_morphs(li, nabcc=nabcc)
    # reclassify quotation/iteration marks as symbols
    for mo in li:
        if mo.hyouki == '〝':
            mo.hinshi1 = '記号'
            mo.hinshi2 = '括弧開'
        if mo.hyouki == '〟':
            mo.hinshi1 = '記号'
            mo.hinshi2 = '括弧閉'
        if mo.hyouki == '々々々々':
            mo.hinshi1 = '記号'
            mo.hinshi2 = '一般'
        if mo.hyouki == '〻':
            # U+303B vertical ideographic iteration mark (ninojiten)
            mo.hinshi1 = '記号'
            mo.hinshi2 = '一般'
    for mo in li:
        if mo.hinshi2 in ('括弧開', '括弧閉'):
            mo.output = mo.nhyouki
    # before: , ,記号,空白,*,*, , ,*/*, ,0
    # after: , ,記号,空白,*,*, , ,*/*, ,0
    # NOTE(review): these space literals lost their width information in
    # the flattened source — presumably the comparison is the full-width
    # space U+3000; verify against the original encoding.
    for mo in li:
        if mo.hyouki == ' ': # full shape space
            mo.output = ' '
    # a lone ー noun becomes a symbol and keeps its mark:
    # before: ー,ー,名詞,一般,*,*,*,*,*,,,,,0
    # after: ー,ー,名詞,一般,*,*,*,*,*,,,,ー,0
    for mo in li:
        if mo.hyouki == 'ー' and mo.hinshi1 == '名詞':
            mo.hinshi1 = '記号'
            mo.output = 'ー'
    # a full-width apostrophe before a digit becomes half-width:
    # before:
    # ’,’,記号,括弧閉,*,*,’,’,*/*,’,0
    # 0,0,名詞,数,*,*,ゼロ,ゼロ,1/2,0,0
    # after:
    # ’,’,記号,括弧閉,*,*,’,’,*/*,',0
    # 0,0,名詞,数,*,*,ゼロ,ゼロ,1/2,0,0
    for pos in xrange(0, len(li) - 1):
        if li[pos].hyouki == '’' and li[pos+1].hinshi2 == '数':
            li[pos].output = "'"
    # a touten/nakaten between Arabic digits becomes the number sign:
    # before:
    # 二,二,名詞,数,*,*,2,2,1/2,2,0
    # 、,、,記号,読点,*,*,、,、,*/*,、,0
    # 三,三,名詞,数,*,*,3,3,1/2,3,0
    # after:
    # 二,二,名詞,数,*,*,2,2,1/2,2,0
    # 、,、,記号,読点,*,*,、,、,*/*,⠼,0
    # 三,三,名詞,数,*,*,3,3,1/2,3,0
    for pos in xrange(1, len(li) - 1):
        if li[pos-1].output.isdigit() and \
            li[pos].hyouki in ('、', '・') and \
            li[pos+1].output.isdigit():
            if nabcc:
                li[pos].output = '.'
            else:
                li[pos].output = '⠼'
    # ASCII tokens output their spelling, not their reading:
    # before: ab,ab,名詞,一般,*,*,アブ,アブ,1/2,アブ,0
    # after: ab,ab,名詞,一般,*,*,アブ,アブ,1/2,ab,0
    # before: No.,No.,接頭詞,数接続,*,*,ナンバー,ナンバー,1/4,ナンバー,0
    # after: No.,No.,接頭詞,数接続,*,*,ナンバー,ナンバー,1/4,No.,0
    for mo in li:
        if RE_ASCII_CHARS.match(mo.nhyouki):
            mo.output = mo.nhyouki
    # fill in missing output from the surface form (hiragana is mapped to
    # katakana by the +0x60 codepoint offset):
    # before: ヒロイノ,ヒロイノ,名詞,一般,*,*,,,,,0
    # after: ヒロイノ,ヒロイノ,名詞,一般,*,*,,,,ヒロイノ,0
    # before: ィ,ィ,名詞,一般,*,*,,,,,0
    # after: ィ,ィ,名詞,一般,*,*,,,,ィ,0
    # before: ぁ,ぁ,名詞,一般,*,*,,,,,0
    # after: ぁ,ぁ,名詞,一般,*,*,,,,ァ,0
    for mo in li:
        if not mo.output and mo.nhyouki != 'ー':
            if RE_KATAKANA.match(mo.nhyouki):
                mo.output = mo.nhyouki
            elif RE_HIRAGANA.match(mo.nhyouki):
                mo.output = ''.join([unichr(ord(c) + 0x60) for c in mo.nhyouki])
    # if the output is a single small katakana, promote it to full size;
    # the surface form is left untouched so should_separate() still
    # recognizes the small-kana case
    for mo in li:
        if mo.output == 'ァ': mo.output = 'ア'
        if mo.output == 'ィ': mo.output = 'イ'
        if mo.output == 'ゥ': mo.output = 'ウ'
        if mo.output == 'ェ': mo.output = 'エ'
        if mo.output == 'ォ': mo.output = 'オ'
        if mo.output == 'ッ': mo.output = 'ツ'
        if mo.output == 'ャ': mo.output = 'ヤ'
        if mo.output == 'ュ': mo.output = 'ユ'
        if mo.output == 'ョ': mo.output = 'ヨ'
        if mo.output == 'ヮ': mo.output = 'ワ'
        if mo.output == 'ヵ': mo.output = 'カ'
        if mo.output == 'ヶ': mo.output = 'ケ'
    # Unicode-normalize symbols; resolve iteration marks (odoriji) against
    # the preceding morph's output
    for i in xrange(0, len(li)):
        mo = li[i]
        if mo.hinshi1 == '記号' and mo.hinshi2 == '一般':
            if mo.hyouki == '〻':
                mo.output = 'ニノジテン'
            elif mo.hyouki == 'ゝ' and i > 0:
                mo.output = to_no_dakuon_kana(li[i-1].output[-1:])
            elif mo.hyouki == 'ゞ' and i > 0:
                mo.output = to_dakuon_kana(li[i-1].output[-1:])
            elif mo.hyouki == 'ヽ' and i > 0:
                mo.output = to_no_dakuon_kana(li[i-1].output[-1:])
            elif mo.hyouki == 'ヾ' and i > 0:
                mo.output = to_dakuon_kana(li[i-1].output[-1:])
            elif mo.hyouki == '々々々々' and i > 0:
                mo.output = li[i-1].output * 4
            elif mo.hyouki == '々々' and i > 0:
                mo.output = li[i-1].output * 2
            elif mo.hyouki == '々' and i > 0:
                if li[i-1].hyouki[0] == '々' and i > 1:
                    mo.output = li[i-2].output
                elif len(li[i-1].hyouki) == 1:
                    mo.output = li[i-1].output
                else:
                    mo.output = '' # FIXME
            else:
                mo.output = mo.nhyouki
        if mo.hyouki == '.' and mo.hinshi1 == '名詞' and mo.hinshi2 == '数':
            mo.output = '.'
        if mo.hyouki == ',' and mo.hinshi1 == '名詞' and mo.hinshi2 == '数':
            mo.output = ','
        if mo.hinshi1 == '記号' and mo.hinshi2 == '句点' and mo.nhyouki == '.':
            mo.output = '.'
        if mo.hinshi1 == '記号' and mo.hinshi2 == '読点' and mo.nhyouki == ',':
            mo.output = ','
    for mo in li:
        # computer-braille start/end marks
        # NOTE(review): this parses as
        # (RE_INFOMATION.match and '@'-in) or '://'-in or '\\'-in
        # because `and` binds tighter than `or`; it looks like the intent
        # was RE_INFOMATION.match(...) and ('@' in ... or '://' in ... or
        # '\\' in ...) — confirm before changing.
        if RE_INFOMATION.match(mo.nhyouki) and \
            ('@' in mo.nhyouki) or ('://' in mo.nhyouki) or ('\\' in mo.nhyouki):
            if nabcc:
                mo.output = mo.nhyouki
            else:
                mo.output = '⠠⠦' + mo.nhyouki + '⠠⠴'
        # foreign-language quotation marks; a single word without spaces
        # uses the foreign letter sign instead of quotation marks
        elif ( RE_GAIJI.match(mo.nhyouki) and \
            ((' ' in mo.nhyouki) or ("'" in mo.nhyouki)) ) or ( ('.' in mo.nhyouki) and \
            len(mo.nhyouki) > 3 ):
            if nabcc:
                mo.output = mo.nhyouki
            else:
                mo.output = '⠦' + mo.nhyouki + '⠴'
    if not nabcc:
        for mo in li:
            # when output contains & and is neither computer braille nor a
            # foreign quotation, pad the & with spaces
            if not mo.output.startswith('⠠⠦') and not mo.output.startswith('⠦'):
                # &
                if mo.output == '&':
                    continue
                # &xx
                elif mo.output.startswith('&'):
                    mo.output = mo.output.replace('&', '& ')
                # xx&
                elif mo.output.endswith('&'):
                    mo.output = mo.output.replace('&', ' &')
                # xx&xx
                else:
                    mo.output = mo.output.replace('&', ' & ')
    if nabcc:
        for mo in li:
            mo.output = mo.output.replace('”', '"').replace('’', "'").replace('‘', '`')
    # Japanese-style (wago) reading of dates
    li = fix_japanese_date_morphs(li)
    # a comma directly after Japanese text is interpreted as '、':
    # before: ,,記号,読点,*,*,*,*,,,,,,,*/*,*
    # after: 、,記号,読点,*,*,*,*,、,、,、,*/*,*
    for pos in xrange(len(li) - 1):
        mo = li[pos]
        mo2 = li[pos + 1]
        if mo2.hyouki == ',' and not ( mo.hinshi2 in ('アルファベット', '数', '括弧閉') ):
            mo2.hyouki = mo2.nhyouki = mo2.output = '、'
    # decide word separation (wakachigaki) for each boundary
    for i in xrange(1, len(li)):
        prev2_mo = li[i-2] if i-2 >= 0 else None
        prev_mo = li[i-1]
        next_mo = li[i+1] if i+1 < len(li) else None
        li[i-1].sepflag = should_separate(prev2_mo, prev_mo, li[i], next_mo, nabcc=nabcc, logwrite=logwrite)
    # do not translate if the string is already Unicode braille
    for i in xrange(0, len(li)):
        mo = li[i]
        if all((0x2800 <= ord(c) <= 0x28ff or c == '\u3000') for c in mo.hyouki):
            mo.output = mo.hyouki.replace('\u3000', ' ')
            mo.sepflag = False
            if i > 0:
                li[i-1].sepflag = False
    for mo in li:
        mo.write(logwrite)
    logwrite('')
    outbuf, inpos2 = morphs_to_string(li, inbuf, logwrite)
    # render the tab placeholder: dot-7 cell for NABCC, space otherwise
    if nabcc:
        outbuf = outbuf.replace(TAB_CODE, '⡀')
    else:
        outbuf = outbuf.replace(TAB_CODE, ' ')
    return (outbuf, inpos2)