Пример #1
0
def replace_alphabet_morphs(li, nabcc=False):
	"""Merge runs of alphabet/symbol morphs into single morphs.

	Words written only with alphabetic letters or symbols are joined, and
	symbols that form part of an information-processing-braille string are
	merged with their neighbours.

	Example input (one morph per line):
	  B,B,記号,アルファベット,*,*,ビー,ビー,1/2,B
	  asi,asi,名詞,一般,*,*,アシー,アシー,0/3,asi
	  c,c,記号,アルファベット,*,*,シー,シー,1/2,c
	Example output:
	  Basic,Basic,名詞,アルファベット,*,*,ビーアシーシー,ビーアシーシー,1/2,Basic

	Parameters:
	  li:    sequence of morph objects; ``nhyouki`` and ``hinshi1`` are read.
	  nabcc: when true, the characters in '”’‘_' also join a run.

	Returns a new list in which every collected run is replaced by the
	morph produced by concatinate_morphs(); other morphs are passed through.
	"""
	def combine(run):
		# Collapse one run into a single morph and normalise its surface.
		joined = concatinate_morphs(run)
		normalized = unicode_normalize(joined.nhyouki)
		joined.nhyouki = normalized
		joined.output = normalized
		set_pos_of_alphabets(joined)
		return joined

	merged = []
	pending = []
	for pos, mo in enumerate(li):
		next_mo = li[pos + 1] if pos + 1 < len(li) else None
		surface = mo.nhyouki
		# The clauses are tried strictly in this order; `or` short-circuits
		# so a later clause is evaluated only when all earlier ones failed.
		joins = (
			is_alpha_or_single(surface)
			or (surface and surface in r',+@/#$%&*;<')
			or surface == '\\'
			or (surface and surface[0] in r',+@/#$%&*;' and
				RE_ASCII_SYMBOLS.match(surface))
			# comma/period inside a run, when followed by a space, a
			# particle / auxiliary verb, or nothing at all
			or (pending and surface in ',.' and
				(not next_mo or
				 next_mo.nhyouki == ' ' or
				 next_mo.hinshi1 in ('助詞', '助動詞')))
			# a space inside a run, when more alphabet follows
			or (pending and surface == ' ' and
				next_mo and is_alpha_or_single(next_mo.nhyouki))
			or (pending and surface.isdigit())
			or (pending and surface in ',.:;!?@#\\$%&*|+-/=<>"\'^`_~{}[],')
			or (nabcc and surface in '”’‘_')
			# a square bracket may start a run
			or (not pending and surface in '[]')
		)
		if joins:
			pending.append(mo)
			continue
		if pending:
			merged.append(combine(pending))
			pending = []
		merged.append(mo)
	if pending:
		merged.append(combine(pending))
	return merged
Пример #2
0
def getKanaFromRoma(roma):
    """Convert a romaji string to kana using the ``romadic`` table.

    The input is Unicode-normalised, then every ``romadic`` entry is applied
    in table order as a plain substring replacement (entry[0] -> entry[1]).

    Returns the converted string when every resulting character matches
    ``[ァ-ヾ]``; returns None otherwise, and also for the excluded word
    "youtube".
    """
    text = unicode_normalize(roma)
    if text == "youtube":
        return None
    for entry in romadic:
        text = text.replace(entry[0], entry[1])
    # Reject the result if any character is left outside the katakana range.
    if not all(re.search("[ァ-ヾ]", ch) for ch in text):
        return None
    return text
Пример #3
0
def replace_morphs(li, dic):
	"""Expand morphs whose surface form is a key of ``dic``.

	Parameters:
	  li:  sequence of morph objects.
	  dic: mapping from a surface form (``hyouki``) to a sequence of
	       replacement entries.  Each entry is indexed as
	       [0] surface, [1] reading, [2] accent, [3]-[5] optional
	       replacements for hinshi1-hinshi3 (ignored when falsy).

	Returns a new list.  A matching morph is replaced by one deep copy per
	entry, with the entry's fields applied; other morphs are passed through
	unchanged (same objects).
	"""
	new_li = []
	for mo in li:
		# Test membership on the mapping itself: under Python 2
		# ``dic.keys()`` builds a list and scans it for every morph.
		if mo.hyouki not in dic:
			new_li.append(mo)
			continue
		for i in dic[mo.hyouki]:
			m = copy.deepcopy(mo)
			m.hyouki = i[0] # e.g. に
			m.nhyouki = unicode_normalize(i[0]) # e.g. に
			if i[3]: m.hinshi1 = i[3]
			if i[4]: m.hinshi2 = i[4]
			if i[5]: m.hinshi3 = i[5]
			m.kana = i[0]
			m.output = m.yomi = unicode_normalize(i[1]) # e.g. ニ
			m.accent = i[2] # e.g. 0/1
			new_li.append(m)
	return new_li
Пример #4
0
def replace_digit_morphs(li):
	"""Join consecutive number morphs (kanji numerals or digits) into one.

	Example: 十 + 七 (both 名詞,数) become 十七; 二 + 十 + 五 become 二十五.

	The unit characters listed in the code below are not joined, so
	  三 兆 二 千 四 百 万
	becomes
	  三 / 兆 / 二千四百 / 万      (intended braille: 3チョー 2400マン)

	A ',' tagged as a number closes the current run and is emitted as its
	own morph with yomi and output set to ','.

	Parameters:
	  li: sequence of morph objects; ``hinshi2``, ``hyouki`` and ``output``
	      are read.

	Returns a new list in which each collected run is replaced by the result
	of concatinate_morphs().
	"""
	result = []
	run = []
	for mo in li:
		is_number = mo.hinshi2 == '数'

		if is_number and mo.hyouki == ',' and run:
			# comma: close the run, then emit the comma by itself
			result.append(concatinate_morphs(run))
			comma = copy.deepcopy(mo)
			comma.yomi = ','
			comma.output = ','
			result.append(concatinate_morphs([comma]))
			run = []
			continue

		if is_number and not mo.output.isdigit() and \
				mo.hyouki not in ('・', '万', '億', '兆', '京', '.'):
			# join kanji numerals
			run.append(mo)
			continue

		if is_number and mo.hyouki in '0123456789':
			# join Arabic numerals, using the normalised digit for every field
			digit = copy.deepcopy(mo)
			y = unicode_normalize(digit.hyouki)
			digit.output = y
			digit.hyouki = y
			digit.nhyouki = y
			digit.yomi = y
			run.append(digit)
			continue

		# anything else ends the current run
		if run:
			result.append(concatinate_morphs(run))
			run = []
		result.append(mo)

	if run:
		result.append(concatinate_morphs(run))
	return result
Пример #5
0
def runTasks():
    """Run every entry of ``tests`` through jtalkPrepare.convert().

    Each entry is indexed as [0] input text and [1] expected result.  The
    input is Unicode-normalised before conversion.  Mismatches are reported
    through ``_print``.

    Returns the number of mismatching entries.
    """
    jtalkPrepare.setup()
    failures = 0
    for case in tests:
        expected = case[1]
        actual = jtalkPrepare.convert(unicode_normalize(case[0]))
        if expected != actual:
            _print("expected:%s result:%s" % (expected, actual))
            failures += 1
    return failures
Пример #6
0
def runTasks():
	"""Run every entry of ``tests`` through jtalkPrepare.convert().

	Each entry is indexed as [0] input text and [1] expected result.  The
	input is Unicode-normalised before conversion.  For every mismatch the
	raw input, the normalised input, the result and the expected value are
	reported through ``_print``.

	Returns the number of mismatching entries.
	"""
	jtalkPrepare.setup()
	failures = 0
	for case in tests:
		raw = case[0]
		expected = case[1]
		normalized = unicode_normalize(raw)
		actual = jtalkPrepare.convert(normalized)
		if expected != actual:
			report = (raw, normalized, actual, expected)
			_print('input:%s normalized:%s result:%s expected:%s' % report)
			failures += 1
	return failures
Пример #7
0
def mecab_to_morphs(mf):
	"""Convert a MeCab feature buffer into a list of MecabMorph objects.

	Parameters:
	  mf: object exposing ``feature`` (indexable, one C string per morph)
	      and ``size`` (number of morphs).  None, or None in either
	      attribute, yields an empty list.

	Each feature is decoded with CODE (undecodable bytes are ignored) and
	split on ','.  Fields are copied by position:
	  0 hyouki (plus its normalised form in nhyouki), 1 hinshi1, 2 hinshi2,
	  3 hinshi3, 4 hinshi4, 5 type1, 6 type2, 7 kihon, 8 kana, 9 yomi,
	  10 accent, 12 braille output (optional extension field).
	Optional fields that are missing are left at whatever MecabMorph()
	initialised them to.  The first three fields are required.

	Returns the list of morphs; empty features are skipped.
	"""
	li = []
	if mf is None or mf.feature is None or mf.size is None:
		return li
	for i in xrange(0, mf.size):
		s = string_at(mf.feature[i])
		if not s:
			continue
		ar = s.decode(CODE, 'ignore').split(",")
		n = len(ar)
		mo = MecabMorph()
		mo.hyouki = ar[0]
		mo.nhyouki = unicode_normalize(ar[0])
		mo.hinshi1 = ar[1]
		mo.hinshi2 = ar[2]
		if n > 3:
			mo.hinshi3 = ar[3]
		# hinshi4 needs its own guard: a 4-field feature has no ar[4].
		if n > 4:
			mo.hinshi4 = ar[4]
		if n > 5:
			mo.type1 = ar[5]
		if n > 6:
			mo.type2 = ar[6]
		if n > 7:
			mo.kihon = ar[7]
		if n > 9:
			mo.kana = unicode_normalize(ar[8])
			# The pronunciation field may contain the mark ’, e.g.
			# ありがとうございますー,感動詞,*,*,*,*,*,ありがとうございますー,アリガトウゴザイマスー,アリガトーゴザイマス’ー,0/1,C0
			# and it is removed from the reading.
			mo.yomi = unicode_normalize(ar[9]).replace("’", "")
			# accent needs its own guard: a 10-field feature has no ar[10].
			if n > 10:
				mo.accent = ar[10]
			if n > 12:
				# Use the braille notation from the MeCab dictionary's
				# extension field when it is present.
				mo.output = unicode_normalize(ar[12])
			else:
				mo.output = mo.yomi
				update_phonetic_symbols(mo)
		mo.sepflag = False
		li.append(mo)
	return li
Пример #8
0
def Mecab_get_reading(mf, CODE_=CODE):
	"""Collect the reading and the braille notation of every morph in ``mf``.

	Parameters:
	  mf:    feature buffer; ``mf.size`` morphs are read through
	         Mecab_getFeature().
	  CODE_: encoding name forwarded to Mecab_getFeature().

	For each morph the reading is field 9 (with U+3000 replaced by a space)
	when the feature has more than nine fields; otherwise it is the
	normalised field 0, or empty when field 0 is 'ー'.  The braille part is
	field 12 when present, else the reading.

	Returns (reading, braille): the readings concatenated, and the braille
	parts joined with '/', with trailing spaces and slashes stripped.
	"""
	readings = []
	cells = []
	for pos in xrange(0, mf.size):
		fields = Mecab_getFeature(mf, pos, CODE_=CODE_).split(',')
		if len(fields) > 9:
			rd = fields[9].replace('\u3000', ' ')
		elif fields[0] != 'ー':
			rd = unicode_normalize(fields[0])
		else:
			rd = ''
		readings.append(rd)
		cells.append(fields[12] if len(fields) > 12 else rd)
	reading = ''.join(readings)
	braille = ''.join(cell + r"/" for cell in cells)
	return (reading, braille.rstrip(r" /"))
Пример #9
0
def replaceJapaneseFromSpeechSequence(speechSequence):
	"""Normalise the strings of a speech sequence and replace Japanese text.

	Character mode must not apply to replaced Japanese text, so when such
	text is emitted while character mode is on, it is wrapped in
	CharacterModeCommand(False) ... CharacterModeCommand(True).

	Parameters:
	  speechSequence: iterable of strings and command objects.

	Returns a new list.  Strings are Unicode-normalised, and those for which
	isJapaneseLang() is true are passed through replaceJapanese().  All
	other items are copied unchanged.
	"""
	a = []
	# State of the most recent CharacterModeCommand seen in the sequence.
	# The previous code stored this in an unused variable, so the wrapping
	# branch below could never run.
	charmode = False
	for item in speechSequence:
		if isinstance(item, basestring):
			item = unicode_normalize(item)
			if isJapaneseLang(item):
				item = replaceJapanese(item)
				if charmode:
					# Suspend character mode only around the replaced text.
					a.append(CharacterModeCommand(False))
					a.append(item)
					a.append(CharacterModeCommand(True))
					continue
		elif isinstance(item, CharacterModeCommand):
			charmode = item.state
		a.append(item)
	return a
Пример #10
0
def japanese_braille_separate(inbuf, logwrite, nabcc=False):
	"""Analyse text with MeCab and build the word-separated braille source string.

	Parameters:
	  inbuf:    input text.
	  logwrite: callable taking one string; it receives the MeCab dump, the
	            per-morph trace and a final empty line, and is forwarded to
	            several helpers.
	  nabcc:    when true, the fast paths for multibyte alphanumerics and
	            foreign-letter text are skipped, braille indicator cells are
	            not added around ASCII words, some typographic quotes are
	            mapped to ASCII, and TAB_CODE is rendered as '⡀' instead of
	            a space.

	Returns:
	  (outbuf, inpos2).  outbuf is the resulting string.  inpos2 is an
	  xrange or list on the fast paths and whatever morphs_to_string()
	  returns otherwise -- presumably a map from output positions back to
	  input positions; TODO confirm against morphs_to_string().

	The body is a sequence of passes over the morph list; later passes
	overwrite ``output`` values set by earlier ones, so their order matters.
	"""
	text = inbuf
	# Fast path: text matching RE_HALF_KATAKANA is returned unchanged.
	if RE_HALF_KATAKANA.match(text):
		outbuf = text
		inpos2 = xrange(len(outbuf))
		return (outbuf, inpos2)

	# Fast path: text matching RE_MB_ALPHA_NUM_SPACE only needs normalising.
	if not nabcc and RE_MB_ALPHA_NUM_SPACE.match(text):
		outbuf = unicode_normalize(text)
		inpos2 = xrange(len(outbuf))
		return (outbuf, inpos2)

	# Fast path: foreign-letter text containing an inner space is wrapped in
	# the cells ⠦ ... ⠴; trailing spaces are kept outside the closing cell.
	if not nabcc and is_gaiji(text) and ' ' in text.rstrip():
		rspaces = ''
		while text[-1] == ' ':
			rspaces += ' '
			text = text[:-1]
		outbuf = '⠦' + unicode_normalize(text) + '⠴' + rspaces
		# List concatenation: relies on range() returning a list (Python 2).
		inpos2 = [0] + range(len(outbuf))
		inpos2.append(inpos2[-1])
		return (outbuf, inpos2)

	# Unicode normalisation inserts a space into sequences such as 'あ゛ー',
	# so rewrite them beforehand.
	text = text.replace('あ゛', 'あ')
	text = text.replace('ヱ゛', 'ヴェ')
	text = text.replace('ヲ゛', 'ヴォ')
	text = text.replace('ワ゛', 'ヴァ')

	# tab code
	text = text.replace('\t', TAB_CODE)

	# e.g. 'ふにゃ~': a tilde after small ゃ becomes a long-vowel mark
	text = text.replace('ゃ~', 'ゃー')

	text = text2mecab(text)
	mf = MecabFeatures()
	Mecab_analysis(text, mf)
	Mecab_correctFeatures(mf)
	Mecab_print(mf, logwrite, output_header = False)
	li = mecab_to_morphs(mf)
	mf = None

	# Morphs containing TAB_CODE become symbols that read as themselves.
	for mo in li:
		if TAB_CODE in mo.nhyouki:
			mo.hinshi1 = '記号'
			#mo.hinshi2 = '空白'
			mo.kana = mo.yomi = mo.output = mo.nhyouki

	for mo in li:
		if mo.hinshi1 == '空白':
			mo.output = ' '
		elif mo.hinshi2 == '数' and mo.nhyouki.isdigit():
			# digit numbers (not kanji characters)
			mo.output = mo.nhyouki

	li = replace_morphs(li, CONNECTED_MORPHS)

	# Strip the long-vowel mark from a morph that is followed by a
	# stand-alone 'ー' morph.
	# before:
	# たー,たー,助動詞,*,*,*,*,*,たー,ター,ター,1/2,ター,0
	# ー,ー,名詞,一般,*,*,*,*,*,,,,,0
	# after:
	# た,た,助動詞,*,*,*,*,*,た,タ,タ,1/2,タ,0
	# ー,ー,名詞,一般,*,*,*,*,*,,,,ー,0

	# before: 3ー,名詞,数,*,*,*,*,3ー,サンー,サンー,1/3,C0
	# after:  3,名詞,数,*,*,*,*,3,サン,サン,1/3,C0
	for pos in xrange(len(li) - 1):
		mo = li[pos]
		mo2 = li[pos + 1]
		if 'ー' in mo.hyouki and mo2.hyouki == 'ー':
			mo.hyouki = mo.kihon = mo.hyouki.replace('ー','')
			mo.nhyouki = unicode_normalize(mo.hyouki)
			mo.kana = mo.kana.replace('ー','')
			mo.yomi = mo.yomi.replace('ー','')
			if mo.hinshi2 == '数':
				mo.output = mo.nhyouki
			else:
				mo.output = mo.yomi

	# u-onbin (euphonic change) of verbs: a verb reading that ends in ウ and
	# is followed by a morph whose reading starts with タ or テ gets a
	# long-vowel mark in place of the final ウ.  The verb 言う is excluded.
	# before:
	# 思う,思う,動詞,自立,*,*,五段・ワ行ウ音便,連用タ接続,思う,オモウ,オモウ,2/3,オモウ,0
	# て,て,助詞,接続助詞,*,*,*,*,て,テ,テ,0/1,テ,0
	# after:
	# 思う,思う,動詞,自立,*,*,五段・ワ行ウ音便,連用タ接続,思う,オモウ,オモウ,2/3,オモー,0
	# て,て,助詞,接続助詞,*,*,*,*,て,テ,テ,0/1,テ,0
	for pos in xrange(len(li) - 1):
		mo = li[pos]
		mo2 = li[pos + 1]
		if mo.hinshi1 == '動詞' and mo.hyouki != '言う' and len(mo.yomi) > 1 and mo.yomi[-1] == 'ウ' and mo2.yomi[:1] in ('タ', 'テ'):
			mo.output = mo.yomi[:-1] + 'ー'

	li = replace_digit_morphs(li)
	li = rewrite_number(li, logwrite)

	# The auxiliary verb う is output as a long-vowel mark.
	# before: う,う,助動詞,*,*,*,ウ,ウ,0/1,ウ,0
	# after:  う,う,助動詞,*,*,*,ウ,ウ,0/1,ー,0
	for mo in li:
		if mo.hyouki == 'う' and mo.hinshi1 == '助動詞':
			mo.output = 'ー'

	# Alphabet morphs are output as their normalised surface, not their reading.
	# before: a,a,記号,アルファベット,*,*,エイ,エイ,1/2,エイ,0
	# after:  a,a,記号,アルファベット,*,*,エイ,エイ,1/2,a,0
	for mo in li:
		if mo.hinshi2 == 'アルファベット':
			mo.output = mo.nhyouki

	li = replace_alphabet_morphs(li, nabcc=nabcc)

	# Re-tag a few characters as symbols.
	for mo in li:
		if mo.hyouki == '〝':
			mo.hinshi1 = '記号'
			mo.hinshi2 = '括弧開'
		if mo.hyouki == '〟':
			mo.hinshi1 = '記号'
			mo.hinshi2 = '括弧閉'
		if mo.hyouki == '々々々々':
			mo.hinshi1 = '記号'
			mo.hinshi2 = '一般'
		if mo.hyouki == '〻':
			# U+303B vertical ideographic iteration mark (ninojiten)
			mo.hinshi1 = '記号'
			mo.hinshi2 = '一般'

	# Opening and closing brackets are output as their normalised surface.
	for mo in li:
		if mo.hinshi2 in ('括弧開', '括弧閉'):
			mo.output = mo.nhyouki

	# A space morph is output as a single space.
	# NOTE(review): the original comment says "full shape space", but the
	# literal compared below looks like an ordinary space in this copy of
	# the file -- confirm which character is intended.
	for mo in li:
		if mo.hyouki == ' ': # full shape space
			mo.output = ' '

	# A stand-alone long-vowel mark tagged as a noun becomes a symbol.
	# before: ー,ー,名詞,一般,*,*,*,*,*,,,,,0
	# after:  ー,ー,名詞,一般,*,*,*,*,*,,,,ー,0
	for mo in li:
		if mo.hyouki == 'ー' and mo.hinshi1 == '名詞':
			mo.hinshi1 = '記号'
			mo.output = 'ー'

	# Convert a typographic apostrophe in front of a number to ASCII "'".
	# before:
	# ’,’,記号,括弧閉,*,*,’,’,*/*,’,0
	# 0,0,名詞,数,*,*,ゼロ,ゼロ,1/2,0,0
	# after:
	# ’,’,記号,括弧閉,*,*,’,’,*/*,',0
	# 0,0,名詞,数,*,*,ゼロ,ゼロ,1/2,0,0
	for pos in xrange(0, len(li) - 1):
		if li[pos].hyouki == '’' and li[pos+1].hinshi2 == '数':
			li[pos].output = "'"

	# A Japanese comma or middle dot between two digit outputs becomes the
	# number sign ('.' in nabcc mode).
	# before:
	# 二,二,名詞,数,*,*,2,2,1/2,2,0
	# 、,、,記号,読点,*,*,、,、,*/*,、,0
	# 三,三,名詞,数,*,*,3,3,1/2,3,0
	# after:
	# 二,二,名詞,数,*,*,2,2,1/2,2,0
	# 、,、,記号,読点,*,*,、,、,*/*,⠼,0
	# 三,三,名詞,数,*,*,3,3,1/2,3,0
	for pos in xrange(1, len(li) - 1):
		if li[pos-1].output.isdigit() and \
				li[pos].hyouki in ('、', '・') and \
				li[pos+1].output.isdigit():
			if nabcc:
				li[pos].output = '.'
			else:
				li[pos].output = '⠼'

	# Morphs whose normalised surface matches RE_ASCII_CHARS are output as
	# written instead of as their reading.
	# before: ab,ab,名詞,一般,*,*,アブ,アブ,1/2,アブ,0
	# after:  ab,ab,名詞,一般,*,*,アブ,アブ,1/2,ab,0
	# before: No.,No.,接頭詞,数接続,*,*,ナンバー,ナンバー,1/4,ナンバー,0
	# after:  No.,No.,接頭詞,数接続,*,*,ナンバー,ナンバー,1/4,No.,0
	for mo in li:
		if RE_ASCII_CHARS.match(mo.nhyouki):
			mo.output = mo.nhyouki

	# Morphs that still have no output: katakana surfaces are copied, and
	# hiragana surfaces are shifted by 0x60 code points (the distance from
	# the hiragana block to the katakana block).
	# before: ヒロイノ,ヒロイノ,名詞,一般,*,*,,,,,0
	# after:  ヒロイノ,ヒロイノ,名詞,一般,*,*,,,,ヒロイノ,0
	# before: ィ,ィ,名詞,一般,*,*,,,,,0
	# after:  ィ,ィ,名詞,一般,*,*,,,,ィ,0
	# before: ぁ,ぁ,名詞,一般,*,*,,,,,0
	# after:  ぁ,ぁ,名詞,一般,*,*,,,,ァ,0
	for mo in li:
		if not mo.output and mo.nhyouki != 'ー':
			if RE_KATAKANA.match(mo.nhyouki):
				mo.output = mo.nhyouki
			elif RE_HIRAGANA.match(mo.nhyouki):
				mo.output = ''.join([unichr(ord(c) + 0x60) for c in mo.nhyouki])

	# If the whole output is one small katakana letter, use the full-size
	# letter instead.  The surface form is left as it is; according to the
	# original author, should_separate() still treats it as a small letter.
	for mo in li:
		if mo.output == 'ァ': mo.output = 'ア'
		if mo.output == 'ィ': mo.output = 'イ'
		if mo.output == 'ゥ': mo.output = 'ウ'
		if mo.output == 'ェ': mo.output = 'エ'
		if mo.output == 'ォ': mo.output = 'オ'
		if mo.output == 'ッ': mo.output = 'ツ'
		if mo.output == 'ャ': mo.output = 'ヤ'
		if mo.output == 'ュ': mo.output = 'ユ'
		if mo.output == 'ョ': mo.output = 'ヨ'
		if mo.output == 'ヮ': mo.output = 'ワ'
		if mo.output == 'ヵ': mo.output = 'カ'
		if mo.output == 'ヶ': mo.output = 'ケ'

	# General symbols are output in normalised form, except iteration marks
	# (odoriji), which repeat part of the previous morph's output.
	for i in xrange(0, len(li)):
		mo = li[i]
		if mo.hinshi1 == '記号' and mo.hinshi2 == '一般':
			if mo.hyouki == '〻':
				mo.output = 'ニノジテン'
			elif mo.hyouki == 'ゝ' and i > 0:
				mo.output = to_no_dakuon_kana(li[i-1].output[-1:])
			elif mo.hyouki == 'ゞ' and i > 0:
				mo.output = to_dakuon_kana(li[i-1].output[-1:])
			elif mo.hyouki == 'ヽ' and i > 0:
				mo.output = to_no_dakuon_kana(li[i-1].output[-1:])
			elif mo.hyouki == 'ヾ' and i > 0:
				mo.output = to_dakuon_kana(li[i-1].output[-1:])
			elif mo.hyouki == '々々々々' and i > 0:
				mo.output = li[i-1].output * 4
			elif mo.hyouki == '々々' and i > 0:
				mo.output = li[i-1].output * 2
			elif mo.hyouki == '々' and i > 0:
				# After another iteration mark, repeat the morph before it.
				if li[i-1].hyouki[0] == '々' and i > 1:
					mo.output = li[i-2].output
				elif len(li[i-1].hyouki) == 1:
					mo.output = li[i-1].output
				else:
					mo.output = '' # FIXME
			else:
				mo.output = mo.nhyouki
		# Periods and commas tagged as numbers or as punctuation are output
		# as the ASCII characters.
		if mo.hyouki == '.' and mo.hinshi1 == '名詞' and mo.hinshi2 == '数':
			mo.output = '.'
		if mo.hyouki == ',' and mo.hinshi1 == '名詞' and mo.hinshi2 == '数':
			mo.output = ','
		if mo.hinshi1 == '記号' and mo.hinshi2 == '句点' and mo.nhyouki == '.':
			mo.output = '.'
		if mo.hinshi1 == '記号' and mo.hinshi2 == '読点' and mo.nhyouki == ',':
			mo.output = ','

	for mo in li:
		# Opening and closing indicators for information-processing braille.
		# NOTE(review): `and` binds tighter than `or`, so this condition is
		# (RE_INFOMATION match and '@' present) or '://' present or
		# backslash present; the regex does not guard the last two tests.
		# Confirm whether that is intended.
		if RE_INFOMATION.match(mo.nhyouki) and \
				('@' in mo.nhyouki) or ('://' in mo.nhyouki) or ('\\' in mo.nhyouki):
			if nabcc:
				mo.output = mo.nhyouki
			else:
				mo.output = '⠠⠦' + mo.nhyouki + '⠠⠴'
		# Foreign-language quotation indicators.
		# A single word without spaces is meant to use the foreign-letter
		# sign rather than foreign-language quotes, so only text containing
		# a space or an apostrophe is wrapped here -- as is any surface
		# longer than three characters that contains a period.
		elif (
			RE_GAIJI.match(mo.nhyouki) and \
			((' ' in mo.nhyouki) or ("'" in mo.nhyouki))
		) or (
			('.' in mo.nhyouki) and \
			len(mo.nhyouki) > 3
		):
			if nabcc:
				mo.output = mo.nhyouki
			else:
				mo.output = '⠦' + mo.nhyouki + '⠴'

	if not nabcc:
		for mo in li:
			# If the output is neither information-processing braille nor
			# foreign-quoted, put a space on the open side(s) of each '&'.
			if not mo.output.startswith('⠠⠦') and not mo.output.startswith('⠦'):
				# &
				if mo.output == '&':
					continue
				# &xx
				elif mo.output.startswith('&'):
					mo.output = mo.output.replace('&', '& ')
				# xx&
				elif mo.output.endswith('&'):
					mo.output = mo.output.replace('&', ' &')
					# xx&xx
				else:
					mo.output = mo.output.replace('&', ' & ')
	
	# In nabcc mode typographic quotes are mapped to ASCII characters.
	if nabcc:
		for mo in li:
			mo.output = mo.output.replace('”', '"').replace('’', "'").replace('‘', '`')

	# Native Japanese (wago) readings for dates.
	li = fix_japanese_date_morphs(li)

	# A comma that does not follow an alphabet, number or closing-bracket
	# morph is treated as the Japanese comma '、'.
	# before: ,,記号,読点,*,*,*,*,,,,,,,*/*,*
	# after:  、,記号,読点,*,*,*,*,、,、,、,*/*,*
	for pos in xrange(len(li) - 1):
		mo = li[pos]
		mo2 = li[pos + 1]
		if mo2.hyouki == ',' and not (
				mo.hinshi2 in ('アルファベット', '数', '括弧閉')
		):
			mo2.hyouki = mo2.nhyouki = mo2.output = '、'

	# Word-separation (wakachigaki) decision: the flag stored on morph i-1
	# is computed from morphs i-2, i-1, i and i+1.
	for i in xrange(1, len(li)):
		prev2_mo = li[i-2] if i-2 >= 0 else None
		prev_mo = li[i-1]
		next_mo = li[i+1] if i+1 < len(li) else None
		li[i-1].sepflag = should_separate(prev2_mo, prev_mo, li[i], next_mo, nabcc=nabcc, logwrite=logwrite)

	# do not translate if string is unicode braille
	# (U+2800-U+28FF, with U+3000 turned into a space); the separation flags
	# of that morph and of the one before it are cleared as well.
	for i in xrange(0, len(li)):
		mo = li[i]
		if all((0x2800 <= ord(c) <= 0x28ff or c == '\u3000') for c in mo.hyouki):
			mo.output = mo.hyouki.replace('\u3000', ' ')
			mo.sepflag = False
			if i > 0:
				li[i-1].sepflag = False

	# Trace the final state of every morph, followed by an empty line.
	for mo in li:
		mo.write(logwrite)
	logwrite('')

	outbuf, inpos2 = morphs_to_string(li, inbuf, logwrite)

	# Render the tab placeholder.
	if nabcc:
		outbuf = outbuf.replace(TAB_CODE, '⡀')
	else:
		outbuf = outbuf.replace(TAB_CODE, ' ')

	return (outbuf, inpos2)