def test_kakasi_unknown_rule(self):
    with self.assertRaises(UnsupportedRomanRulesException):
        kakasi = pykakasi.kakasi()
        kakasi.setMode("H", "a")
        kakasi.setMode("K", "a")
        kakasi.setMode("J", "a")
        kakasi.setMode("r", "hogefuga")
def test_kakasi_kunrei(self):
    TESTS = [
        ("構成", "Kousei"),
        ("好き", "Suki"),
        ("大きい", "Ookii"),
        ("かんたん", "kantan"),
        ("にゃ", "nya"),
        ("っき", "kki"),
        ("っふぁ", "ffa"),
        ("漢字とひらがな交じり文", "Kanzi tohiragana Maziri Bun"),
        ("Alphabet 123 and 漢字", "Alphabet 123 and Kanzi"),
        ("日経新聞", "Nikkeisinbun"),
        ("日本国民は、", "Nihonkokumin ha,")
    ]
    kakasi = pykakasi.kakasi()
    kakasi.setMode("H", "a")
    kakasi.setMode("K", "a")
    kakasi.setMode("J", "a")
    kakasi.setMode("r", "Kunrei")
    kakasi.setMode("C", True)
    kakasi.setMode("s", True)
    kakasi.setMode("E", "a")
    converter = kakasi.getConverter()
    for case, result in TESTS:
        self.assertEqual(converter.do(case), result)
def test_kakasi_J2H(self):
    TESTS = [
        (u"", ""),
        (u"構成", u"こうせい"),
        (u"好き", u"すき"),
        (u"大きい", u"おおきい"),
        (u"かんたん", u"かんたん"),
        (u"にゃ", u"にゃ"),
        (u"っき", u"っき"),
        (u"っふぁ", u"っふぁ"),
        (u"漢字とひらがな交じり文", u"かんじとひらがなまじりぶん"),
        (u"Alphabet 123 and 漢字", u"Alphabet 123 and かんじ"),
        (u"日経新聞", u"にっけいしんぶん"),
        (u"日本国民は、", u"にほんこくみんは、"),
        (u"苦々しい", u"にがにがしい")
    ]
    kakasi = pykakasi.kakasi()
    kakasi.setMode("H", None)
    kakasi.setMode("K", None)
    kakasi.setMode("J", "H")
    kakasi.setMode("s", False)
    kakasi.setMode("C", True)
    kakasi.setMode("E", None)
    kakasi.setMode("a", None)
    converter = kakasi.getConverter()
    for case, result in TESTS:
        self.assertEqual(converter.do(case), result)
def test_kakasi_hepburn(self):
    TESTS = [
        (u"", ""),
        (u"構成", "Kousei"),
        (u"好き", "Suki"),
        (u"大きい", "Ookii"),
        (u"かんたん", "kantan"),
        (u"にゃ", "nya"),
        (u"っき", "kki"),
        (u"っふぁ", "ffa"),
        (u"漢字とひらがな交じり文", "Kanji tohiragana Majiri Bun"),
        (u"Alphabet 123 and 漢字", "Alphabet 123 and Kanji"),
        (u"日経新聞", "Nikkeishinbun"),
        (u"日本国民は、", "Nihonkokumin ha,")
    ]
    kakasi = pykakasi.kakasi()
    kakasi.setMode("H", "a")
    kakasi.setMode("K", "a")
    kakasi.setMode("J", "a")
    kakasi.setMode("r", "Hepburn")
    kakasi.setMode("s", True)
    kakasi.setMode("E", "a")
    kakasi.setMode("a", None)
    kakasi.setMode("C", True)
    converter = kakasi.getConverter()
    for case, result in TESTS:
        self.assertEqual(converter.do(case), result)
def test_kakasi_E2a_upper(self):
    TESTS = [
        (u"ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ", "ABCDEFGHIJKLMNOPQRSTUVWXYZ"),
    ]
    kakasi = pykakasi.kakasi()
    kakasi.setMode("E", "a")
    kakasi.setMode("U", True)
    converter = kakasi.getConverter()
    for case, result in TESTS:
        self.assertEqual(converter.do(case), result)
def __init__(self):
    self._load_codepoints('ja')
    self.kakasi = pykakasi.kakasi()
    self.kakasi.setMode("J", "a")
    self.kakasi.setMode("E", "a")
    self.kakasi.setMode("H", "a")
    self.kakasi.setMode("K", "a")
    self.kakasi.setMode("s", True)
    self.kakasi.setMode("C", True)
    self.conv = self.kakasi.getConverter()
def get_reading_kakasi(word):
    """Gets reading for a given Japanese word by using kakasi.

    The reading in hiragana is returned by this function."""
    from pykakasi import kakasi
    kakasi = kakasi()
    kakasi.setMode("J", "H")
    kakasi.setMode("C", True)   # capitalize, default: no capitalize
    kakasi.setMode("c", False)
    conv = kakasi.getConverter()
    result = conv.do(word)
    return result
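# A minimal usage sketch for get_reading_kakasi above. The sample word and its
# expected reading are illustrative only (they follow the J->H conversions
# tested elsewhere in this file), not output reproduced from a real run.
reading = get_reading_kakasi(u"漢字")
print(reading)  # expected: かんじ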
def test_kakasi_K2H(self):
    TESTS = [
        (u"", ""),
        (u"カンタン", u"かんたん"),
        (u"ニャ", u"にゃ")
    ]
    kakasi = pykakasi.kakasi()
    kakasi.setMode("K", "H")
    converter = kakasi.getConverter()
    for case, result in TESTS:
        self.assertEqual(converter.do(case), result)
def test_kakasi_a2E(self):
    TESTS = [
        ("ABCDEFGHIJKLMNOPQRSTUVWXYZ", u"ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ"),
        ("abcdefghijklmnopqrstuvwxyz", u"ａｂｃｄｅｆｇｈｉｊｋｌｍｎｏｐｑｒｓｔｕｖｗｘｙｚ"),
        ("!\"#$%&'()*+,-./_ {|}~", u"！＂＃＄％＆＇（）＊＋，－．／＿　｛｜｝～")
    ]
    kakasi = pykakasi.kakasi()
    kakasi.setMode("a", "E")
    converter = kakasi.getConverter()
    for case, result in TESTS:
        self.assertEqual(converter.do(case), result)
def test_kakasi_J2a_upper(self):
    TESTS = [
        (u"かな漢字", "kana KANJI"),
    ]
    kakasi = pykakasi.kakasi()
    kakasi.setMode("J", "a")
    kakasi.setMode("H", "a")
    kakasi.setMode("s", True)
    kakasi.setMode("U", True)
    converter = kakasi.getConverter()
    for case, result in TESTS:
        self.assertEqual(converter.do(case), result)
def test_kakasi_numbers(self):
    TESTS = [
        (u"1234567890", "1234567890"),
        (u"一 二 三 四 五 六 七 八 九 〇", "ichi ni san shi go roku shichi hachi kyuu (maru)")
    ]
    kakasi = pykakasi.kakasi()
    kakasi.setMode("E", "a")
    kakasi.setMode("J", "a")
    converter = kakasi.getConverter()
    for case, result in TESTS:
        self.assertEqual(converter.do(case), result)
def test_kakasi_issues60(self):
    TESTS = [
        (u"市立", u"しりつ")
    ]
    kakasi = pykakasi.kakasi()
    kakasi.setMode("H", None)
    kakasi.setMode("K", None)
    kakasi.setMode("J", "H")
    kakasi.setMode("s", False)
    kakasi.setMode("C", True)
    kakasi.setMode("E", None)
    kakasi.setMode("a", None)
    converter = kakasi.getConverter()
    for case, result in TESTS:
        self.assertEqual(converter.do(case), result)
def test_katakana_furigana(self):
    TESTS = [
        (u"変換前の漢字の脇に", u"変換前[ヘンカンマエ]の漢字[カンジ]の脇[ワキ]に")
    ]
    kakasi = pykakasi.kakasi()
    kakasi.setMode("H", None)
    kakasi.setMode("K", None)
    kakasi.setMode("J", "KF")
    kakasi.setMode("f", True)
    kakasi.setMode("s", False)
    kakasi.setMode("E", None)
    kakasi.setMode("a", None)
    converter = kakasi.getConverter()
    for case, result in TESTS:
        self.assertEqual(converter.do(case), result)
def test_hiragana_furigana(self):
    TESTS = [
        (u"変換前の漢字の脇に", u"変換前[へんかんまえ]の漢字[かんじ]の脇[わき]に")
    ]
    kakasi = pykakasi.kakasi()
    kakasi.setMode("H", None)
    kakasi.setMode("K", None)
    kakasi.setMode("J", "HF")
    kakasi.setMode("f", True)
    kakasi.setMode("s", False)
    kakasi.setMode("E", None)
    kakasi.setMode("a", None)
    converter = kakasi.getConverter()
    for case, result in TESTS:
        self.assertEqual(converter.do(case), result)
def test_kakasi_constitution(self):
    original_text = u"日本国民は、正当に選挙された国会における代表者を通じて行動し、われらとわれらの子孫のために、諸国民との協和による成果と、わが国全土にわたつて自由のもたらす恵沢を確保し、政府の行為によつて再び戦争の惨禍が起ることのないやうにすることを決意し、ここに主権が国民に存することを宣言し、この憲法を確定する。そもそも国政は、国民の厳粛な信託によるものであつて、その権威は国民に由来し、その権力は国民の代表者がこれを行使し、その福利は国民がこれを享受する。これは人類普遍の原理であり、この憲法は、かかる原理に基くものである。われらは、これに反する一切の憲法、法令及び詔勅を排除する。"
    result = "Nihonkokumin ha, Seitou ni Senkyo sareta Kokkai niokeru Daihyousha wo Tsuuji te Koudou shi, wareratowarerano Shison notameni, Shokokumin tono Kyouwa niyoru Seika to, waga Kuni Zendo niwatatsute Jiyuu nomotarasu Keitaku wo Kakuho shi, Seifu no Koui niyotsute Futatabi Sensou no Sanka ga Okoru kotononaiyaunisurukotowo Ketsui shi, kokoni Shuken ga Kokumin ni Sonsu rukotowo Sengen shi, kono Kenpou wo Kakuteisu ru. somosomo Kokusei ha, Kokumin no Genshuku na Shintaku niyorumonodeatsute, sono Ken'i ha Kokumin ni Yurai shi, sono Kenryoku ha Kokumin no Daihyousha gakorewo Koushi shi, sono Fukuri ha Kokumin gakorewo Kyouju suru. koreha Jinruifuhen no Genri deari, kono Kenpou ha, kakaru Genri ni Motozuku monodearu. wareraha, koreni Hansu ru Issai no Kenpou, Hourei Oyobi Shouchoku wo Haijo suru."
    kakasi = pykakasi.kakasi()
    kakasi.setMode("H", "a")
    kakasi.setMode("K", "a")
    kakasi.setMode("J", "a")
    kakasi.setMode("r", "Hepburn")
    kakasi.setMode("C", True)
    kakasi.setMode("s", True)
    converter = kakasi.getConverter()
    self.maxDiff = None
    self.assertEqual(converter.do(original_text), result)
def test_kakasi_passport(self):
    TESTS = [
        (u"", ""), (u"構成", "Kosei"), (u"大野", "Ono"), (u"斎藤", "Saito"),
        (u"菅野", "Kanno"), (u"本田", "Honda"), (u"一式", "Isshiki"), (u"別府", "Beppu"),
        (u"ジェ", "jie"), (u"チェ", "chie"), (u"ティ", "tei"), (u"ディ", "dei"),
        (u"デュ", "deyu"), (u"ファ", "fua"), (u"フィ", "fui"), (u"フェ", "fue"),
        (u"フォ", "fuo"), (u"ヴァ", "bua"), (u"ヴィ", "bui"), (u"ヴ", "bu"),
        (u"ヴェ", "bue"), (u"ヴォ", "buo"), (u"じぇ", "jie"), (u"ちぇ", "chie"),
        (u"てぃ", "tei"), (u"でぃ", "dei"), (u"でゅ", "deyu"), (u"ふぁ", "fua"),
        (u"ふぃ", "fui"), (u"ふぇ", "fue"), (u"ふぉ", "fuo")
    ]
    kakasi = pykakasi.kakasi()
    kakasi.setMode("H", "a")
    kakasi.setMode("K", "a")
    kakasi.setMode("J", "a")
    kakasi.setMode("r", "Passport")
    kakasi.setMode("E", "a")
    kakasi.setMode("C", True)
    kakasi.setMode("a", None)
    converter = kakasi.getConverter()
    for case, result in TESTS:
        self.assertEqual(converter.do(case), result)
def test_kakasi_passport_specialcase(self):
    TESTS = [
        (u"えっちゅう", "etchu"),
        (u"はっちょう", "hatcho"),
        (u"エッチュウ", "etchu"),
        (u"ハッチョウ", "hatcho")
    ]
    kakasi = pykakasi.kakasi()
    kakasi.setMode("H", "a")
    kakasi.setMode("K", "a")
    kakasi.setMode("J", "a")
    kakasi.setMode("r", "Passport")
    kakasi.setMode("E", "a")
    kakasi.setMode("a", None)
    converter = kakasi.getConverter()
    for case, result in TESTS:
        self.assertEqual(converter.do(case), result)
def test_kakasi_J2K(self):
    TESTS = [
        (u"", ""),
        (u"構成", u"コウセイ"),
        (u"好き", u"スキ"),
        (u"大きい", u"オオキイ"),
        (u"かんたん", u"かんたん"),
        (u"漢字とひらがな交じり文", u"カンジとひらがなマジリブン"),
        (u"Alphabet 123 and 漢字", u"Alphabet 123 and カンジ")
    ]
    kakasi = pykakasi.kakasi()
    kakasi.setMode("H", None)
    kakasi.setMode("K", None)
    kakasi.setMode("J", "K")
    kakasi.setMode("s", False)
    kakasi.setMode("C", True)
    kakasi.setMode("E", None)
    kakasi.setMode("a", None)
    converter = kakasi.getConverter()
    for case, result in TESTS:
        self.assertEqual(converter.do(case), result)
def changpoyin(data, files_dir, i):  # convert every long vowel into 'u'; i here is the file id
    from pykakasi import kakasi
    # convert words into phonemes
    kakasi = kakasi()
    kakasi.setMode("H", "a")  # Hiragana to ascii, default: no conversion
    kakasi.setMode("K", "a")  # Katakana to ascii, default: no conversion
    kakasi.setMode("J", "a")  # Japanese to ascii, default: no conversion
    kakasi.setMode("r", "Hepburn")  # default: use Hepburn Roman table
    kakasi.setMode("s", True)  # add space, default: no separator
    conv = kakasi.getConverter()
    data_1 = copy.deepcopy(data)
    data_2 = []
    files_dir_1 = os.path.join(files_dir.replace('_yinsu', ''))
    data_danci = pipei.read_out(files_dir_1)  # word-level recognition results
    for danci in data_1:  # each pass checks one phoneme
        if ':' in danci[0]:  # a colon ':' in the result means the word-level result must be read to see whether it is 'u' or '-'
            zhenshu = danci[1][0]
            for danci_1 in data_danci:  # each pass checks one word
                if zhenshu >= danci_1[1][0] and zhenshu <= danci_1[1][1]:
                    # found the kanji this phoneme belongs to
                    tanngou = conv.do(danci_1[0])  # take the kanji out and convert it
                    if danci_1[0] == tanngou:  # unchanged means the text is alphabetic
                        tanngou = ztok._make_kana_convertor(danci_1[0])
                    if tanngou[-1] == 'u' or tanngou == 'nado':  # if it ends in 'u', turn the ':' in the result into 'u'
                        fenjie = (danci[1][0] + danci[1][1]) // 2
                        danci_2 = copy.deepcopy(danci)
                        danci_2[1][1] = fenjie
                        danci_2[0] = danci_2[0].replace(':', '')
                        danci_3 = copy.deepcopy(danci)
                        danci_3[1][0] = fenjie + 1
                        danci_3[0] = 'u'
                        data_2.append(danci_2)
                        data_2.append(danci_3)
                    # enable this block after the first round of experiments
                    # (handles the case where 'i' follows the colon):
                    # elif tanngou[-1] == 'i':
                    #     fenjie = (danci[1][0] + danci[1][1]) // 2
                    #     danci_2 = copy.deepcopy(danci)
                    #     danci_2[1][1] = fenjie
                    #     danci_2[0] = danci_2[0].replace(':', '')
                    #     danci_3 = copy.deepcopy(danci)
                    #     danci_3[1][0] = fenjie + 1
                    #     danci_3[0] = 'i'
                    #     data_2.append(danci_2)
                    #     data_2.append(danci_3)
                    elif zifudingwei(tanngou, danci[0].replace(':', ''), files_dir_1) == 'u':
                        # take the letter right after the letter that carries the colon
                        fenjie = (danci[1][0] + danci[1][1]) // 2
                        danci_2 = copy.deepcopy(danci)
                        danci_2[1][1] = fenjie
                        danci_2[0] = danci_2[0].replace(':', '')
                        danci_3 = copy.deepcopy(danci)
                        danci_3[1][0] = fenjie + 1
                        danci_3[0] = 'u'
                        data_2.append(danci_2)
                        data_2.append(danci_3)
                    else:
                        data_2.append(danci)
                    break
        else:
            data_2.append(danci)  # phonemes without a colon go straight into the new list
    return data_2
def setup_converter(self):
    mykakasi = kakasi()
    mykakasi.setMode('H', 'a')
    mykakasi.setMode('K', 'a')
    mykakasi.setMode('J', 'a')
    self.converter = mykakasi.getConverter()
def dabiaoqian(path):
    from pykakasi import kakasi
    BASE_DIRS = path  # batch directory
    name_tezheng = 'mizhichuli_log'  # name of the directory holding the feature files
    xinde = 'xinde_mizhichuli'  # name of the directory the new feature files go into
    houzhui = '.wav.csv'  # suffix of a feature file once the id is removed
    name = 'align1'  # file recording the CCCCSSSS marks
    shibiejieguo = {}  # dict holding the recognition results
    symbolcidian = {}  # mark dict, shaped like:
    # id: C001L_086
    # ['S', 'S', 'S', 'C', 'S', 'D', 'D', 'D', 'C']
    # id: C001L_087
    # ['S', 'D', 'D', 'C']
    # id: C001L_088
    # ['S', 'S', 'S', 'S', 'D', 'D', 'D', 'D', 'C', 'C']
    zhengjie = {}  # reference-transcript dict
    kakasi = kakasi()
    kakasi.setMode("H", "a")  # Hiragana to ascii, default: no conversion
    kakasi.setMode("K", "a")  # Katakana to ascii, default: no conversion
    kakasi.setMode("J", "a")  # Japanese to ascii, default: no conversion
    kakasi.setMode("r", "Hepburn")  # default: use Hepburn Roman table
    kakasi.setMode("s", True)  # add space, default: no separator
    conv = kakasi.getConverter()
    for per_dirs in os.listdir(BASE_DIRS):  # per_dirs = C001L, C001R...
        d_9 = os.path.join(BASE_DIRS, per_dirs, xinde)
        d = os.path.join(BASE_DIRS, per_dirs, xinde)
        mulu.mkdir(d)
        # fill zhengjie/symbolcidian with the marks read from the mark file
        zhengjie, symbolcidian = zidian.zidian(per_dirs, BASE_DIRS)
        for id in os.listdir(os.path.join(BASE_DIRS, per_dirs, name_tezheng)):  # file names under C001L, C001R...
            banyun_1 = []  # indices of 'C' marks
            banyun_2 = []  # correctly recognised words
            banyun_3 = []  # indices of non-'C' marks
            banyun_4 = []  # readings of the words that are wrong for now
            dianout = []
            id = id.replace(houzhui, '')  # strip .wav.csv, keeping only the id
            banyun_1 = [i for i, x in enumerate(symbolcidian[id]) if x == 'C']  # indices of the 'C' marks
            banyun_3 = [i for i, x in enumerate(symbolcidian[id]) if x == 'S']  # indices of substitution errors
            t_file = os.path.join(BASE_DIRS, per_dirs, name_tezheng, id + houzhui)
            a = csv.reader(open(t_file, 'r', encoding='utf-8'))
            t_file_list = [i for i in a]
            for u in banyun_1:  # banyun_1 holds the indices of the 'C' marks
                if u + 1 <= len(zhengjie[id]):  # the reference may have fewer words than there are marks
                    banyun_2.append(zhengjie[id][u])  # banyun_2 stores the correct words
                else:  # a 'C' index beyond the number of reference words has to be fixed by hand
                    print("Please fix this file manually: %s" % id)
                    print("Its correct words are")
                    print(banyun_2)
                    os.system("pause")
            for w in banyun_3:  # indices of non-'C' marks
                if w + 1 <= len(zhengjie[id]):
                    result = conv.do(zhengjie[id][w])
                    banyun_4.append(result)  # store the words that are wrong for now
                else:
                    print("Please fix this file manually: %s" % id)
                    print("Its misrecognised words are")
                    print(banyun_4)
                    os.system("pause")
            dir_out = os.path.join(BASE_DIRS, per_dirs, 'keka', id + '.out')
            dianout = pi.read_out(dir_out)  # extracted frame numbers match the julius recognition result
            # final shape: [['', [0, 18]], ['お', [19, 24]], ['願い', [25, 49]], ['三', [50, 82]], ['。', [83, 86]]]
            # [ 37 58] 0.562999 で+接続詞 [で]
            start = dianout.pop(0)[1][1]
            for i in range(start + 1):
                # label the whole leading silent region 9, treating it as correctly recognised
                t_file_list[i].insert(0, '9')
            for y in dianout:  # dianout maps each recognised word to a frame range
                if y[1][1] + 1 <= len(t_file_list):  # does this word's range stay within the feature rows?
                    if y[0] == '':  # skip the leading silent region
                        continue
                    if y[0] == dianout[-1][0]:
                        # label the final full stop 9; note there is a similar block below
                        start, end = y[1]
                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '9')
                        continue
                    if y[0] in banyun_2:  # a word in banyun_2 gets label 0 over its frame range
                        start, end = y[1]
                        print("Labelling file %s, word %s" % (os.path.split(dir_out)[1], y[0]))
                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '0')
                        banyun_2.remove(y[0])  # once labelled 0, drop the word from the list
                    elif conv.do(y[0]) == y[0] and y[0] != '、':  # alphabetic text is unchanged by conversion
                        print("Found alphabetic text %s in the recognition result" % y[0])
                        print("in file %s" % dir_out)
                        try:
                            zhuanhuazhi = conv.do(make_kana_convertor._make_kana_convertor(strQ2B.strQ2B(y[0])))
                        except:
                            zhuanhuazhi = conv.do(make_kana_convertor._make_kana_convertor(y[0]))
                        if zhuanhuazhi in banyun_4:  # letters must first become kana, then a reading
                            print("Converted letters: %s" % zhuanhuazhi)
                            start, end = y[1]
                            print("Labelling file %s, word %s" % (os.path.split(dir_out)[1], y[0]))
                            for i in range(start, end + 1):
                                t_file_list[i].insert(0, '0')
                            banyun_4.remove(conv.do(zhuanhuazhi))  # once labelled 0, drop the word from the list
                        else:
                            start, end = y[1]
                            print("Labelling file %s, word %s" % (os.path.split(dir_out)[1], y[0]))
                            for i in range(start, end + 1):
                                t_file_list[i].insert(0, '1')
                    elif conv.do(y[0]) in banyun_4:
                        start, end = y[1]
                        print("Labelling file %s, word %s" % (os.path.split(dir_out)[1], y[0]))
                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '0')
                        banyun_4.remove(conv.do(y[0]))  # once labelled 0, drop the word from the list
                    else:
                        # a word not in banyun_2 gets label 1 over its frame range
                        start, end = y[1]
                        print("Labelling file %s, word %s" % (os.path.split(dir_out)[1], y[0]))
                        for i in range(start, end + 1):
                            t_file_list[i].insert(0, '1')
                elif y[1][1] + 1 > len(t_file_list):
                    if y[0] == '':
                        continue
                    if y[0] == dianout[-1][0]:
                        # label the final full stop 9; note there is a similar block above
                        start = y[1][0]
                        end = len(t_file_list)
                        for i in range(start, end):  # when the range overruns, end must not get +1 here
                            t_file_list[i].insert(0, '9')
                        continue
                    if y[0] in banyun_2:
                        start = y[1][0]
                        end = len(t_file_list)  # the frame range overruns the feature rows, so the row count becomes end
                        print("Labelling file %s, word %s" % (os.path.split(dir_out)[1], y[0]))
                        for i in range(start, end):
                            t_file_list[i].insert(0, '0')
                        banyun_2.remove(y[0])
                    elif conv.do(y[0]) == y[0] and y[0] != '、':  # alphabetic text is unchanged by conversion
                        if conv.do(make_kana_convertor._make_kana_convertor(y[0])) in banyun_4:
                            # letters must first become kana, then a reading
                            start = y[1][0]
                            end = len(t_file_list)
                            print("Labelling file %s, word %s" % (os.path.split(dir_out)[1], y[0]))
                            for i in range(start, end + 1):
                                t_file_list[i].insert(0, '0')
                            banyun_4.remove(conv.do(make_kana_convertor._make_kana_convertor(y[0])))  # drop after labelling
                        else:
                            start = y[1][0]
                            end = len(t_file_list)
                            print("Labelling file %s, word %s" % (os.path.split(dir_out)[1], y[0]))
                            for i in range(start, end + 1):
                                t_file_list[i].insert(0, '1')
                    else:
                        start = y[1][0]
                        end = len(t_file_list)
                        print("Labelling file %s, word %s" % (os.path.split(dir_out)[1], y[0]))
                        for i in range(start, end):
                            t_file_list[i].insert(0, '1')
            with open(os.path.join(BASE_DIRS, per_dirs, xinde, id + '.csv'), 'w+', encoding='utf-8') as mergen_file:
                for i in t_file_list:
                    mergen_file.write('%s\n' % ','.join(i))
        shanchu.shanchuhang(d_9)  # delete every feature row labelled 9
from pykakasi import kakasi

with open('../imas-talk-maker/public/asset/icon_data.json', mode='r', encoding='utf-8') as f:
    old_icon_data: List[Dict[str, Union[str, List[str]]]] = json.load(f)

# Prepare the required directories
os.makedirs('./million', exist_ok=True)
os.makedirs('./cinderella', exist_ok=True)
os.makedirs('./other', exist_ok=True)

# Read each record and build the new dataset and files
new_icon_data = []
kakasi_instance = kakasi()
kakasi_instance.setMode('H', 'a')
kakasi_instance.setMode('K', 'a')
kakasi_instance.setMode('J', 'a')
kakasi_conv = kakasi_instance.getConverter()
for record in old_icon_data:
    category: str = record['category']
    icon_name_list: List[str] = record['image']
    kana: str = record['kana']
    name: str = record['name']
    short_name: str = record['short_name']
    icon_name_list_size = len(icon_name_list)
    kana_roma = kakasi_conv.do(kana.replace('、', '/').split('/')[0])
    new_icon_list = []
    for x in range(0, icon_name_list_size):
        old_name = icon_name_list[x]
import pykakasi
import re
import hiragana
from typing import List
from itertools import zip_longest

# Initialize space converter
wakati = pykakasi.wakati()
spacer = wakati.getConverter()

# Initialize kanji & katakana converter
kakasi_kanji = pykakasi.kakasi()
kakasi_kanji.setMode('K', 'H')
kakasi_kanji.setMode('J', 'H')
kanji_replacer = kakasi_kanji.getConverter()

# Initialize romaji converter
kakasi_romanji = pykakasi.kakasi()
kakasi_romanji.setMode('H', 'a')
kakasi_romanji.setMode('K', 'a')
kakasi_romanji.setMode('J', 'a')
kakasi_romanji.setMode('r', 'Hepburn')
kakasi_romanji.setMode('s', True)
romanjifier = kakasi_romanji.getConverter()


def to_hiragana(word: str) -> str:
    hiragana_word = kanji_replacer.do(word)
    return hiragana_word
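# A hedged usage sketch for the converters set up above; the inputs are
# illustrative and the expected outputs follow the J2H and Hepburn test cases
# elsewhere in this file, not output reproduced from a real run.
print(to_hiragana(u"漢字とひらがな交じり文"))  # expected: かんじとひらがなまじりぶん
print(romanjifier.do(u"かんたん"))            # expected: kantan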
from pykakasi import kakasi

# Read the file; without encoding='utf-8' this raises an encoding error
read_me = open('as_a_rule.txt', 'r', encoding='utf-8')  # input
read_to = open('output.txt', 'w')  # output

# Converter 1: convert to hiragana
bachongying = kakasi()
bachongying.setMode('J', 'H')
Bachongying = bachongying.getConverter()

# Converter 2: hiragana and katakana to romaji
hequanshawu = kakasi()
hequanshawu.setMode('H', 'a')  # hiragana to romaji
hequanshawu.setMode('K', 'a')  # katakana to romaji
Hequanshawu = hequanshawu.getConverter()

# The converters are ready; now just read line by line...
while True:
    s = read_me.readline()
    if s == '':
        break
    # number: index  jp: Japanese  jph: hiragana/katakana  jpa: romaji  cn: Chinese
    number = jp = jph = jpa = cn = ''
    index = 0
    for char in s:
        if index == 0:
            if char == '、':
                index = 1
            else:
                        ' ').replace(':', ' ').replace('!', '')
    text = collapse_whitespace(text)
    return text


import MeCab
import codecs
import argparse
from pykakasi import kakasi
import re

re_hiragana = re.compile(r'^[あ-ん]+$')
re_katakana = re.compile(r'[\u30A1-\u30F4]+')
re_kanji = re.compile(r'^[\u4E00-\u9FD0]+$')

j2h = kakasi()
j2h.setMode('J', 'H')  # J(Kanji) to H(Hiragana)
conv = j2h.getConverter()
k2h = kakasi()
k2h.setMode('K', 'H')
conv2 = k2h.getConverter()
t = MeCab.Tagger('')


def japanese_cleaners(text):
    '''Pipeline for Japanese text.'''
    node = t.parse(text).replace("\t", ",").split("\n")
    res = []
    for i in node:
        # import pdb; pdb.set_trace()
def test_kakasi_invalid_flag_value(self):
    with self.assertRaises(InvalidFlagValueException):
        kakasi = pykakasi.kakasi()
        kakasi.setMode("H", "a")
        kakasi.setMode("K", "a")
        kakasi.setMode("s", "yes")
def test_kakasi_unknown_mode(self):
    with self.assertRaises(InvalidModeValueException):
        kakasi = pykakasi.kakasi()
        kakasi.setMode("H", "a")
        kakasi.setMode("K", "a")
        kakasi.setMode("J", "X")
import re
import sys
from ssl import SSLWantReadError

import requests
import keyboard
from pykakasi import kakasi, wakati
from bs4 import BeautifulSoup

REQUEST_SESSION = requests.session()
KAKASI = kakasi()
KAKASI.setMode("J", "H")
KAKASI_CONVERTER = KAKASI.getConverter()
WAKATI = wakati()
WAKATI_CONVERTER = WAKATI.getConverter()


def get_relevant_data(result_page):
    """Grabs the relevant information from the term's dictionary page and
    passes it to the hotkey function to be pasted into Anki."""
    kanji = result_page.find("div", class_="jp").text.replace("·", "")
    kanji = re.sub(r"(\(.{1,3}\))", "", kanji)
    try:
        kana = result_page.find("div", class_="furigana").text.replace(
            "[", "").replace("]", "").replace("·", "")
        kana = re.sub(r"(\(.{1,3}\))", "", kana)
    except AttributeError:
        kana = kanji
    romaji = result_page.find("div", class_="romaji hide").text
    term_definition = result_page.find(
        "div", class_="en").find("ol").text.rstrip().lstrip()
def main():
    kakasi_ = kakasi()
    kakasi_.setMode('H', 'a')
    kakasi_.setMode('K', 'a')
    kakasi_.setMode('J', 'a')
    conv = kakasi_.getConverter()
    # load the cascade classifier's feature file
    cascade = cv2.CascadeClassifier(cascade_path)
    color = (255, 255, 255)  # white
    path = "./img/"
    label = str(input("人を判別する数字を入力してください ex.0:"))  # prompt for the digit identifying the person
    OUT_FILE_NAME = "./img/face_recognition.avi"
    FRAME_RATE = 1
    w = 224  # 1280
    h = 224  # 960
    out = cv2.VideoWriter(OUT_FILE_NAME,
                          cv_fourcc('M', 'P', '4', 'V'),
                          FRAME_RATE,
                          (w, h),
                          True)
    cap = cv2.VideoCapture(1)
    is_video = 'False'
    s = 0.1
    model = model_definition()
    # Use HGS創英角ゴシックポップ体標準 to write Japanese.
    fontpath = r'C:\Windows\Fonts\HGRPP1.TTC'  # on Windows 10 the fonts live under C:\Windows\Fonts\
    font = ImageFont.truetype(fontpath, 16)  # font size
    font0 = cv2.FONT_HERSHEY_SIMPLEX
    sk = 0
    while True:
        b, g, r, a = 0, 255, 0, 0  # B (blue), G (green), R (red), A (alpha)
        timer = cv2.getTickCount()
        ret, frame = cap.read()
        sleep(s)
        fps = cv2.getTickFrequency() / (cv2.getTickCount() - timer)
        # display FPS on the frame
        cv2.putText(frame, "FPS : " + str(int(1000 * fps)), (100, 50), font0, 0.75, (50, 170, 50), 2)
        # convert to grayscale
        image_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        # image_gray = cv2.equalizeHist(image_gray)
        facerect = cascade.detectMultiScale(image_gray, scaleFactor=1.1, minNeighbors=2, minSize=(30, 30))
        # print(len(facerect))
        img = frame
        if len(facerect) > 0:
            # draw a rectangle around each detected face
            for rect in facerect:
                x = rect[0]
                y = rect[1]
                width = rect[2]
                height = rect[3]
                roi = img[y:y + height, x:x + width]  # frame[y:y+h, x:x+w]
                cv2.rectangle(img, tuple(rect[0:2]), tuple(rect[0:2] + rect[2:4]), color, thickness=2)
                try:
                    roi = cv2.resize(roi, (int(224), 224))
                    cv2.imshow('roi', roi)
                    txt, preds = yomikomi(model, roi)
                    print("txt, preds", txt, preds * 100, " %")
                    txt2 = conv.do(txt)
                    cv2.imwrite(path + "/" + label + "/" + str(sk) + '_' + str(txt2) + '_' + str(int(preds * 100)) + '.jpg', roi)
                    img_pil = Image.fromarray(img)  # convert the 8-bit integer array to a PIL Image
                    draw = ImageDraw.Draw(img_pil)  # create a draw instance
                    position = (x, y)  # text position
                    draw.text(position, txt, font=font, fill=(b, g, r, a))  # draw the text; fill is the BGRA colour
                    img = np.array(img_pil)  # convert the PIL Image back to an array
                except:
                    txt = ""
                    continue
        cv2.imshow('test', img)
        sk += 1
        key = cv2.waitKey(1) & 0xff
        if is_video == "True":
            img_dst = cv2.resize(img, (int(224), 224))  # 1280x960
            out.write(img_dst)
            print(is_video)
        if key == ord('q'):  # 113
            # cv2.destroyAllWindows()
            break
        elif key == ord('p'):
            s = 0.5
            is_video = "True"
        elif key == ord('s'):
            s = 0.1
            is_video = "True"  # "False"
from pykakasi import kakasi
import datetime

kk = kakasi()
kk.setMode('H', 'a')
kk.setMode('K', 'a')
kk.setMode('J', 'a')
conv = kk.getConverter()


def to_roma(word):
    return conv.do(word)


def to_shell(cmd_list, path, is_parallel=True):
    join_str = ' &\n' if is_parallel else '\n'
    if type(cmd_list) == list:
        text = join_str.join(cmd_list)
    else:
        text = cmd_list
    with open(path, 'w') as file:
        file.write(text)
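# A hedged usage sketch for to_roma/to_shell above; 'run.sh' and the echo
# commands are hypothetical examples, and the expected romanisation follows
# the Hepburn test cases elsewhere in this file.
print(to_roma(u'かんたん'))  # expected: kantan
to_shell(['echo a', 'echo b'], 'run.sh', is_parallel=True)
# run.sh should then contain: "echo a &\necho b"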
def __init_kakasi(self):
    _kakasi = kakasi()
    _kakasi.setMode('H', 'a')  # hiragana to roman
    _kakasi.setMode('K', 'a')  # katakana to roman
    self.conv = _kakasi.getConverter()
def calculate_name(filelist, romanize=True, ext='.xci'):
    from Fs import Nsp as squirrelNSP
    from Fs import Xci as squirrelXCI
    import re
    prlist = list()
    for filepath in filelist:
        if filepath.endswith('.nsp'):
            try:
                c = list()
                f = squirrelNSP(filepath)
                contentlist = f.get_content(False, False, True)
                f.flush()
                f.close()
                if len(prlist) == 0:
                    for i in contentlist:
                        prlist.append(i)
                else:
                    for j in range(len(contentlist)):
                        notinlist = False
                        for i in range(len(prlist)):
                            if contentlist[j][1] == prlist[i][1]:
                                if contentlist[j][6] > prlist[i][6]:
                                    del prlist[i]
                                    prlist.append(contentlist[j])
                                    notinlist = False
                                elif contentlist[j][6] == prlist[i][6]:
                                    notinlist = False
                            else:
                                notinlist = True
                        if notinlist == True:
                            prlist.append(contentlist[j])
            except BaseException as e:
                nutPrint.error('Exception: ' + str(e))
        if filepath.endswith('.xci'):
            try:
                c = list()
                f = squirrelXCI(filepath)
                contentlist = f.get_content(False, False, True)
                f.flush()
                f.close()
                if len(prlist) == 0:
                    for i in contentlist:
                        prlist.append(i)
                else:
                    for j in range(len(contentlist)):
                        notinlist = False
                        for i in range(len(prlist)):
                            if contentlist[j][1] == prlist[i][1]:
                                if contentlist[j][6] > prlist[i][6]:
                                    del prlist[i]
                                    prlist.append(contentlist[j])
                                    notinlist = False
                                elif contentlist[j][6] == prlist[i][6]:
                                    notinlist = False
                            else:
                                notinlist = True
                        if notinlist == True:
                            prlist.append(contentlist[j])
            except BaseException as e:
                nutPrint.error('Exception: ' + str(e))
    basecount = 0
    basename = ''
    basever = ''
    baseid = ''
    basefile = ''
    updcount = 0
    updname = ''
    updver = ''
    updid = ''
    updfile = ''
    dlccount = 0
    dlcname = ''
    dlcver = ''
    dlcid = ''
    dlcfile = ''
    ccount = ''
    bctag = ''
    updtag = ''
    dctag = ''
    for i in range(len(prlist)):
        if prlist[i][5] == 'BASE':
            basecount += 1
            if baseid == "":
                basefile = str(prlist[i][0])
                baseid = str(prlist[i][1])
                basever = '[v' + str(prlist[i][6]) + ']'
        if prlist[i][5] == 'UPDATE':
            updcount += 1
            endver = str(prlist[i][6])
            if updid == "":
                updfile = str(prlist[i][0])
                updid = str(prlist[i][1])
                updver = '[v' + str(prlist[i][6]) + ']'
        if prlist[i][5] == 'DLC':
            dlccount += 1
            if dlcid == "":
                dlcfile = str(prlist[i][0])
                dlcid = str(prlist[i][1])
                dlcver = '[v' + str(prlist[i][6]) + ']'
    if basecount != 0:
        bctag = str(basecount) + 'G'
    else:
        bctag = ''
    if updcount != 0:
        if bctag != '':
            updtag = '+' + str(updcount) + 'U'
        else:
            updtag = str(updcount) + 'U'
    else:
        updtag = ''
    if dlccount != 0:
        if bctag != '' or updtag != '':
            dctag = '+' + str(dlccount) + 'D'
        else:
            dctag = str(dlccount) + 'D'
    else:
        dctag = ''
    ccount = '(' + bctag + updtag + dctag + ')'
    if baseid != "":
        if basefile.endswith('.xci'):
            f = squirrelXCI(basefile)
        elif basefile.endswith('.nsp'):
            f = squirrelNSP(basefile)
        ctitl = f.get_title(baseid)
        f.flush()
        f.close()
        if ctitl == 'DLC' or ctitl == '-':
            ctitl = ''
    elif updid != "":
        if updfile.endswith('.xci'):
            f = squirrelXCI(updfile)
        elif updfile.endswith('.nsp'):
            f = squirrelNSP(updfile)
        ctitl = f.get_title(updid)
        f.flush()
        f.close()
        if ctitl == 'DLC' or ctitl == '-':
            ctitl = ''
    elif dlcid != "":
        if dlcfile.endswith('.xci'):
            f = squirrelXCI(dlcfile)
        elif dlcfile.endswith('.nsp'):
            f = squirrelNSP(dlcfile)
        ctitl = f.get_title(dlcid)
        f.flush()
        f.close()
    else:
        ctitl = 'UNKNOWN'
    baseid = '[' + baseid.upper() + ']'
    updid = '[' + updid.upper() + ']'
    dlcid = '[' + dlcid.upper() + ']'
    if ccount == '(1G)' or ccount == '(1U)' or ccount == '(1D)':
        ccount = ''
    if baseid != "[]":
        if updver != "":
            endname = ctitl + ' ' + baseid + ' ' + updver + ' ' + ccount
        else:
            endname = ctitl + ' ' + baseid + ' ' + basever + ' ' + ccount
    elif updid != "[]":
        endname = ctitl + ' ' + updid + ' ' + updver + ' ' + ccount
    else:
        endname = ctitl + ' ' + dlcid + ' ' + dlcver + ' ' + ccount
    if romanize == True:
        import pykakasi
        kakasi = pykakasi.kakasi()
        kakasi.setMode("H", "a")
        kakasi.setMode("K", "a")
        kakasi.setMode("J", "a")
        kakasi.setMode("s", True)
        kakasi.setMode("E", "a")
        kakasi.setMode("a", None)
        kakasi.setMode("C", False)
        converter = kakasi.getConverter()
        endname = converter.do(endname)
        endname = endname[0].upper() + endname[1:]
        endname = re.sub(r'[\/\\\:\*\?]+', '', endname)
        endname = re.sub(r'[™©®`~^´ªº¢#£€¥$ƒ±¬½¼♡«»±•²‰œæƳ☆<<>>|]', '', endname)
        endname = re.sub(r'[Ⅰ]', 'I', endname)
        endname = re.sub(r'[Ⅱ]', 'II', endname)
        endname = re.sub(r'[Ⅲ]', 'III', endname)
        endname = re.sub(r'[Ⅳ]', 'IV', endname)
        endname = re.sub(r'[Ⅴ]', 'V', endname)
        endname = re.sub(r'[Ⅵ]', 'VI', endname)
        endname = re.sub(r'[Ⅶ]', 'VII', endname)
        endname = re.sub(r'[Ⅷ]', 'VIII', endname)
        endname = re.sub(r'[Ⅸ]', 'IX', endname)
        endname = re.sub(r'[Ⅹ]', 'X', endname)
        endname = re.sub(r'[Ⅺ]', 'XI', endname)
        endname = re.sub(r'[Ⅻ]', 'XII', endname)
        endname = re.sub(r'[Ⅼ]', 'L', endname)
        endname = re.sub(r'[Ⅽ]', 'C', endname)
        endname = re.sub(r'[Ⅾ]', 'D', endname)
        endname = re.sub(r'[Ⅿ]', 'M', endname)
        endname = re.sub(r'[—]', '-', endname)
        endname = re.sub(r'[√]', 'Root', endname)
        endname = re.sub(r'[àâá@äå]', 'a', endname)
        endname = re.sub(r'[ÀÂÁÄÅ]', 'A', endname)
        endname = re.sub(r'[èêéë]', 'e', endname)
        endname = re.sub(r'[ÈÊÉË]', 'E', endname)
        endname = re.sub(r'[ìîíï]', 'i', endname)
        endname = re.sub(r'[ÌÎÍÏ]', 'I', endname)
        endname = re.sub(r'[òôóöø]', 'o', endname)
        endname = re.sub(r'[ÒÔÓÖØ]', 'O', endname)
        endname = re.sub(r'[ùûúü]', 'u', endname)
        endname = re.sub(r'[ÙÛÚÜ]', 'U', endname)
        endname = re.sub(r'[’]', "'", endname)
        endname = re.sub(r'[“”]', '"', endname)
        endname = re.sub(' {3,}', ' ', endname)
        endname = re.sub(' {2,}', ' ', endname)
        try:
            endname = endname.replace("( ", "(")
            endname = endname.replace(" )", ")")
            endname = endname.replace("[ ", "[")
            endname = endname.replace(" ]", "]")
            endname = endname.replace("[ (", "[(")
            endname = endname.replace(") ]", ")]")
            endname = endname.replace("[]", "")
            endname = endname.replace("()", "")
            endname = endname.replace('" ', '"')
            endname = endname.replace(' "', '"')
            endname = endname.replace(" !", "!")
            endname = endname.replace(" ?", "?")
            endname = endname.replace("  ", " ")
            endname = endname.replace("  ", " ")
            endname = endname.replace('"', '')
            endname = endname.replace(')', ') ')
            endname = endname.replace(']', '] ')
            endname = endname.replace("[ (", "[(")
            endname = endname.replace(") ]", ")]")
            endname = endname.replace("  ", " ")
        except:
            pass
    if endname[-1] == ' ':
        endname = endname[:-1]
    endname = endname + ext
    return endname, prlist
def __init__(self, bot: commands.Bot):
    print('Ktoba OK')
    self.bot = bot
    self.kakasi = kakasi()
def conv_to_kana(word):
    response = requests.post(
        'https://labs.goo.ne.jp/api/hiragana',
        data={
            "app_id": "c6d687dfedd172e4a2b30cc086513cfbb23f8c039c8c157fe08760b3df4092fa",
            "request_id": "test",
            "sentence": word,
            "output_type": "hiragana"
        })
    return response.json()["converted"]


kakasi = kakasi()
kakasi.setMode('H', 'a')
kakasi.setMode('K', 'a')
kakasi.setMode('J', 'a')
converter = kakasi.getConverter()
boin = ['a', 'i', 'u', 'e', 'o']


def get_distance(wordx, wordy):
    wordx = converter.do(wordx)
    wordy = converter.do(wordy)
    # print(wordx + " " + wordy)
    dp = np.empty((len(wordx) + 1, len(wordy) + 1))
    for i in range(len(wordx) + 1):
def dabiaoqian(path, guanjianzi_1, guanjianzi_2):
    from pykakasi import kakasi
    import csv, os
    name_tezheng = guanjianzi_1  # name of the directory holding the feature files
    xinde = guanjianzi_2  # name of the directory the new feature files go into
    name1 = 'align1'
    name2 = 'symbol.txt'  # name of the mark file; when align1 does not work, switch to symbol.txt (and swap the code below accordingly)
    kakasi = kakasi()
    kakasi.setMode("H", "a")  # Hiragana to ascii, default: no conversion
    kakasi.setMode("K", "a")  # Katakana to ascii, default: no conversion
    kakasi.setMode("J", "a")  # Japanese to ascii, default: no conversion
    kakasi.setMode("r", "Hepburn")  # default: use Hepburn Roman table
    kakasi.setMode("s", True)  # add space, default: no separator
    conv = kakasi.getConverter()
    for i in os.listdir(path):
        path_1 = os.path.join(path, i)
        path_out = os.path.join(path_1, 'keka')
        path_tezheng = os.path.join(path_1, name_tezheng)
        # biaozhiwenjian = csv.reader(open(os.path.join(path_1, name1), 'r', encoding='EUC-JP'))  # read the mark file
        biaozhiwenjian = csv.reader(
            open(os.path.join(path_1, name2), 'r', encoding='utf-8'))  # when the mark file is a .txt file
        biaozhiwenjian_1 = [i for i in biaozhiwenjian]  # a list of lists, e.g.:
        # [['id: l_8840_9810_T1_F_01'], ['REF: そう です か 、 はい 。 '], ['HYP: そう です か はい 。 '],
        #  ['EVAL: C C C D C C '], [], ['id: l_10800_13190_T1_F_01']]
        path_xinde = os.path.join(path_1, xinde)
        mulu.mkdir(path_xinde)
        for i in range(0, len(biaozhiwenjian_1)):  # each pass labels one audio file
            try:
                biaozhi = biaozhiwenjian_1[i][0]
            except:
                continue
            if 'id:' in biaozhi:
                l_zhengjie_1 = []
                l_jieguo_1 = []
                ID = biaozhiwenjian_1[i][0].replace('id: ', '')
                l_zhengjie = biaozhiwenjian_1[i + 1][0].split()
                l_zhengjie.pop(0)
                l_jieguo = biaozhiwenjian_1[i + 2][0].split()
                l_jieguo.pop(0)
                l_biaozhi = biaozhiwenjian_1[i + 3][0].split()
                l_biaozhi.pop(0)
                # Build strictly aligned reference / result / mark lists; a 'D' mark means the result slot is empty
                jishuqi_jieguo = 0
                jishuqi_zhengjie = 0
                jishuqi_biaozhi = 0
                for i in l_biaozhi:
                    if i == "D":
                        l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                        l_jieguo_1.append('')
                        jishuqi_zhengjie += 1
                        jishuqi_biaozhi += 1
                    if i == "C":
                        l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                        l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                        jishuqi_zhengjie += 1
                        jishuqi_jieguo += 1
                        jishuqi_biaozhi += 1
                    if i == "I":
                        l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                        l_zhengjie_1.append('')
                        jishuqi_jieguo += 1
                        jishuqi_biaozhi += 1
                    if i == "S":
                        # 'S' needs special handling: convert both sides to letters and compare again;
                        # if they match after conversion the mark becomes 'C'
                        l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                        l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                        zhengjie_hanzi = l_zhengjie[jishuqi_zhengjie]
                        jieguo_hanzi = l_jieguo[jishuqi_jieguo]
                        # first the recognition result
                        if conv.do(jieguo_hanzi) == jieguo_hanzi and jieguo_hanzi != '、':  # is it alphabetic?
                            try:
                                zhuanhuan_jieguo = conv.do(
                                    make_kana_convertor._make_kana_convertor(strQ2B.strQ2B(jieguo_hanzi)))
                            except:
                                zhuanhuan_jieguo = conv.do(
                                    make_kana_convertor._make_kana_convertor(jieguo_hanzi))
                        else:
                            zhuanhuan_jieguo = conv.do(jieguo_hanzi)
                        # then the reference transcript
                        if conv.do(zhengjie_hanzi) == zhengjie_hanzi and zhengjie_hanzi != '、':  # is it alphabetic?
                            try:
                                zhuanhuan_zhengjie = conv.do(
                                    make_kana_convertor._make_kana_convertor(strQ2B.strQ2B(zhengjie_hanzi)))
                            except:
                                zhuanhuan_zhengjie = conv.do(
                                    make_kana_convertor._make_kana_convertor(zhengjie_hanzi))
                        else:
                            zhuanhuan_zhengjie = conv.do(zhengjie_hanzi)
                        if zhuanhuan_jieguo == zhuanhuan_zhengjie:
                            l_biaozhi[jishuqi_biaozhi] = 'C'
                        jishuqi_biaozhi += 1
                        jishuqi_zhengjie += 1
                        jishuqi_jieguo += 1
                path_out_1 = os.path.join(path_out, ID + '.out')  # read the .out file
                dianout = pi.read_out(path_out_1)
                start = dianout.pop(0)[1][1]  # label the leading silent region 9; pop the first element
                start_1 = dianout[-1][1][0]  # label the trailing full stop 9
                # end_1 = dianout.pop(-1)[1][1]  -- the last frame may have been dropped during feature
                # extraction, so the row count of t_file_list is used as end instead
                # final shape: [['', [0, 18]], ['お', [19, 24]], ['願い', [25, 49]], ['三', [50, 82]], ['。', [83, 86]]]
                path_tezheng_1 = os.path.join(path_tezheng, ID + '.wav.csv')
                tezhengzhi = csv.reader(open(path_tezheng_1, 'r', encoding='utf-8'))
                t_file_list = [i for i in tezhengzhi]
                end_1 = len(t_file_list) - 1
                if start < len(t_file_list):  # if the silent region overruns the feature rows, drop the file
                    for i in range(start + 1):
                        # label the whole leading silent region 9, treating it as correctly recognised
                        t_file_list[i].insert(0, '9')
                    for i in range(start_1, end_1 + 1):
                        t_file_list[i].insert(0, '9')
                    l_jieguo_1.pop(-1)  # the trailing full stop is already labelled, so pop it
                    print("ID")
                    print(ID)
                    print("l_biaozhi")
                    print(l_biaozhi)
                    print("l_jieguo_1")
                    print(l_jieguo_1)
                    print("dianout")
                    print(dianout)
                    # build the new dianout list everything below relies on; its words are the
                    # (reassembled) recognition results emitted by the scoring tool
                    dianout_chongzao = cz.chongzao(l_biaozhi, l_jieguo_1, dianout, ID)
                    print('dianout_chongzao')
                    print(dianout_chongzao)
                    # e.g. [['災害', [3, 40], 'C'], ['で', [41, 48], 'C'], ['ござい', [49, 77], 'C'],
                    #       ['ます', [78, 98], 'C'], ['から', [99, 130], 'C'], ['、', [131, 152], 'C'],
                    #       ['その', [153, 177], 'C'], ['場', [178, 190], 'C'], ['で', [191, 209], 'C']]
                    for i in dianout_chongzao:
                        start, end = i[1]
                        if i[2] == 'C':
                            for i in range(start, end + 1):
                                t_file_list[i].insert(0, '0')
                        else:
                            for i in range(start, end + 1):
                                t_file_list[i].insert(0, '1')
                    path_xinde_tezhengzhi = os.path.join(path_xinde, ID + '.csv')
                    with open(path_xinde_tezhengzhi, 'w+', encoding='utf-8') as mergen_file:
                        for i in t_file_list:
                            mergen_file.write('%s\n' % ','.join(i))
        shanchu.shanchuhang(path_xinde)  # delete every feature row labelled 9
def __init__(self, bot):
    self.bot = bot
    self.jp_channels = [804115491047079966]
    self.KKS = pykakasi.kakasi()
def convert_into_romaji(text):
    _kakasi = kakasi()
    _kakasi.setMode('H', 'a')
    _kakasi.setMode('K', 'a')
    _kakasi.setMode('J', 'a')
    conv = _kakasi.getConverter()
    mecab = MeCab.Tagger(r'-Ochasen -d ' + IPADIC_DIR)
    eng = MeCab.Tagger(r'-d ' + UNIDIC_DIR)
    normalized_text = normalize('NFKC', text)
    filename_romaji = ""
    # ipadic and unidic used together
    results = mecab.parse(normalized_text)
    for chunk in results.splitlines()[:-1]:
        engFlag = False
        eng_work = ""
        original = chunk.split('\t')
        isEng = eng.parse(original[0])
        for word in isEng.splitlines()[:-1]:
            work = word.split('\t')[1]
            comma = work.split(',')
            if len(comma) > 12 and comma[12] == '外':
                engFlag = True
                engTrance = comma[7]
                hyphen = engTrance.split('-')
                if len(hyphen) < 2:
                    engFlag = False
                    break
                else:
                    eng_work += hyphen[1] + " "
        if not engFlag:
            # Should digits become English words? -> only 1-20 are handled
            # Should the conversion input be the reading (hiragana)? -> English detection uses the
            # kanji form; romanisation uses the katakana form
            if original[0] in d.keys():
                filename_romaji += d[original[0]] + " "
            else:
                filename_romaji += conv.do(original[1]) + " "
        else:
            filename_romaji += eng_work
    """
    # unidic only
    isEng = eng.parse(text)
    for word in isEng.splitlines()[:-1]:
        engFlag = False
        eng_work = ""
        original = word.split('\t')
        comma = original[1].split(',')
        if len(comma) > 12 and comma[12] == '外':
            engTrance = comma[7]
            hyphen = engTrance.split('-')
            if len(hyphen) > 1:
                engFlag = True
                eng_work += hyphen[1] + " "
        if not engFlag:
            # Should digits become English words? -> only 1-20 are handled
            if original[0] in d.keys():
                filename_romaji += d[original[0]] + " "
            else:
                filename_romaji += conv.do(original[1]) + " "
        else:
            filename_romaji += eng_work
    """
    # Should everything except letters and digits become blank? -> Yes 75 / No 25
    filename_romaji = re.sub(r'[^a-zA-Z0-9_ ]*', "", filename_romaji)
    # Should it be lower-cased? -> Yes
    return filename_romaji.lower()
def __init__(self, master=None):
    super().__init__(master)
    self.pack()
    # self.wRoot, self.hRoot = 390, 400
    self.wRoot, self.hRoot = 800, 500
    self.master.title(u"OpenCVの動画表示")  # window title
    self.master.geometry("{0}x{1}".format(self.wRoot, self.hRoot))  # window size (width x height)
    # create the Canvas
    self.canvas = tk.Canvas(self.master, highlightthickness=0)
    # attach a mouse event (left-button click) to the Canvas
    self.canvas.bind('<Button-1>', self.canvas_click)
    # place the Canvas
    self.canvas.pack(expand=1, fill=tk.BOTH)
    # create the text-entry form
    self.entry1 = tkinter.Entry(self.master, font=("", 20))
    self.entry1.focus_set()
    self.entry1.pack()
    # open the camera
    self.capture = cv2.VideoCapture(0)
    # make the camera frame larger than the input form
    self.capture.set(cv2.CAP_PROP_FRAME_WIDTH, 1920)  # camera frame width
    self.capture.set(cv2.CAP_PROP_FRAME_HEIGHT, 1080)  # camera frame height
    self.disp_id = None
    self.detector = htm.handDetectior(MaxHands=2, detectonCon=0.7)
    self.kakasi = pykakasi.kakasi()
    # basic settings
    # window dimensions
    self.wCam, self.hCam = 1000, 700
    self.spaceH = 300
    self.spaceW = 150
    self.wVisal = self.wCam - self.spaceW
    self.hVisal = self.hCam - self.spaceH
    self.INPUT_TEXTS = u""
    self.INPUT_TEXTS_UI = u""
    self.KEYBOARDLIST = np.full((5, 5), False).tolist()
    self.KEYBOARDREMEN = True
    self.xx, self.yy = 0, 0
    self.INPUT_FLAG = False
    self.font_size = 50
    # self.font_Path = r"C:\Windows\Fonts\メイリオ\meiryo.ttc"
    # self.font_Path_Bold = r"C:\Windows\Fonts\メイリオ\meiryob.ttc"
    self.font_Path = r"C:\Windows\Fonts\游ゴシック\YuGothR.ttc"
    self.font_Path_Bold = r"C:\Windows\Fonts\游ゴシック\YuGothB.ttc"
    # start with the kana keyboard
    self.KEYBOARD = KEYBOARD_HIRA
    self.EVENT_Flag = 0
    self.search_text = ""
    self.Dict_num = 0
    self.Result_Button_pressed = [False] * 4
    self.Detail_Button_pressed = [False]
    self.Books_num = -1
    self.savedIMG_Result = []
    self.savedIMG_Detail = []
def dabiaoqian(path):
    from pykakasi import kakasi
    import csv, os
    name_tezheng = 'log'  # name of the directory holding the feature files
    xinde = 'xinde_log'  # name of the directory the new feature files go into
    houzhui = '.wav.csv'  # suffix of a feature file once the id is removed
    name = 'align1'  # file recording the CCCCSSSS marks
    name1 = 'align1'
    name2 = 'align1.txt'
    shibiejieguo = {}  # dict holding the recognition results
    symbolcidian = {}  # mark dict, shaped like:
    # id: C001L_086
    # ['S', 'S', 'S', 'C', 'S', 'D', 'D', 'D', 'C']
    # id: C001L_087
    # ['S', 'D', 'D', 'C']
    # id: C001L_088
    # ['S', 'S', 'S', 'S', 'D', 'D', 'D', 'D', 'C', 'C']
    zhengjie = {}  # reference-transcript dict
    kakasi = kakasi()
    kakasi.setMode("H", "a")  # Hiragana to ascii, default: no conversion
    kakasi.setMode("K", "a")  # Katakana to ascii, default: no conversion
    kakasi.setMode("J", "a")  # Japanese to ascii, default: no conversion
    kakasi.setMode("r", "Hepburn")  # default: use Hepburn Roman table
    kakasi.setMode("s", True)  # add space, default: no separator
    conv = kakasi.getConverter()
    for i in os.listdir(path):
        path_1 = os.path.join(path, i)
        path_out = os.path.join(path_1, 'keka')
        path_tezheng = os.path.join(path_1, name_tezheng)
        biaozhiwenjian = csv.reader(
            open(os.path.join(path_1, name1), 'r', encoding='EUC-JP'))  # read the mark file
        # biaozhiwenjian = csv.reader(open(os.path.join(path_1, name2), 'r', encoding='utf-8'))  # when the mark file is a .txt file
        biaozhiwenjian_1 = [i for i in biaozhiwenjian]  # a list of lists, e.g.:
        # [['id: l_8840_9810_T1_F_01'], ['REF: そう です か 、 はい 。 '], ['HYP: そう です か はい 。 '],
        #  ['EVAL: C C C D C C '], [], ['id: l_10800_13190_T1_F_01']]
        path_xinde = os.path.join(path_1, xinde)
        mulu.mkdir(path_xinde)
        for i in range(0, len(biaozhiwenjian_1), 5):  # each pass labels one audio file
            ID = ''
            l_biaozhi = []
            l_zhengjie = []
            l_zhengjie_1 = []
            l_jieguo = []
            l_jieguo_1 = []
            ID = biaozhiwenjian_1[i][0].replace('id: ', '')
            l_zhengjie = biaozhiwenjian_1[i + 1][0].split()
            l_zhengjie.pop(0)
            l_jieguo = biaozhiwenjian_1[i + 2][0].split()
            l_jieguo.pop(0)
            l_biaozhi = biaozhiwenjian_1[i + 3][0].split()
            l_biaozhi.pop(0)
            # Build strictly aligned reference / result / mark lists; a 'D' mark means the result slot is empty
            jishuqi_jieguo = 0
            jishuqi_zhengjie = 0
            for i in l_biaozhi:
                if i == "D":
                    l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                    l_jieguo_1.append('')
                    jishuqi_zhengjie += 1
                if i == "C":
                    l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                    l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                    jishuqi_zhengjie += 1
                    jishuqi_jieguo += 1
                if i == "I":
                    l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                    l_zhengjie_1.append('')
                    jishuqi_jieguo += 1
                if i == "S":
                    l_zhengjie_1.append(l_zhengjie[jishuqi_zhengjie])
                    l_jieguo_1.append(l_jieguo[jishuqi_jieguo])
                    jishuqi_zhengjie += 1
                    jishuqi_jieguo += 1
            path_out_1 = os.path.join(path_out, ID + '.out')
            dianout = pi.read_out(path_out_1)
            # final shape: [['', [0, 18]], ['お', [19, 24]], ['願い', [25, 49]], ['三', [50, 82]], ['。', [83, 86]]]
            path_tezheng_1 = os.path.join(path_tezheng, ID + '.wav.csv')
            tezhengzhi = csv.reader(open(path_tezheng_1, 'r', encoding='utf-8'))
            t_file_list = [i for i in tezhengzhi]
            dimension = len(t_file_list[0])
            start = dianout.pop(0)[1][1]  # label the leading silent region 9
            for i in range(start + 1):
                # label the whole leading silent region 9, treating it as correctly recognised
                t_file_list[i].insert(0, '9')
            zhenshubiao = {}  # build a frame-range dict keyed by word
            for i in dianout:
                zhenshubiao[i[0]] = i[1]  # one entry per word of the .out file
            start, end = zhenshubiao['。']  # label the trailing full stop 9
            for i in range(start, end + 1):
                t_file_list[i].insert(0, '9')
            # while 'D' in l_biaozhi:
            #     l_biaozhi.remove('D')  # remove() only deletes one 'D' at a time, hence the while
            l_biaozhi_1 = [i for i, x in enumerate(l_biaozhi) if x == 'S']  # indices of 'S' marks
            if len(l_biaozhi_1) != 0:  # if l_biaozhi_1 is empty, everything was recognised correctly
                for y in l_biaozhi_1:
                    # handle the words marked 'S': convert reference and result to letters and compare once more
                    # first the recognition result
                    if conv.do(l_jieguo_1[y]) == l_jieguo_1[y] and l_jieguo_1[y] != '、':  # is it alphabetic?
                        try:
                            zhuanhuan_jieguo = conv.do(
                                make_kana_convertor._make_kana_convertor(strQ2B.strQ2B(l_jieguo_1[y])))
                        except:
                            zhuanhuan_jieguo = conv.do(
                                make_kana_convertor._make_kana_convertor(l_jieguo_1[y]))
                    else:
                        zhuanhuan_jieguo = conv.do(l_jieguo_1[y])
                    # then the reference transcript
                    if conv.do(l_zhengjie_1[y]) == l_zhengjie_1[y] and l_zhengjie_1[y] != '、':  # is it alphabetic?
                        try:
                            zhuanhuan_zhengjie = conv.do(
                                make_kana_convertor._make_kana_convertor(strQ2B.strQ2B(l_zhengjie_1[y])))
                        except:
                            zhuanhuan_zhengjie = conv.do(
                                make_kana_convertor._make_kana_convertor(l_zhengjie_1[y]))
                    else:
                        zhuanhuan_zhengjie = conv.do(l_zhengjie_1[y])
                    guanjianzi = l_jieguo_1[y]  # take the word that carries the 'S' mark
                    try:
                        start, end = zhenshubiao[guanjianzi]  # take its frame range
                    except:
                        print('ID')
                        print(ID)
                        print('zhenshubiao')
                        print(zhenshubiao)
                        print('guanjianzi')
                        print(guanjianzi)
                        os.system('pause')
                    for i in range(start, end + 1):
                        if zhuanhuan_jieguo == zhuanhuan_zhengjie:
                            t_file_list[i].insert(0, '0')
                        else:
                            t_file_list[i].insert(0, '1')
            jishuqi_tezhengzhi = 0
            for i in t_file_list:  # label the correctly recognised words 0
                # if i[0] != '0' and i[0] != '1' and i[0] != '9':
                if len(i[0]) == dimension:
                    t_file_list[jishuqi_tezhengzhi].insert(0, '0')
                jishuqi_tezhengzhi += 1
            path_xinde_tezhengzhi = os.path.join(path_xinde, ID + '.csv')
            with open(path_xinde_tezhengzhi, 'w+', encoding='utf-8') as mergen_file:
                for i in t_file_list:
                    mergen_file.write('%s\n' % ','.join(i))
        shanchu.shanchuhang(path_xinde)  # delete every feature row labelled 9
def buildVocabulary(self, filePath):
    print('Loading vocabulary from ' + filePath)
    df = pd.read_excel(filePath)
    print('Vocabulary file loaded')
    wordList = []
    partOfSpeechList = {}
    lessonList = {}
    kksi = kakasi()
    kksi.setMode("J", "H")
    for index, row in df.iterrows():
        if self.checkValidData(row):
            lesson_num = row['lesson']
            pos_list = self.parsePartOfSpeech(row['pos'])
            if row['intransitive'] == 't':
                isTransitive = True
            elif row['intransitive'] == 'i':
                isTransitive = False
            else:
                isTransitive = None
            pre_japanese = self.convertNanToEmptyString(row['preJapanese'])
            pre_japanese_particle = self.convertNanToEmptyString(row['preJapaneseParticle'])
            japanese_all_hiragana = row['japaneseAllHiragana']
            japanese = row['japanese']
            post_japanese = self.convertNanToEmptyString(row['postJapanese'])
            pre_english = self.convertNanToEmptyString(row['preEnglish'])
            english = row['english']
            post_english = self.convertNanToEmptyString(row['postEnglish'])
            word = Word(lesson_num, pos_list, isTransitive, pre_japanese,
                        pre_japanese_particle, japanese_all_hiragana, japanese,
                        post_japanese, pre_english, english, post_english)
            wordList.append(word)
            indexOfAddedWord = len(wordList) - 1
            if lesson_num not in lessonList:
                indices = []
            else:
                indices = lessonList[lesson_num]
            indices.append(indexOfAddedWord)
            lessonList[lesson_num] = indices
            for pos in pos_list:
                if pos not in partOfSpeechList:
                    indices = []
                else:
                    indices = partOfSpeechList[pos]
                indices.append(indexOfAddedWord)
                partOfSpeechList[pos] = indices
    # for posType in PartOfSpeech:
    #     for index in partOfSpeechList[posType]:
    #         assert posType in wordList[index].partOfSpeech
    print('Vocabulary built')
    # wordListJson = json.dumps([ob.__dict__ for ob in wordList], indent=4)
    # print(wordListJson)
    with open('../src/vocabulary.json', 'w') as outfile:
        json.dump([ob.__dict__ for ob in wordList], outfile, indent=4)
    print('Vocabulary made into json')
    with open('../src/poslist.json', 'w') as outfile:
        json.dump(partOfSpeechList, outfile, indent=4)
    with open('../src/lessonlist.json', 'w') as outfile:
        json.dump(lessonList, outfile, indent=4)
    quit()
    return wordList, partOfSpeechList, lessonList
value = value.strip("-") pos_list.append(value) for p in pos_list: j = PartOfSpeech.get(p) if(j == True): plist.append(p) if(len(plist) == 0): return None else: return plist else: return None k = kakasi() # Generate kakasi instance k.setMode('J', 'H') #漢字からひらがなに変換 k.setMode('K', 'H') conv = k.getConverter() def English_to_Kana(str, fname): read = str.strip() english = re.compile('[a-zA-Z]+') words = english.findall(read) if((len(words) >= 1) & (fname == "EngDict_")): for w in words: if(len(w) == 1): furigana = alphabet.get(w.upper()) read = read.replace(w, furigana) else: count = 0
def __init__(self):
    self.kakasi = kakasi()
#!/usr/bin/env python3
import os
from typing import List

import marisa_trie
from pykakasi import kakasi

kakasi_py = kakasi()
kakasi_py.setMode('H', 'a')
kakasi_py.setMode('K', 'a')
kakasi_py.setMode('J', 'a')
conv = kakasi_py.getConverter()


def _converting_to_roman(word: str) -> str:
    """Convert the given word to romaji and return it."""
    return conv.do(word)


def _fetch_vowel(word: str) -> str:
    """Return a string containing only the vowels of the word."""
    vowel_str = ""
    for i in _converting_to_roman(word):
        if i in "aiueo":
            vowel_str += i
    return vowel_str


def _find_rhyme_words(word: str) -> List:
    """Search for words that look like they rhyme and return them as a list."""
    files = os.listdir("./dictionary/")
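# A minimal sketch of what the helpers above compute, assuming the H/K/J->a
# converter configured at module level; the input word is illustrative.
print(_converting_to_roman(u"かんたん"))  # expected: kantan
print(_fetch_vowel(u"かんたん"))          # expected: aa (the vowels of "kantan")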
def test_kakasi_structured_constitution():
    original_text = "日本国民は、正当に選挙された国会における代表者を通じて行動し、われらとわれらの子孫のために、" \
                    "諸国民との協和による成果と、わが国全土にわたつて自由のもたらす恵沢を確保し、" \
                    "政府の行為によつて再び戦争の惨禍が起ることのないやうにすることを決意し、ここに主権が国民に存することを宣言し、" \
                    "この憲法を確定する。そもそも国政は、国民の厳粛な信託によるものであつて、その権威は国民に由来し、" \
                    "その権力は国民の代表者がこれを行使し、その福利は国民がこれを享受する。これは人類普遍の原理であり、" \
                    "この憲法は、かかる原理に基くものである。われらは、これに反する一切の憲法、法令及び詔勅を排除する。"
    expected = [
        {'orig': "日本国民", 'kana': "ニホンコクミン", 'hira': "にほんこくみん", 'hepburn': "nihonkokumin", 'kunrei': "nihonkokumin", 'passport': "nihonkokumin"},
        {'orig': "は、", 'kana': "ハ、", 'hira': "は、", 'hepburn': "ha,", 'kunrei': "ha,", 'passport': "ha,"},
        {'orig': "正当", 'kana': "セイトウ", 'hira': "せいとう", 'hepburn': "seitou", 'kunrei': "seitou", 'passport': "seito"},
        {'orig': "に", 'kana': "ニ", 'hira': "に", 'hepburn': "ni", 'kunrei': "ni", 'passport': "ni"},
        {'orig': "選挙", 'kana': "センキョ", 'hira': "せんきょ", 'hepburn': "senkyo", 'kunrei': "senkyo", 'passport': "senkyo"},
        {'orig': "された", 'kana': "サレタ", 'hira': "された", 'hepburn': "sareta", 'kunrei': "sareta", 'passport': "sareta"},
        {'orig': "国会", 'kana': "コッカイ", 'hira': "こっかい", 'hepburn': "kokkai", 'kunrei': "kokkai", 'passport': "kokkai"},
        {'orig': "における", 'kana': "ニオケル", 'hira': "における", 'hepburn': "niokeru", 'kunrei': "niokeru", 'passport': "niokeru"},
        {'orig': "代表者", 'kana': "ダイヒョウシャ", 'hira': "だいひょうしゃ", 'hepburn': "daihyousha", 'kunrei': "daihyousya", 'passport': "daihyousha"},
        {'orig': "を", 'kana': "ヲ", 'hira': "を", 'hepburn': "wo", 'kunrei': "wo", 'passport': "wo"},
        {'orig': "通じ", 'kana': "ツウジ", 'hira': "つうじ", 'hepburn': "tsuuji", 'kunrei': "tuuzi", 'passport': "tsuuji"},
        {'orig': "て", 'kana': "テ", 'hira': "て", 'hepburn': "te", 'kunrei': "te", 'passport': "te"},
        {'orig': "行動", 'kana': "コウドウ", 'hira': "こうどう", 'hepburn': "koudou", 'kunrei': "koudou", 'passport': "kodou"},
        {'orig': "し、", 'kana': "シ、", 'hira': "し、", 'hepburn': "shi,", 'kunrei': "si,", 'passport': "shi,"},
        {'orig': "われらとわれらの", 'kana': "ワレラトワレラノ", 'hira': "われらとわれらの", 'hepburn': "wareratowarerano", 'kunrei': "wareratowarerano", 'passport': "wareratowarerano"},
        {'orig': "子孫", 'kana': "シソン", 'hira': "しそん", 'hepburn': "shison", 'kunrei': "sison", 'passport': "shison"},
        {'orig': "のために、", 'kana': "ノタメニ、", 'hira': "のために、", 'hepburn': "notameni,", 'kunrei': "notameni,", 'passport': "notameni,"},
        {'orig': "諸国民", 'kana': "ショコクミン", 'hira': "しょこくみん", 'hepburn': "shokokumin", 'kunrei': "syokokumin", 'passport': "shokokumin"},
        {'orig': "との", 'kana': "トノ", 'hira': "との", 'hepburn': "tono", 'kunrei': "tono", 'passport': "tono"},
        {'orig': "協和", 'kana': "キョウワ", 'hira': "きょうわ", 'hepburn': "kyouwa", 'kunrei': "kyouwa", 'passport': "kyouwa"},
        {'orig': "による", 'kana': "ニヨル", 'hira': "による", 'hepburn': "niyoru", 'kunrei': "niyoru", 'passport': "niyoru"},
        {'orig': "成果", 'kana': "セイカ", 'hira': "せいか", 'hepburn': "seika", 'kunrei': "seika", 'passport': "seika"},
        {'orig': "と、", 'kana': "ト、", 'hira': "と、", 'hepburn': "to,", 'kunrei': "to,", 'passport': "to,"},
        {'orig': "わが", 'kana': "ワガ", 'hira': "わが", 'hepburn': "waga", 'kunrei': "waga", 'passport': "waga"},
        {'orig': "国", 'kana': "クニ", 'hira': "くに", 'hepburn': "kuni", 'kunrei': "kuni", 'passport': "kuni"},
        {'orig': "全土", 'kana': "ゼンド", 'hira': "ぜんど", 'hepburn': "zendo", 'kunrei': "zendo", 'passport': "zendo"},
        {'orig': "にわたつて", 'kana': "ニワタツテ", 'hira': "にわたつて", 'hepburn': "niwatatsute", 'kunrei': "niwatatute", 'passport': "niwatatsute"},
        {'orig': "自由", 'kana': "ジユウ", 'hira': "じゆう", 'hepburn': "jiyuu", 'kunrei': "ziyuu", 'passport': "jiyuu"},
        {'orig': "のもたらす", 'kana': "ノモタラス", 'hira': "のもたらす", 'hepburn': "nomotarasu", 'kunrei': "nomotarasu", 'passport': "nomotarasu"},
        {'orig': "恵沢", 'kana': "ケイタク", 'hira': "けいたく", 'hepburn': "keitaku", 'kunrei': "keitaku", 'passport': "keitaku"},
        {'orig': "を", 'kana': "ヲ", 'hira': "を", 'hepburn': "wo", 'kunrei': "wo", 'passport': "wo"},
        {'orig': "確保", 'kana': "カクホ", 'hira': "かくほ", 'hepburn': "kakuho", 'kunrei': "kakuho", 'passport': "kakuho"},
        {'orig': "し、", 'kana': "シ、", 'hira': "し、", 'hepburn': "shi,", 'kunrei': "si,", 'passport': "shi,"},
        {'orig': "政府", 'kana': "セイフ", 'hira': "せいふ", 'hepburn': "seifu", 'kunrei': "seifu", 'passport': "seifu"},
        {'orig': "の", 'kana': "ノ", 'hira': "の", 'hepburn': "no", 'kunrei': "no", 'passport': "no"},
        {'orig': "行為", 'kana': "コウイ", 'hira': "こうい", 'hepburn': "koui", 'kunrei': "koui", 'passport': "koi"},
        {'orig': "によつて", 'kana': "ニヨツテ", 'hira': "によつて", 'hepburn': "niyotsute", 'kunrei': "niyotute", 'passport': "niyotsute"},
        {'orig': "再び", 'kana': "フタタビ", 'hira': "ふたたび", 'hepburn': "futatabi", 'kunrei': "futatabi", 'passport': "futatabi"},
        {'orig': "戦争", 'kana': "センソウ", 'hira': "せんそう", 'hepburn': "sensou", 'kunrei': "sensou", 'passport': "senso"},
        {'orig': "の", 'kana': "ノ", 'hira': "の", 'hepburn': "no", 'kunrei': "no", 'passport': "no"},
    ]
    kakasi = pykakasi.kakasi()
    result = kakasi.convert(original_text)
    for i, e in enumerate(expected):
        assert result[i]['orig'] == e['orig']
        assert result[i]['hira'] == e['hira']
        assert result[i]['kana'] == e['kana']
        assert result[i]['hepburn'] == e['hepburn']
        assert result[i]['kunrei'] == e['kunrei']
        assert result[i]['passport'] == e['passport']
# coding=UTF-8
import pykakasi, json, argparse, codecs, traceback
from pathlib import Path

kks = pykakasi.kakasi()
parser = argparse.ArgumentParser(
    description='Takes an LLN-exported Japanese Json file and outputs an anki CSV file '
                'containing word, pronunciation, meaning, and subtitle.')
parser.add_argument('files', metavar='N', type=str, nargs='+',
                    help='Filepaths to plaintext UTF-8 JSON files.')
parser.add_argument('--outdir', type=str, nargs=1,
                    help='File output directory. Defaults to the same as the input paths.')
args = parser.parse_args()
for arg in vars(args):
    if arg != 'files':
        continue
    for file in getattr(args, arg):
#!/usr/local/bin/python3
import os
import sys
from argparse import ArgumentParser

import numpy as np
import json
from tqdm import tqdm

from station import station_generator
from match_score import term_match_score
from speech_sound import consonants, vowels
from preprocess import preprocess
from pykakasi import kakasi

kakasi_h2a = kakasi()
kakasi_h2a.setMode('H', 'a')
conv_h2a = kakasi_h2a.getConverter()
kakasi_j2h = kakasi()
kakasi_j2h.setMode('J', 'H')
conv_j2h = kakasi_j2h.getConverter()


def initial2initials(initial, initials):
    # From a lyric's initial, pick the initials of the search-target words (those sharing the same vowels)
    return [
        i for i in initials
        if vowels(conv_h2a.do(initial)) == vowels(conv_h2a.do(i))
    ]


def initial2stations(stations):
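# A hedged sketch of initial2initials above. It assumes vowels() (imported
# from speech_sound) reduces a romanized string to its vowel sequence; the
# example inputs are illustrative only.
print(initial2initials('か', ['さ', 'き']))  # expected: ['さ'] ('ka' and 'sa' share the vowel 'a')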
from pykakasi import kakasi
from janome.tokenizer import Tokenizer

# pykakasi
kks = kakasi()
convert = kks.convert
# janome
t = Tokenizer()

# Handling Japanese text
'''special symbols'''
symbols = ('、', '。', '’', '”', '{', '}', '「', '」', 'ー', '=', '_', '+', '/', '*', '-', '(', ')')


# Japanese processing function
def dealwith(jp):
    result_roma = ''
    for token in t.tokenize(jp):
        string = str(token)
        origin = string.split('\t')[0]
        if string.split(',')[-1] != '*':
            # use the reading (last field) when janome provides one
            roma = convert(string.split(',')[-1])[0]['hepburn']
        else:
            roma = convert(origin)[0]['hepburn']
        result_roma += roma + ' '
    return result_roma


result = dealwith(input())
import json
import pykakasi
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import lxml.html
import os

kakasi = pykakasi.kakasi()
kakasi.setMode("H", "a")  # Hiragana to ascii, default: no conversion
kakasi.setMode("K", "a")  # Katakana to ascii, default: no conversion
kakasi.setMode("J", "a")  # Japanese to ascii, default: no conversion
kakasi.setMode("r", "Hepburn")  # default: use Hepburn Roman table
kakasi.setMode("s", True)  # add space, default: no separator
conv = kakasi.getConverter()

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('--window-size=1280x1696')
chrome_options.add_argument('--user-data-dir=/tmp/user-data')
chrome_options.add_argument('--hide-scrollbars')
chrome_options.add_argument('--enable-logging')
chrome_options.add_argument('--log-level=0')
chrome_options.add_argument('--v=99')
chrome_options.add_argument('--single-process')
chrome_options.add_argument('--data-path=/tmp/data-path')
chrome_options.add_argument('--ignore-certificate-errors')
import os
import re

import requests
from bs4 import BeautifulSoup
import tinysegmenter
from pykakasi import kakasi

localFile = './testLyrics.txt'
lyricRoot = "../../distribution/llsif-waifu-lyrics"
rootURL = 'http://love-live.wikia.com'
transDict = {u'君': u'きみ', u'見': u'み', u'色': u'いろ', u'来': u'き'}
segmenter = tinysegmenter.TinySegmenter()
kakasi = kakasi()
enCount = 0
jpCount = 0
siteTabList = ['rōmaji', 'kanji', 'english']


def iterateSongList(urlRead='http://love-live.wikia.com/wiki/Category:Aqours_Songs'):
    r = requests.get(urlRead).content
    soup = BeautifulSoup(r, 'lxml')
    contDiv = soup.find("div", {"class": "mw-content-ltr"})
    for ulTag in contDiv.findAll("ul"):
        for liTag in ulTag.findAll("li"):
            title = None
            try:
                title = liTag.find("a").find("img")["alt"]