def dabiaoqian(path):
    """Label the feature frames of every utterance found under *path*.

    For each sub-directory of *path* this reads the alignment file
    ``symbol.txt`` (records made of ``id:``, ``REF:``, ``HYP:`` and ``EVAL:``
    rows), the recogniser output ``keka/<ID>.out`` and the feature file
    ``mizhichuli_log/<ID>.wav.csv``.  A label is prepended to every feature
    row and the result is written to ``xinde_mizhichuli/<ID>.csv``:

    * ``'9'`` - leading silence and everything from the start of the last
      ``.out`` entry to the end of the file (removed afterwards by
      ``shanchu.shanchuhang``),
    * ``'0'`` - frames of words whose (possibly corrected) flag is ``C``,
    * ``'1'`` - frames of every other word.

    A substitution (``S``) is promoted to ``C`` when both words have the same
    romanised reading.

    Args:
        path: directory whose entries are the per-recording directories.

    Relies on the project modules ``mulu``, ``pi``, ``cz``, ``shanchu``,
    ``make_kana_convertor`` and ``strQ2B`` being available at module level.
    """
    from pykakasi import kakasi
    import csv
    import os

    name_tezheng = 'mizhichuli_log'  # directory holding the input features
    xinde = 'xinde_mizhichuli'  # directory receiving the labelled features
    # Alignment file name; 'align1' (EUC-JP) was the earlier alternative.
    name2 = 'symbol.txt'

    converter_setup = kakasi()
    converter_setup.setMode("H", "a")  # Hiragana to ascii
    converter_setup.setMode("K", "a")  # Katakana to ascii
    converter_setup.setMode("J", "a")  # Japanese to ascii
    converter_setup.setMode("r", "Hepburn")  # use Hepburn Roman table
    converter_setup.setMode("s", True)  # add space separator
    conv = converter_setup.getConverter()

    def to_reading(word):
        """Return the romanised reading used to compare REF and HYP words."""
        # A word that the converter leaves untouched is treated as Latin
        # letters: it is turned into kana first (after full-width to
        # half-width normalisation when that succeeds), then romanised.
        if conv.do(word) == word and word != '、':
            try:
                return conv.do(
                    make_kana_convertor._make_kana_convertor(
                        strQ2B.strQ2B(word)))
            except Exception:
                return conv.do(
                    make_kana_convertor._make_kana_convertor(word))
        return conv.do(word)

    for dir_name in os.listdir(path):
        path_1 = os.path.join(path, dir_name)
        path_out = os.path.join(path_1, 'keka')
        path_tezheng = os.path.join(path_1, name_tezheng)

        # Rows look like [['id: l_8840_9810_T1_F_01'], ['REF: ...'],
        # ['HYP: ...'], ['EVAL: C C C D C C '], [], ['id: ...'], ...].
        with open(os.path.join(path_1, name2), 'r',
                  encoding='utf-8') as align_file:
            records = list(csv.reader(align_file))

        path_xinde = os.path.join(path_1, xinde)
        mulu.mkdir(path_xinde)

        for index, record in enumerate(records):
            # Blank separator rows are empty lists; only 'id:' rows start a
            # record.
            if not record or 'id:' not in record[0]:
                continue

            ID = record[0].replace('id: ', '')
            # Drop the leading 'REF:' / 'HYP:' / 'EVAL:' token of each row.
            l_zhengjie = records[index + 1][0].split()[1:]
            l_jieguo = records[index + 2][0].split()[1:]
            l_biaozhi = records[index + 3][0].split()[1:]

            # Build reference / hypothesis lists that line up one-to-one with
            # the flags; a deletion leaves an empty hypothesis slot and an
            # insertion an empty reference slot.
            l_zhengjie_1 = []
            l_jieguo_1 = []
            ref_pos = 0
            hyp_pos = 0
            for flag_pos, flag in enumerate(l_biaozhi):
                if flag == "D":
                    l_zhengjie_1.append(l_zhengjie[ref_pos])
                    l_jieguo_1.append('')
                    ref_pos += 1
                elif flag == "C":
                    l_zhengjie_1.append(l_zhengjie[ref_pos])
                    l_jieguo_1.append(l_jieguo[hyp_pos])
                    ref_pos += 1
                    hyp_pos += 1
                elif flag == "I":
                    l_jieguo_1.append(l_jieguo[hyp_pos])
                    l_zhengjie_1.append('')
                    hyp_pos += 1
                elif flag == "S":
                    zhengjie_hanzi = l_zhengjie[ref_pos]
                    jieguo_hanzi = l_jieguo[hyp_pos]
                    l_zhengjie_1.append(zhengjie_hanzi)
                    l_jieguo_1.append(jieguo_hanzi)
                    # Same reading in both words: count the substitution as
                    # a correct recognition.
                    if to_reading(jieguo_hanzi) == to_reading(zhengjie_hanzi):
                        l_biaozhi[flag_pos] = 'C'
                    ref_pos += 1
                    hyp_pos += 1

            # Entries look like ['word', [first_frame, last_frame]]; the
            # first entry is the leading silence.
            dianout = pi.read_out(os.path.join(path_out, ID + '.out'))
            start = dianout.pop(0)[1][1]
            start_1 = dianout[-1][1][0]  # first frame of the last entry

            with open(os.path.join(path_tezheng, ID + '.wav.csv'), 'r',
                      encoding='utf-8') as feature_file:
                t_file_list = list(csv.reader(feature_file))
            end_1 = len(t_file_list) - 1

            for row in range(start + 1):
                t_file_list[row].insert(0, '9')  # leading silence
            for row in range(start_1, end_1 + 1):
                t_file_list[row].insert(0, '9')  # final entry up to file end

            # The final entry is already labelled, so drop it from the
            # hypothesis list before rebuilding the frame table.
            l_jieguo_1.pop(-1)

            print("ID")
            print(ID)
            print("l_biaozhi")
            print(l_biaozhi)
            print("l_jieguo_1")
            print(l_jieguo_1)
            print("dianout")
            print(dianout)
            dianout_chongzao = cz.chongzao(l_biaozhi, l_jieguo_1, dianout, ID)
            print('dianout_chongzao')
            print(dianout_chongzao)

            # Entries look like ['word', [first_frame, last_frame], 'C'].
            for entry in dianout_chongzao:
                first, last = entry[1]
                label = '0' if entry[2] == 'C' else '1'
                for row in range(first, last + 1):
                    t_file_list[row].insert(0, label)

            path_xinde_tezhengzhi = os.path.join(path_xinde, ID + '.csv')
            with open(path_xinde_tezhengzhi, 'w+',
                      encoding='utf-8') as mergen_file:
                for row in t_file_list:
                    mergen_file.write('%s\n' % ','.join(row))

        # NOTE(review): the original indentation was lost; this call is
        # placed once per recording directory - confirm.
        shanchu.shanchuhang(path_xinde)  # remove every row labelled 9
def dabiaoqian(path):
    """Label the feature frames of every utterance found under *path*.

    For each sub-directory of *path* this reads the EUC-JP alignment file
    ``align1`` (fixed five-row records: ``id:``, ``REF:``, ``HYP:``,
    ``EVAL:`` and a blank row), the recogniser output ``keka/<ID>.out`` and
    the feature file ``log/<ID>.wav.csv``.  A label is prepended to every
    feature row and the result is written to ``xinde_log/<ID>.csv``:

    * ``'9'`` - leading silence and the frames of the ``'。'`` entry
      (removed afterwards by ``shanchu.shanchuhang``),
    * ``'1'`` - frames of a substituted word whose romanised reading differs
      from the reference word,
    * ``'0'`` - everything else.

    Args:
        path: directory whose entries are the per-recording directories.

    Raises:
        KeyError: if a ``.out`` file contains no ``'。'`` entry.

    Relies on the project modules ``mulu``, ``pi``, ``shanchu``,
    ``make_kana_convertor`` and ``strQ2B`` being available at module level.
    """
    from pykakasi import kakasi
    import csv
    import os

    name_tezheng = 'log'  # directory holding the input features
    xinde = 'xinde_log'  # directory receiving the labelled features
    name1 = 'align1'  # alignment file carrying the C/S/D/I flags

    converter_setup = kakasi()
    converter_setup.setMode("H", "a")  # Hiragana to ascii
    converter_setup.setMode("K", "a")  # Katakana to ascii
    converter_setup.setMode("J", "a")  # Japanese to ascii
    converter_setup.setMode("r", "Hepburn")  # use Hepburn Roman table
    converter_setup.setMode("s", True)  # add space separator
    conv = converter_setup.getConverter()

    def to_reading(word):
        """Return the romanised reading used to compare REF and HYP words."""
        # A word that the converter leaves untouched is treated as Latin
        # letters: it is turned into kana first (after full-width to
        # half-width normalisation when that succeeds), then romanised.
        if conv.do(word) == word and word != '、':
            try:
                return conv.do(
                    make_kana_convertor._make_kana_convertor(
                        strQ2B.strQ2B(word)))
            except Exception:
                return conv.do(
                    make_kana_convertor._make_kana_convertor(word))
        return conv.do(word)

    for dir_name in os.listdir(path):
        path_1 = os.path.join(path, dir_name)
        path_out = os.path.join(path_1, 'keka')
        path_tezheng = os.path.join(path_1, name_tezheng)

        # Rows look like [['id: l_8840_9810_T1_F_01'], ['REF: ...'],
        # ['HYP: ...'], ['EVAL: C C C D C C '], [], ['id: ...'], ...].
        with open(os.path.join(path_1, name1), 'r',
                  encoding='EUC-JP') as align_file:
            records = list(csv.reader(align_file))

        path_xinde = os.path.join(path_1, xinde)
        mulu.mkdir(path_xinde)

        # Each record occupies exactly five rows, so step through the ids.
        for index in range(0, len(records), 5):
            ID = records[index][0].replace('id: ', '')
            # Drop the leading 'REF:' / 'HYP:' / 'EVAL:' token of each row.
            l_zhengjie = records[index + 1][0].split()[1:]
            l_jieguo = records[index + 2][0].split()[1:]
            l_biaozhi = records[index + 3][0].split()[1:]

            # Build reference / hypothesis lists that line up one-to-one with
            # the flags; a deletion leaves an empty hypothesis slot and an
            # insertion an empty reference slot.
            l_zhengjie_1 = []
            l_jieguo_1 = []
            ref_pos = 0
            hyp_pos = 0
            for flag in l_biaozhi:
                if flag == "D":
                    l_zhengjie_1.append(l_zhengjie[ref_pos])
                    l_jieguo_1.append('')
                    ref_pos += 1
                elif flag in ("C", "S"):
                    l_zhengjie_1.append(l_zhengjie[ref_pos])
                    l_jieguo_1.append(l_jieguo[hyp_pos])
                    ref_pos += 1
                    hyp_pos += 1
                elif flag == "I":
                    l_jieguo_1.append(l_jieguo[hyp_pos])
                    l_zhengjie_1.append('')
                    hyp_pos += 1

            # Entries look like ['word', [first_frame, last_frame]]; the
            # first entry is the leading silence.
            dianout = pi.read_out(os.path.join(path_out, ID + '.out'))

            with open(os.path.join(path_tezheng, ID + '.wav.csv'), 'r',
                      encoding='utf-8') as feature_file:
                t_file_list = list(csv.reader(feature_file))
            dimension = len(t_file_list[0])  # width of an unlabelled row

            start = dianout.pop(0)[1][1]
            for row in range(start + 1):
                t_file_list[row].insert(0, '9')  # leading silence

            # Word -> frame range.  A word that occurs more than once keeps
            # only its last range (limitation of the original design).
            zhenshubiao = {}
            for entry in dianout:
                zhenshubiao[entry[0]] = entry[1]

            first, last = zhenshubiao['。']
            for row in range(first, last + 1):
                t_file_list[row].insert(0, '9')  # sentence-final period

            # Re-check every substituted word on the romanised reading.
            s_positions = [pos for pos, flag in enumerate(l_biaozhi)
                           if flag == 'S']
            for pos in s_positions:
                guanjianzi = l_jieguo_1[pos]
                same_reading = (to_reading(guanjianzi) ==
                                to_reading(l_zhengjie_1[pos]))
                try:
                    first, last = zhenshubiao[guanjianzi]
                except KeyError:
                    print('ID')
                    print(ID)
                    print('zhenshubiao')
                    print(zhenshubiao)
                    print('guanjianzi')
                    print(guanjianzi)
                    os.system('pause')
                    # Fix: the original fell through and relabelled the
                    # frame range left over from the previous lookup.  The
                    # word is skipped instead; its frames then receive the
                    # default label below and need manual review.
                    continue
                label = '0' if same_reading else '1'
                for row in range(first, last + 1):
                    t_file_list[row].insert(0, label)

            # Every row that is still unlabelled belongs to a word that was
            # not flagged as an error: give it label 0.
            # Fix: the original compared len(row[0]) - the character length
            # of the first cell - with the column count, so this pass
            # practically never matched.
            for row in t_file_list:
                if len(row) == dimension:
                    row.insert(0, '0')

            path_xinde_tezhengzhi = os.path.join(path_xinde, ID + '.csv')
            with open(path_xinde_tezhengzhi, 'w+',
                      encoding='utf-8') as mergen_file:
                for row in t_file_list:
                    mergen_file.write('%s\n' % ','.join(row))

        # NOTE(review): the original indentation was lost; this call is
        # placed once per recording directory - confirm.
        shanchu.shanchuhang(path_xinde)  # remove every row labelled 9
def dabiaoqian(path):
    """Label the feature frames of every utterance found under *path*.

    For each sub-directory of *path* the reference words and the C/S/D/I
    flags come from ``zidian.zidian``; every file in ``log/`` is matched with
    ``keka/<id>.out`` and written, with a label prepended to each row, to
    ``xinde_log/<id>.csv``:

    * ``'9'`` - leading silence and every entry whose text equals that of
      the last ``.out`` entry (removed afterwards by
      ``shanchu.shanchuhang``),
    * ``'0'`` - a recognised word that is one of the reference words flagged
      ``C``, or whose romanised reading equals that of a reference word
      flagged ``S``,
    * ``'1'`` - every other recognised word.

    A frame range that runs past the end of the feature file is clamped to
    the last available row.

    Args:
        path: directory whose entries are the per-recording directories.

    Relies on the project modules ``mulu``, ``zidian``, ``pi``, ``shanchu``,
    ``make_kana_convertor`` and ``strQ2B`` being available at module level.
    """
    from pykakasi import kakasi
    import csv
    import os

    BASE_DIRS = path
    name_tezheng = 'log'  # directory holding the input features
    xinde = 'xinde_log'  # directory receiving the labelled features
    houzhui = '.wav.csv'  # feature file name minus the utterance id

    converter_setup = kakasi()
    converter_setup.setMode("H", "a")  # Hiragana to ascii
    converter_setup.setMode("K", "a")  # Katakana to ascii
    converter_setup.setMode("J", "a")  # Japanese to ascii
    converter_setup.setMode("r", "Hepburn")  # use Hepburn Roman table
    converter_setup.setMode("s", True)  # add space separator
    conv = converter_setup.getConverter()

    for per_dirs in os.listdir(BASE_DIRS):  # e.g. C001L, C001R, ...
        d_9 = os.path.join(BASE_DIRS, per_dirs, xinde)
        mulu.mkdir(d_9)
        # Reference words and flag lists, both keyed by utterance id,
        # e.g. symbolcidian['C001L_087'] == ['S', 'D', 'D', 'C'].
        zhengjie, symbolcidian = zidian.zidian(per_dirs, BASE_DIRS)

        feature_dir = os.path.join(BASE_DIRS, per_dirs, name_tezheng)
        for file_name in os.listdir(feature_dir):
            utt_id = file_name.replace(houzhui, '')
            flags = symbolcidian[utt_id]
            banyun_1 = [pos for pos, flag in enumerate(flags)
                        if flag == 'C']  # positions flagged correct
            banyun_3 = [pos for pos, flag in enumerate(flags)
                        if flag == 'S']  # positions flagged substituted

            t_file = os.path.join(feature_dir, utt_id + houzhui)
            with open(t_file, 'r', encoding='utf-8') as feature_file:
                t_file_list = list(csv.reader(feature_file))

            # Reference words that were recognised correctly.  There can be
            # more flags than reference words; such files need manual repair.
            banyun_2 = []
            for u in banyun_1:
                if u + 1 <= len(zhengjie[utt_id]):
                    banyun_2.append(zhengjie[utt_id][u])
                else:
                    print("手动调一下这个文件吧%s" % utt_id)
                    print("它的正确单词是")
                    print(banyun_2)
                    os.system("pause")

            # Romanised readings of the substituted reference words.
            banyun_4 = []
            for w in banyun_3:
                if w + 1 <= len(zhengjie[utt_id]):
                    banyun_4.append(conv.do(zhengjie[utt_id][w]))
                else:
                    print("手动调一下这个文件吧%s" % utt_id)
                    print("它的认识出现错误的单词是")
                    print(banyun_4)
                    os.system("pause")

            # Entries look like ['word', [first_frame, last_frame]]; the
            # first entry is the leading silence.
            dir_out = os.path.join(BASE_DIRS, per_dirs, 'keka',
                                   utt_id + '.out')
            dianout = pi.read_out(dir_out)
            start = dianout.pop(0)[1][1]
            for row in range(start + 1):
                t_file_list[row].insert(0, '9')  # leading silence

            out_name = os.path.split(dir_out)[1]
            row_count = len(t_file_list)
            for y in dianout:
                word = y[0]
                if word == '':  # skip silence entries
                    continue

                first = y[1][0]
                # Fix: the original had a second, diverged copy of this loop
                # body for ranges that run past the end of the feature file.
                # Its letter branches iterated up to len(t_file_list)
                # inclusive (IndexError), it skipped the full-width
                # normalisation and it lacked the reading-match case.
                # Clamping the last frame lets one body serve both cases.
                last = min(y[1][1], row_count - 1)

                if word == dianout[-1][0]:
                    # Same text as the final entry: label 9.
                    for row in range(first, last + 1):
                        t_file_list[row].insert(0, '9')
                    continue

                if word in banyun_2:
                    label = '0'
                    banyun_2.remove(word)  # each reference word counts once
                elif conv.do(word) == word and word != '、':
                    # The converter left the word untouched, so it is made
                    # of Latin letters: go through kana before romanising.
                    print("发现识别结果中的字母%s" % word)
                    print("它在文件%s" % dir_out)
                    try:
                        zhuanhuazhi = conv.do(
                            make_kana_convertor._make_kana_convertor(
                                strQ2B.strQ2B(word)))
                    except Exception:
                        zhuanhuazhi = conv.do(
                            make_kana_convertor._make_kana_convertor(word))
                    if zhuanhuazhi in banyun_4:
                        print("转化之后的字母为%s" % zhuanhuazhi)
                        label = '0'
                        banyun_4.remove(zhuanhuazhi)
                    else:
                        label = '1'
                elif conv.do(word) in banyun_4:
                    # Written differently but read the same as a substituted
                    # reference word.
                    label = '0'
                    banyun_4.remove(conv.do(word))
                else:
                    label = '1'

                print("正在为文件 %s 的单词 %s 打标签" % (out_name, word))
                for row in range(first, last + 1):
                    t_file_list[row].insert(0, label)

            out_path = os.path.join(BASE_DIRS, per_dirs, xinde,
                                    utt_id + '.csv')
            with open(out_path, 'w+', encoding='utf-8') as mergen_file:
                for row in t_file_list:
                    mergen_file.write('%s\n' % ','.join(row))

        # NOTE(review): the original indentation was lost; this call is
        # placed once per recording directory - confirm.
        shanchu.shanchuhang(d_9)  # remove every row labelled 9
else: # 如果C标志的索引号大于正解文单词的索引号,那就只能手动去调整了 print("手动调一下这个文件吧%s" % id) print("它的认识出现错误的单词是") print(banyun_4) os.system("pause") # print(banyun_2) # os.system("pause") # for p in symbolcidian[id]: # os.system("pause") # # while p == 'C': # print(p.index('C')) dir_out = os.path.join(BASE_DIRS, per_dirs, 'keka', id + '.out') dianout = pi.read_out(dir_out) #提取出来的帧号跟julius识别结果一样 # print(dianout) # os.system('pause') # 最后的效果:[['', [0, 18]], ['お', [19, 24]], ['願い', [25, 49]], ['三', [50, 82]], ['。', [83, 86]]] # [ 37 58] 0.562999 で+接続詞 [で] start = dianout.pop(0)[1][1] # print(start) for i in range(start + 1): t_file_list[i].insert(0, '9') #最前面的无音区间全部都打标签9,把它们当做正确认识来处理 for y in dianout: #dianout是识别结果跟对应的帧数表 # print("此时的单词是%s"%y)
def changpoyin(data, files_dir, i):
    """Split long-vowel phonemes (``'x:'``) into ``'x'`` followed by ``'u'``.

    *data* is a phoneme-level list of ``[label, [first_frame, last_frame]]``
    entries.  For every label that contains ``':'`` the word-level result
    (read from *files_dir* with ``'_yinsu'`` removed from the path) is
    searched for the word whose frame range contains the phoneme's first
    frame.  The phoneme is split at the midpoint of its range into the bare
    vowel and a trailing ``'u'`` when the word's reading ends in ``'u'``,
    the reading is ``'nado'``, or ``zifudingwei`` reports ``'u'`` as the
    letter following the vowel.  Otherwise the phoneme is kept unchanged.

    Args:
        data: phoneme-level entries; not modified (a deep copy is used).
        files_dir: path of the phoneme-level file.
        i: utterance id; unused, kept for interface compatibility.

    Returns:
        A new list of phoneme entries.
    """
    from pykakasi import kakasi

    converter_setup = kakasi()
    converter_setup.setMode("H", "a")  # Hiragana to ascii
    converter_setup.setMode("K", "a")  # Katakana to ascii
    converter_setup.setMode("J", "a")  # Japanese to ascii
    converter_setup.setMode("r", "Hepburn")  # use Hepburn Roman table
    converter_setup.setMode("s", True)  # add space separator
    conv = converter_setup.getConverter()

    files_dir_1 = files_dir.replace('_yinsu', '')
    data_danci = pipei.read_out(files_dir_1)  # word-level entries

    def split_long_vowel(phoneme):
        """Return [vowel, 'u'] entries covering the range of *phoneme*."""
        fenjie = (phoneme[1][0] + phoneme[1][1]) // 2
        vowel = copy.deepcopy(phoneme)
        vowel[1][1] = fenjie
        vowel[0] = vowel[0].replace(':', '')
        trailing_u = copy.deepcopy(phoneme)
        trailing_u[1][0] = fenjie + 1
        trailing_u[0] = 'u'
        return [vowel, trailing_u]

    data_2 = []
    for danci in copy.deepcopy(data):
        if ':' not in danci[0]:
            data_2.append(danci)  # ordinary phoneme: copy through
            continue

        zhenshu = danci[1][0]
        for danci_1 in data_danci:
            if not (danci_1[1][0] <= zhenshu <= danci_1[1][1]):
                continue
            tanngou = conv.do(danci_1[0])
            if danci_1[0] == tanngou:
                # The converter left the word untouched: treat it as Latin
                # letters and convert it with ztok instead.
                tanngou = ztok._make_kana_convertor(danci_1[0])
            # endswith() replaces tanngou[-1] so an empty reading cannot
            # raise; assumes ztok returns a str - TODO confirm.
            # TODO: the original carries a disabled variant that would also
            # split on readings ending in 'i'.
            if (tanngou.endswith('u') or tanngou == 'nado'
                    or zifudingwei(tanngou, danci[0].replace(':', ''),
                                   files_dir_1) == 'u'):
                data_2.extend(split_long_vowel(danci))
            else:
                data_2.append(danci)
            break
        else:
            # Fix: a long-vowel phoneme that falls inside no word range used
            # to be dropped from the output; keep it unchanged instead.
            data_2.append(danci)
    return data_2
txtwenjian = csv.reader(open(csv_path, 'r', encoding='utf-8')) b = [i for i in txtwenjian] b_new = [] zhongzhuan = [] for name in os.listdir(path): a = name[6:9] zhengjie = b[int(a) - 1] #此时的zhengjie是一个list zhuanghuan = list(zhengjie[0]) zhuanghuan.extend('。')#给每一句话的最后都加上一个句号 zhuanghuan = ''.join(zhuanghuan) out_file = os.path.join(path,name) out_list = pipei.read_out(out_file) os.system("pause") # # print(b[int(a) - 1]) # #b[]是正解文 # newlist.append() m = 0 # print(out_list) # print(len(out_list)) new = list(zhuanghuan) # zhengjie[0]是一个字符串 for q in out_list: if q[0] == ''or q[0]=='。': pass else: # print(q[0]) # print(len(q[0]))
# coding=utf-8
# Scratch script: read one recogniser .out file with pipei.read_out, print
# the parsed entries, pop the first entry and print the last frame of its
# range, wait for a key press, then print the remaining entries.
# (An earlier, disabled experiment read the EUC-JP alignment file with
# csv.reader and printed every fifth row.)
import csv
import os

import pipei as pi

path = r'C:\Users\a7825\Desktop\工作空间\语音数据\RWCP-SP96-要切\第一批 - 副本 (2)\C1_F_05\keka/l_1090_2300_C1_F_05.out'

# The frame numbers are taken over from the julius recognition result.
dianout = pi.read_out(path)
print(dianout)

# Each entry is indexed as entry[1][1] to get the last frame of its range.
start = dianout.pop(0)[1][1]
print(start)

os.system('pause')
print(dianout)
def dabiaoqian(path, guanjianzi_1, guanjianzi_2):
    """Label the feature frames of every utterance found under *path*.

    For each sub-directory of *path* this reads the alignment file
    ``symbol.txt`` (records made of ``id:``, ``REF:``, ``HYP:`` and ``EVAL:``
    rows), the recogniser output ``keka/<ID>.out`` and the feature file
    ``<guanjianzi_1>/<ID>.wav.csv``.  A label is prepended to every feature
    row and the result is written to ``<guanjianzi_2>/<ID>.csv``:

    * ``'9'`` - leading silence and everything from the start of the last
      ``.out`` entry to the end of the file (removed afterwards by
      ``shanchu.shanchuhang``),
    * ``'0'`` - frames counted as correctly recognised,
    * ``'1'`` - frames counted as misrecognised.

    Substituted (``S``) and inserted (``I``) words are compared with their
    reference words on the romanised reading.  When the reference reading is
    a proper part of the recognised reading, ``y.yinsu`` supplies a phoneme
    frame range that is labelled 1 and the rest of the word is labelled 0.

    An utterance whose leading silence ends beyond the feature file is
    skipped and no output file is written for it.

    Args:
        path: directory whose entries are the per-recording directories.
        guanjianzi_1: name of the sub-directory holding the input features.
        guanjianzi_2: name of the sub-directory receiving the labelled
            features.

    Relies on the project modules ``mulu``, ``pi``, ``cz``, ``y``,
    ``shanchu``, ``make_kana_convertor`` and ``strQ2B`` being available at
    module level.
    """
    from pykakasi import kakasi  # converts words to romanised readings
    import csv
    import os

    name_tezheng = guanjianzi_1
    xinde = guanjianzi_2
    # Alignment file name; 'align1' (EUC-JP) was the earlier alternative.
    name2 = 'symbol.txt'

    converter_setup = kakasi()
    converter_setup.setMode("H", "a")  # Hiragana to ascii
    converter_setup.setMode("K", "a")  # Katakana to ascii
    converter_setup.setMode("J", "a")  # Japanese to ascii
    converter_setup.setMode("r", "Hepburn")  # use Hepburn Roman table
    converter_setup.setMode("s", True)  # add space separator
    conv = converter_setup.getConverter()

    def to_reading(word):
        """Return the romanised reading used to compare REF and HYP words."""
        # A word that the converter leaves untouched is treated as Latin
        # letters: it is turned into kana first (after full-width to
        # half-width normalisation when that succeeds), then romanised.
        if conv.do(word) == word and word != '、':
            try:
                return conv.do(
                    make_kana_convertor._make_kana_convertor(
                        strQ2B.strQ2B(word)))
            except Exception:
                return conv.do(
                    make_kana_convertor._make_kana_convertor(word))
        return conv.do(word)

    def mark(rows, first, last, label):
        """Prepend *label* to rows first..last (inclusive)."""
        for row in range(first, last + 1):
            rows[row].insert(0, label)

    def mark_partial(rows, width, reading, first, last, utt_id, speaker_dir):
        """Label a word that only partly matches its reference word."""
        start_yinsu, end_yinsu = y.yinsu(reading, first, last, utt_id,
                                         speaker_dir)
        if start_yinsu == 0 or end_yinsu == 0:
            # No matching phonemes found: the whole word is wrong.
            mark(rows, first, last, '1')
            return
        mark(rows, start_yinsu, end_yinsu, '1')
        # Rows of the word that are still at their original width have not
        # been labelled yet; they get label 0.
        for row in range(first, last + 1):
            if len(rows[row]) == width:
                rows[row].insert(0, '0')

    for dir_name in os.listdir(path):
        path_1 = os.path.join(path, dir_name)
        path_out = os.path.join(path_1, 'keka')
        path_tezheng = os.path.join(path_1, name_tezheng)

        # Rows look like [['id: l_8840_9810_T1_F_01'], ['REF: ...'],
        # ['HYP: ...'], ['EVAL: C C C D C C '], [], ['id: ...'], ...].
        with open(os.path.join(path_1, name2), 'r',
                  encoding='utf-8') as align_file:
            records = list(csv.reader(align_file))

        path_xinde = os.path.join(path_1, xinde)
        mulu.mkdir(path_xinde)

        for index, record in enumerate(records):
            # Blank separator rows are empty lists; only 'id:' rows start a
            # record.
            if not record or 'id:' not in record[0]:
                continue

            ID = record[0].replace('id: ', '')
            # Drop the leading 'REF:' / 'HYP:' / 'EVAL:' token of each row.
            l_zhengjie = records[index + 1][0].split()[1:]
            l_jieguo = records[index + 2][0].split()[1:]
            l_biaozhi = records[index + 3][0].split()[1:]

            # Build reference / hypothesis lists that line up one-to-one with
            # the flags; a deletion leaves an empty hypothesis slot and an
            # insertion an empty reference slot.
            l_zhengjie_1 = []
            l_jieguo_1 = []
            ref_pos = 0
            hyp_pos = 0
            for flag_pos, flag in enumerate(l_biaozhi):
                if flag == "D":  # deletion error
                    l_zhengjie_1.append(l_zhengjie[ref_pos])
                    l_jieguo_1.append('')
                    ref_pos += 1
                elif flag == "C":  # correct
                    l_zhengjie_1.append(l_zhengjie[ref_pos])
                    l_jieguo_1.append(l_jieguo[hyp_pos])
                    ref_pos += 1
                    hyp_pos += 1
                elif flag == "I":  # insertion error
                    l_jieguo_1.append(l_jieguo[hyp_pos])
                    l_zhengjie_1.append('')
                    hyp_pos += 1
                elif flag == "S":  # substitution error
                    zhengjie_hanzi = l_zhengjie[ref_pos]
                    jieguo_hanzi = l_jieguo[hyp_pos]
                    l_zhengjie_1.append(zhengjie_hanzi)
                    l_jieguo_1.append(jieguo_hanzi)
                    # Same reading in both words: count the substitution as
                    # a correct recognition.
                    if to_reading(jieguo_hanzi) == to_reading(zhengjie_hanzi):
                        l_biaozhi[flag_pos] = 'C'
                    ref_pos += 1
                    hyp_pos += 1

            # Entries look like ['word', [first_frame, last_frame]]; the
            # first entry is the leading silence.
            dianout = pi.read_out(os.path.join(path_out, ID + '.out'))
            start = dianout.pop(0)[1][1]
            start_1 = dianout[-1][1][0]  # first frame of the last entry

            with open(os.path.join(path_tezheng, ID + '.wav.csv'), 'r',
                      encoding='utf-8') as feature_file:
                t_file_list = list(csv.reader(feature_file))
            # The last frame may have been lost during feature extraction,
            # so the end of the file is used instead of the .out end frame.
            end_1 = len(t_file_list) - 1
            changdu = len(t_file_list[0])  # width of an unlabelled row

            if start >= len(t_file_list):
                # The leading silence already runs past the features:
                # discard this utterance.
                continue

            mark(t_file_list, 0, start, '9')  # leading silence
            mark(t_file_list, start_1, end_1, '9')  # final entry to file end

            # The final entry is already labelled, so drop it from the
            # hypothesis list before rebuilding the frame table.
            l_jieguo_1.pop(-1)

            print("ID")
            print(ID)
            print("l_biaozhi")
            print(l_biaozhi)
            print("l_jieguo_1")
            print(l_jieguo_1)
            print("dianout")
            print(dianout)
            dianout_chongzao = cz.chongzao(l_biaozhi, l_jieguo_1, dianout,
                                           ID, l_zhengjie_1)
            print('dianout_chongzao')
            print(dianout_chongzao)

            # Entries start with ['word', [first_frame, last_frame], flag].
            # NOTE(review): entry[3] looks like the reference word; for
            # insertions entry[3] / entry[4] look like the neighbouring
            # reference words - confirm against cz.chongzao.
            for entry in dianout_chongzao:
                print(entry)
                first, last = entry[1]
                flag = entry[2]

                if flag == 'C':
                    mark(t_file_list, first, last, '0')

                elif flag == 'S':
                    hyp = conv.do(entry[0])
                    ref = conv.do(entry[3])
                    if hyp in ref:
                        # Recognised reading is contained in the reference.
                        mark(t_file_list, first, last, '0')
                    elif ref in hyp and len(ref) != len(hyp):
                        # Reference is a proper part of the recognised
                        # reading: decide on the phoneme level.
                        mark_partial(t_file_list, changdu, ref, first, last,
                                     ID, path_1)
                    else:
                        mark(t_file_list, first, last, '1')

                elif flag == 'I':
                    # An inserted word counts as correct when its reading is
                    # contained in one of the two comparison words.
                    hyp = conv.do(entry[0])
                    ref_a = conv.do(entry[3])
                    if hyp in ref_a and ref_a != '':
                        mark(t_file_list, first, last, '0')
                    else:
                        # entry[4] is only read when the first comparison
                        # fails, exactly as in the original expression.
                        ref_b = conv.do(entry[4])
                        if hyp in ref_b and ref_b != '':
                            mark(t_file_list, first, last, '0')
                        elif (ref_a in hyp and len(ref_a) != len(hyp)
                              and ref_a != ''):
                            mark_partial(t_file_list, changdu, ref_a, first,
                                         last, ID, path_1)
                        elif (ref_b in hyp and len(ref_b) != len(hyp)
                              and ref_b != ''):
                            mark_partial(t_file_list, changdu, ref_b, first,
                                         last, ID, path_1)
                        else:
                            mark(t_file_list, first, last, '1')

                else:
                    # Any other flag is labelled as an error.
                    mark(t_file_list, first, last, '1')

            path_xinde_tezhengzhi = os.path.join(path_xinde, ID + '.csv')
            with open(path_xinde_tezhengzhi, 'w+',
                      encoding='utf-8') as mergen_file:
                for row in t_file_list:
                    mergen_file.write('%s\n' % ','.join(row))

        # NOTE(review): the original indentation was lost; this call is
        # placed once per recording directory - confirm.
        shanchu.shanchuhang(path_xinde)  # remove every row labelled 9