def train_2_baidu():
    """Synthesize AISHELL transcripts via the Baidu TTS helper `gen_wav` and
    write a JSON-lines manifest (key / duration / text) for the generated wavs.

    NOTE(review): uses `" ".join(ps).decode("utf-8")` — this only works on
    Python 2 byte strings; the whole function assumes Python 2.
    """
    out_file2 = codecs.open(_data_path + "/wav/S0002.txt", 'w', encoding="utf-8")
    for i, line in enumerate(
            open(_data_path +
                 "/resource_aishell/aishell_transcript_v0.8.txt").readlines()):
        # Randomized synthesis config (disabled; fixed values are used below):
        # cfg = {
        #     'spd': random.randint(4, 6),  # speed, range 0-9, default 5 (medium)
        #     'pit': random.randint(4, 7),  # pitch, range 0-9, default 5 (medium)
        #     'vol': random.randint(4, 7),  # volume, range 0-15, default 5 (medium)
        #     'per': random.randint(0, 3)   # voice: 0 female, 1 male, 3/4 emotional voices, default female
        # }
        # Each transcript line is "<utterance-id> <text>".
        file1, d = line.strip().split(" ", 1)
        # Only synthesize utterances with ids below BAC009S0003W0121.
        if file1 >= "BAC009S0003W0121":
            continue
        cfg = {'spd': 5, 'pit': 5, 'vol': 5, 'per': 0}
        # d = json.loads(line.strip())
        # chars 6..10 of the utterance id are the speaker id (subdirectory).
        path = os.path.join(
            _data_path, "wav", file1[6:11],
            "%s_%s_%s_%s_%s.%s" % (file1, cfg.get("spd"), cfg.get("pit"),
                                   cfg.get("vol"), cfg.get("per"), "mp3"))
        gen_wav(d, cfg, path)
        # Convert the synthesized mp3 to wav alongside it.
        AudioSegment.from_mp3(path).export(path[:-3] + "wav", format="wav")
        ps = generate_zi_label(d)
        # Hand-built JSON record; key points at the converted wav file.
        lin = "{\"key\":\"" + path[:-3] + "wav" + "\", \"duration\": " + str(
            get_duration_wave(path[:-3] + "wav")) + ", \"text\":\"" + " ".join(
                ps).decode("utf-8") + "\"}"
        out_file2.write(lin + "\n")
        # Flush every 100 records so progress survives interruption.
        if (i + 1) % 100 == 0:
            out_file2.flush()
    out_file2.close()
def xiaoshuo_2_word():
    """Build a JSON-lines manifest for the 8k 'xiaoshuo' wavs, skipping
    utterances with out-of-vocabulary characters, digits, or duration > 16 s.

    NOTE(review): `.decode("utf8")`/`.encode("utf8")` on str implies Python 2.
    """
    # Known-character vocabulary, one "char,<id>" row per line.
    d = set()
    for i, line in enumerate(open("resources/unicodemap_zi.csv").readlines()):
        d.add(line.rsplit(",", 1)[0])
    DIR = "/export/aiplatform/8k/"
    out_file = open(DIR + 'resulttxtnew26.json', 'w')
    for i in glob.glob(DIR + "resulttxtnew26/*/*.wav"):
        # The transcript .txt lives at the same relative path, outside "8k/".
        txt = "".join([
            line.strip()
            for line in open(i.replace("8k/", "")[:-3] + "txt").readlines()
        ])
        # Normalize full-width chars to half-width, round-tripping via unicode.
        txt = strQ2B(txt.strip().decode("utf8")).encode("utf8")
        ps = generate_zi_label(deletePunc(txt))
        if len(ps) == 0:
            continue
        # Reject the whole utterance if any character is unknown or a digit.
        flag = False
        for p in ps:
            if p not in d or p.isdigit():
                print("not in d is %s %s. %s" % (p, [p], "".join(ps)))
                flag = True
                break
        if flag:
            continue
        duration = get_duration_wave(i)
        if duration > 16:
            continue
        line = "{\"key\":\"" + i + "\", \"duration\": " + str(
            duration) + ", \"text\":\"" + " ".join(ps) + "\"}"
        out_file.write(line + "\n")
    out_file.close()
def client_2_word():
    """Pair wav paths with their label lines and emit one JSON record each."""
    DIR = "/export/aiplatform/client_files4/"

    def compare(x, y):
        # Order two directory entries by creation time (helper currently unused).
        ctime_x = os.stat(DIR + "/" + x).st_ctime
        ctime_y = os.stat(DIR + "/" + y).st_ctime
        if ctime_x < ctime_y:
            return -1
        if ctime_x > ctime_y:
            return 1
        return 0

    wav_lines = open(DIR + 'wav.txt').readlines()
    label_lines = open(DIR + 'label.txt').readlines()
    out_file = open(DIR + 'client4.json', 'w')
    for idx, (wav_line, label_line) in enumerate(zip(wav_lines, label_lines)):
        # Strip both half- and full-width punctuation before labeling.
        cleaned = label_line.replace(",", "").replace("。", "").replace(",", "").strip()
        tokens = generate_zi_label(cleaned)
        audio_path = DIR + wav_line.strip()
        duration = get_duration_wave(audio_path)
        record = ("{\"key\":\"" + audio_path + "\", \"duration\": " +
                  str(duration) + ", \"text\":\"" + " ".join(tokens) + "\"}")
        out_file.write(record + "\n")
    out_file.close()
def gen_label():
    """Convert each non-empty transcript line into space-separated character labels."""
    src = open(_data_path + "001_baidu.txt")
    dst = open(_data_path + "001_baidu_2.txt", 'w')
    for idx, raw in enumerate(src.readlines()):
        stripped = raw.strip()
        if not stripped:
            continue
        labels = generate_zi_label(deletePunc(stripped))
        dst.write(" ".join(labels) + "\n")
        # Flush every 10 lines so partial output survives an interrupt.
        if idx % 10 == 0:
            dst.flush()
    dst.close()
def search_2_word():
    """Build a JSON-lines manifest from the search corpus transcript file."""
    entries = open("/export/aiplatform/search/transcript").readlines()
    out_file = open('resources/search.json', 'w')
    for entry in entries:
        # split(" ", 2): first token is the relative wav path, second is
        # unused here, the remainder is the transcript text.
        rel_path, _unused, text = entry.split(" ", 2)
        tokens = generate_zi_label(text.strip())
        audio_path = "/export/aiplatform/search/" + "wav/" + rel_path
        duration = get_duration_wave(audio_path)
        record = ("{\"key\":\"" + audio_path + "\", \"duration\": " +
                  str(duration) + ", \"text\":\"" + " ".join(tokens) + "\"}")
        out_file.write(record + "\n")
    out_file.close()
def aia_2_word(DIR):
    """Convert the .scp index files under DIR into two JSON-lines manifests:
    <dir>.json for utterances whose characters are all in the vocabulary, and
    <dir>.miss.json for utterances containing unknown characters; the unknown
    characters themselves are written to <dir>.miss.

    NOTE(review): `.decode("utf8")`/`.encode("utf8")` on str implies Python 2.
    """
    # Index files, excluding any noise-augmented variants.
    scp = [i for i in glob.glob(DIR + "/*/*.scp") if "noise" not in i]
    dir_name = DIR.rsplit("/", 1)[1]
    out_file = codecs.open(_data_path + 'fanlu/' + dir_name + '.json', 'w', encoding="utf-8")
    d = set()  # known-character vocabulary
    e = set()  # characters encountered but missing from the vocabulary
    for i, line in enumerate(open("resources/unicodemap_zi.csv").readlines()):
        d.add(line.rsplit(",", 1)[0])
    out_file2 = codecs.open(_data_path + 'fanlu/' + dir_name + '.miss.json', 'w', encoding="utf-8")
    for j in scp:
        for m, line in enumerate(open(j).readlines()):
            # print(line)
            # Each .scp line is "<file-name>\t<transcript>".
            file_name, txt = line.strip().split("\t", 1)
            # Wav path mirrors the .scp name (minus extension) as a directory.
            path = "/export/fanlu/" + '16k/' + dir_name + "/" + j.rsplit(
                "/", 1)[1].replace(".scp", "") + "/" + file_name + ".wav"
            if not os.path.exists(path):
                print("%s not exist" % path)
                continue
            duration = get_duration_wave(path)
            if duration > 16:
                print("%s longer than 16s" % path)
                continue
            # Normalize full-width chars to half-width.
            txt = strQ2B(txt.strip().decode("utf8")).encode("utf8")
            ps = generate_zi_label(deletePunc(txt))
            if len(ps) == 0:
                continue
            # Manifest key points at the aiplatform mirror of the wav path.
            line = "{\"key\":\"" + path.replace(
                "fanlu", "aiplatform") + "\", \"duration\": " + str(
                    duration) + ", \"text\":\"" + " ".join(
                        [p.decode("utf-8") for p in ps]) + "\"}"
            # Route to .miss.json when any character is out of vocabulary.
            flag = False
            for p in ps:
                if p not in d:
                    e.add(p)
                    print("not in d is %s %s. %s" % (p, [p], "".join(ps)))
                    flag = True
                    break
            if flag:
                out_file2.write(line + "\n")
            else:
                out_file.write(line + "\n")
    out_file.close()
    out_file2.close()
    # Dump the set of unknown characters for vocabulary maintenance.
    out_file1 = open(_data_path + 'fanlu/' + dir_name + '.miss', 'w')
    for i in e:
        out_file1.write(i + "\n")
    out_file1.close()
def check_biaozhu():
    """Compare manual vs. machine transcripts from bdp1.txt, write the valid
    manual labels to bdp2.txt as JSON lines, and print the character error
    rate (CER).

    Fixes vs. original: the running character total no longer shadows the
    builtin `all`, and the final CER print no longer raises ZeroDivisionError
    when no valid annotation line was found.

    NOTE(review): `.encode("utf-8")` followed by str.replace implies Python 2
    byte strings.
    """
    f = _data_path + "bdp1.txt"
    import json
    count = 0        # accumulated Levenshtein distance (character errors)
    total_chars = 0  # total manual characters (was `all`, shadowing the builtin)
    amount = 0       # number of annotation lines scored
    import codecs
    wfobj = codecs.open(_data_path + "bdp2.txt", 'w', encoding="utf-8")
    for i in open(f).readlines():
        d = json.loads(i.strip())
        # Strip half/full-width punctuation from both transcripts.
        manual = d.get("manual", "").encode("utf-8").replace(",", "").replace(
            "。", "").replace(",", "").replace(".", "")
        machine = d.get("machine", "").encode("utf-8").replace(
            ",", "").replace("。", "").replace(",", "").replace(".", "")
        # Skip lines the annotator flagged with quality codes A-E.
        if "A" not in manual and "B" not in manual and "C" not in manual and "D" not in manual and "E" not in manual:
            manuals = generate_zi_label(manual)
            machines = generate_zi_label(machine)
            l_distance = levenshtein_distance(manuals, machines)
            count += l_distance
            total_chars += len(manuals)
            amount += 1
            wav_file = "/export/aiplatform/data_label/task0/" + d.get(
                "name", "")
            duration = get_duration_wave(wav_file)
            # Still counted in the CER stats above; only the manifest skips it.
            if duration > 16:
                continue
            c = {
                "key": wav_file,
                "duration": str(duration),
                "text": " ".join([m.decode("utf-8") for m in manuals])
            }
            wfobj.write(json.dumps(c, ensure_ascii=False) + "\n")
    wfobj.close()
    if total_chars:
        print("amount: %d, error: %d, all: %d, cer: %.4f" %
              (amount, count, total_chars, count / float(total_chars)))
    else:
        # No scorable lines: report zeros instead of dividing by zero.
        print("amount: %d, error: %d, all: %d, cer: %.4f" %
              (amount, count, total_chars, 0.0))
def ai_thchs30_2_word():
    """Write a JSON-lines manifest for the 8k THCHS-30 wav files."""
    ori_wavs = glob.glob(_data_path + "/thchs30/data_thchs30/8k/data/*.wav")
    out_file = open(_data_path + "/thchs30/data_thchs30/8k/thchs30_data.json",
                    'w')
    for wav in ori_wavs:
        parent, base = wav.rsplit("/", 1)
        # The .trn transcript sits beside the original (non-8k) wav; its
        # first line is the character transcript.
        transcript = open(wav.replace("8k", "") + ".trn").readlines()[0].strip()
        tokens = generate_zi_label(transcript)
        duration = get_duration_wave(wav)
        record = ("{\"key\":\"" + wav + "\", \"duration\": " + str(duration) +
                  ", \"text\":\"" + " ".join(tokens) + "\"}")
        out_file.write(record + "\n")
    out_file.close()
def ai_2_word():
    """Split the AISHELL transcript into train / validation / test JSON-lines
    manifests by speaker id (chars 6..10 of the utterance id):
    <= S0723 -> train, <= S0763 -> dev, otherwise test.

    Fix vs. original: the three copy-pasted record-writing branches are
    collapsed into one private helper; output is byte-identical.
    """

    def _write_record(out, wav, ps):
        # One JSON line per utterance: key (wav path), duration (s), text.
        duration = get_duration_wave(wav)
        out.write("{\"key\":\"" + wav + "\", \"duration\": " + str(duration) +
                  ", \"text\":\"" + " ".join(ps) + "\"}" + "\n")

    lines = open(
        _data_path +
        "data_aishell/transcript/aishell_transcript_v0.8.txt").readlines()
    out_file = open(_data_path + "data_aishell/wav8000/aishell_train_8k.json",
                    'w')
    out_file1 = open(
        _data_path + "data_aishell/wav8000/aishell_validation_8k.json", 'w')
    out_file2 = open(_data_path + "data_aishell/wav8000/aishell_test_8k.json",
                     'w')
    for line in lines:
        # Each line: "<utterance-id> <char> <char> ..." — rejoin the text.
        rs = line.strip().split(" ")
        ps = generate_zi_label("".join(rs[1:]))
        speaker = rs[0][6:11]
        if speaker <= "S0723":
            wav = (_data_path + "data_aishell/wav8000/train/" + speaker + "/" +
                   rs[0] + ".wav")
            _write_record(out_file, wav, ps)
        elif speaker <= "S0763":
            wav = (_data_path + "data_aishell/wav8000/dev/" + speaker + "/" +
                   rs[0] + ".wav")
            _write_record(out_file1, wav, ps)
        else:
            # NOTE(review): the test split reads from "wav/test/" while
            # train/dev use "wav8000/" — kept as-is; confirm this asymmetry
            # is intentional.
            wav = (_data_path + "data_aishell/wav/test/" + speaker + "/" +
                   rs[0] + ".wav")
            _write_record(out_file2, wav, ps)
    out_file.close()
    out_file1.close()
    out_file2.close()
def deal_wave():
    """Build a JSON-lines manifest for the WAVE corpus, skipping utterances
    with unknown characters or digits, missing wav files, or duration > 16 s.

    Fix vs. original: the output file handle is now closed, so the manifest
    is reliably flushed to disk.

    NOTE(review): `filter()` returning a string and `.decode("utf-8")` on str
    imply Python 2.
    """
    # Known-character vocabulary, one "char,<id>" row per line.
    d = set()
    for i, line in enumerate(open("resources/unicodemap_zi.csv").readlines()):
        d.add(line.rsplit(",", 1)[0])
    out_file = open('/export/fanlu/WAVE.json', 'w')
    script = glob.glob("/export/fanlu/SCRIPT/*.TXT")
    for s in script:
        # NOTE(review): rsplit("/", 1)[0] is the DIRECTORY part of the path,
        # so `name` is carved out of "/export/fanlu/SCRIPT" rather than the
        # script file name; this looks like it should have been
        # rsplit("/", 1)[1][:-4]. Kept as-is — verify against the actual
        # SPEAKER* directory layout before relying on it.
        name = s.rsplit("/", 1)[0][1:-5]
        wav_path = "/export/file_server/WAVE/SPEAKER" + name + "/"
        lines = open(s).readlines()
        # Script files alternate: a wav-name line, then its transcript line.
        for i in range(0, len(lines), 2):
            wav_name = lines[i].split("\t")[0]
            # Keep only Chinese characters (Python 2: filter on unicode
            # returns unicode).
            txt = filter(is_chinese, lines[i + 1].strip().decode("utf-8"))
            ps = generate_zi_label(txt)
            if len(ps) == 0:
                continue
            # Reject the utterance if any character is unknown or a digit.
            flag = False
            for p in ps:
                if p not in d or p.isdigit():
                    print("not in d is %s %s. %s" % (p, [p], "".join(ps)))
                    flag = True
                    break
            if flag:
                continue
            wav = wav_path + "SESSION0/" + wav_name + ".WAV"
            if not os.path.exists(wav):
                print("%s not exist" % wav)
                continue
            duration = get_duration_wave(wav)
            if duration > 16:
                print("%s longer than 16s" % wav)
                continue
            line = "{\"key\":\"" + wav.replace(
                "file_server", "aiplatform") + "\", \"duration\": " + str(
                    duration) + ", \"text\":\"" + " ".join(ps) + "\"}"
            out_file.write(line + "\n")
    out_file.close()  # fix: was never closed in the original
'spd': random.randint(4, 6), # 语速,取值0-9,默认为5中语速 'pit': random.randint(4, 7), # 音调,取值0-9,默认为5中语调 'vol': random.randint(4, 7), # 音量,取值0-15,默认为5中音量 'per': random.randint( 0, 3) # 发音人选择, 0为女声,1为男声,3为情感合成-度逍遥,4为情感合成-度丫丫(不好),默认为普通女 } file1, d = line.strip().split(" ", 1) if file1 < 'BAC009S0208W0465': continue path = os.path.join( _data_path, "data_aishell/baidu/", file1[6:11], "%s_%s_%s_%s_%s.%s" % (file1, cfg.get("spd"), cfg.get("pit"), cfg.get("vol"), cfg.get("per"), "mp3")) gen_wav(d, cfg, path) AudioSegment.from_mp3(path).export(path[:-3] + "wav", format="wav") ps = generate_zi_label(d) lin = "{\"key\":\"" + path + "\", \"duration\": " + str( get_duration_wave(path[:-3] + "wav")) + ", \"text\":\"" + " ".join( ps).decode("utf-8") + "\"}" out_file2.write(lin + "\n") if (i + 1) % 100 == 0: print(i) out_file2.flush() out_file2.close() # 从URL获取文件识别 # aipSpeech.asr('', 'pcm', 16000, { # 'url': 'http://121.40.195.233/res/16k_test.pcm', # 'callback': 'http://xxx.com/receive', # })
def prepare_minibatch_fbank(self,
                            audio_paths,
                            texts,
                            overwrite=False,
                            is_bi_graphemes=False,
                            seq_length=-1,
                            save_feature_as_csvfile=False,
                            language="en",
                            zh_type="zi",
                            noise_percent=0.4):
    """Featurize a minibatch of audio with filterbank features, zero-pad the
    inputs and encode the labels, returning everything as one dictionary.

    Params:
        audio_paths (list(str)): List of paths to audio files.
        texts (list(str)): List of texts corresponding to the audio files.
        overwrite (bool): Forwarded to featurize_fbank.
        is_bi_graphemes (bool): English only — encode labels as bi-graphemes.
        seq_length (int): Fixed time dimension for padding; -1 means use
            self.max_seq_length.
        save_feature_as_csvfile (bool): Forwarded to featurize_fbank.
        language (str): "en" or "zh".
        zh_type (str): Chinese label unit — "phone", "py" (pinyin) or "zi"
            (character).
        noise_percent (float): Forwarded to featurize_fbank.
    Returns:
        dict: See the keys documented on the return statement below.
    """
    assert len(audio_paths) == len(texts), \
        "Inputs and outputs to the network must be of the same number"
    # Each feature is a (channel(3), timesteps, feature_dim(41)) array — see
    # the unpack of features[0].shape below. (The original comment's
    # "(timesteps, feature_dim(161))" described an older layout.)
    features = [
        self.featurize_fbank(
            a,
            overwrite=overwrite,
            save_feature_as_csvfile=save_feature_as_csvfile,
            noise_percent=noise_percent,
            seq_length=seq_length) for a in audio_paths
    ]
    # shape[1] is the time axis, i.e. the unpadded length of each clip.
    input_lengths = [f.shape[1] for f in features]
    channel, timesteps, feature_dim = features[0].shape
    mb_size = len(features)
    # Pad all the inputs so that they are all the same length.
    if seq_length == -1:
        x = np.zeros((mb_size, channel, self.max_seq_length, feature_dim))
    else:
        x = np.zeros((mb_size, channel, seq_length, feature_dim))
    y = np.zeros((mb_size, self.max_label_length))
    labelUtil = LabelUtil()
    label_lengths = []
    for i in range(mb_size):
        feat = features[i]
        feat = self.normalize_fbank(feat)  # Center using means and std
        # Copy the clip into the front of the padded buffer; the tail stays
        # zero (original open question: pad with 0 or with noise?).
        x[i, :, :feat.shape[1], :] = feat
        if language == "en" and is_bi_graphemes:
            label = generate_bi_graphemes_label(texts[i])
            label = labelUtil.convert_bi_graphemes_to_num(label)
            y[i, :len(label)] = label
        elif language == "en" and not is_bi_graphemes:
            label = labelUtil.convert_word_to_num(texts[i])
            # NOTE(review): pads by len(texts[i]) here, not len(label) as in
            # every other branch — confirm the two lengths always agree.
            y[i, :len(texts[i])] = label
        elif language == "zh" and zh_type == "phone":
            label = generate_phone_label(texts[i])
            label = labelUtil.convert_bi_graphemes_to_num(label)
            y[i, :len(label)] = label
        elif language == "zh" and zh_type == "py":
            label = generate_py_label(texts[i])
            label = labelUtil.convert_bi_graphemes_to_num(label)
            y[i, :len(label)] = label
        elif language == "zh" and zh_type == "zi":
            label = generate_zi_label(texts[i])
            label = labelUtil.convert_bi_graphemes_to_num(label)
            y[i, :len(label)] = label
        # NOTE(review): if no branch above matches (unexpected language /
        # zh_type combination), `label` is stale from the previous iteration
        # or unbound on the first — a NameError waiting to happen.
        label_lengths.append(len(label))
    return {
        'x': x,  # 0-padded features, shape (mb_size, channel, timesteps, feature_dim)
        'y': y,  # 0-padded numeric label matrix, shape (mb_size, max_label_length)
        'texts': texts,  # list(str) Original texts
        'input_lengths': input_lengths,  # list(int) Length of each input
        'label_lengths': label_lengths,  # list(int) Length of each label
    }