Example #1
def train_2_baidu():
    # Synthesize AISHELL transcripts with Baidu TTS (gen_wav), convert the mp3
    # output to wav, and write a JSON-lines manifest of key/duration/text records.
    out_file2 = codecs.open(_data_path + "/wav/S0002.txt",
                            'w',
                            encoding="utf-8")
    for i, line in enumerate(
            open(_data_path +
                 "/resource_aishell/aishell_transcript_v0.8.txt").readlines()):
        # cfg = {
        #     'spd': random.randint(4, 6),  # speaking speed, 0-9, default 5 (medium)
        #     'pit': random.randint(4, 7),  # pitch, 0-9, default 5 (medium)
        #     'vol': random.randint(4, 7),  # volume, 0-15, default 5 (medium)
        #     'per': random.randint(0, 3)  # voice: 0 = female, 1 = male, 3 = emotional 度逍遥, 4 = emotional 度丫丫 (poor); default is the plain female voice
        # }
        file1, d = line.strip().split(" ", 1)
        if file1 >= "BAC009S0003W0121":
            continue
        cfg = {'spd': 5, 'pit': 5, 'vol': 5, 'per': 0}
        # d = json.loads(line.strip())
        path = os.path.join(
            _data_path, "wav", file1[6:11],
            "%s_%s_%s_%s_%s.%s" % (file1, cfg.get("spd"), cfg.get("pit"),
                                   cfg.get("vol"), cfg.get("per"), "mp3"))
        gen_wav(d, cfg, path)
        AudioSegment.from_mp3(path).export(path[:-3] + "wav", format="wav")
        ps = generate_zi_label(d)
        lin = "{\"key\":\"" + path[:-3] + "wav" + "\", \"duration\": " + str(
            get_duration_wave(path[:-3] + "wav")) + ", \"text\":\"" + " ".join(
                ps).decode("utf-8") + "\"}"
        out_file2.write(lin + "\n")
        if (i + 1) % 100 == 0:
            out_file2.flush()
    out_file2.close()
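
The manifest line above is assembled by manual string concatenation plus a Python 2 .decode("utf-8"). A minimal sketch of the same record built with json.dumps (make_manifest_line is an illustrative name, not part of the project); ensure_ascii=False keeps the Chinese characters readable, as check_biaozhu in Example #7 already does:

import json

def make_manifest_line(wav_path, tokens, duration):
    # json.dumps handles the quoting and escaping that manual concatenation can miss
    record = {"key": wav_path, "duration": duration, "text": " ".join(tokens)}
    return json.dumps(record, ensure_ascii=False)

# e.g. make_manifest_line("/wav/S0002/x.wav", ["你", "好"], 1.23)
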
Example #2
def xiaoshuo_2_word():
    # Build a JSON-lines manifest for the 8 kHz xiaoshuo recordings, keeping
    # only utterances whose characters are all in the known vocabulary.
    d = set()
    for i, line in enumerate(open("resources/unicodemap_zi.csv").readlines()):
        d.add(line.rsplit(",", 1)[0])

    DIR = "/export/aiplatform/8k/"
    out_file = open(DIR + 'resulttxtnew26.json', 'w')
    for i in glob.glob(DIR + "resulttxtnew26/*/*.wav"):
        txt = "".join([
            line.strip()
            for line in open(i.replace("8k/", "")[:-3] + "txt").readlines()
        ])
        txt = strQ2B(txt.strip().decode("utf8")).encode("utf8")
        ps = generate_zi_label(deletePunc(txt))
        if len(ps) == 0:
            continue
        flag = False
        for p in ps:
            if p not in d or p.isdigit():
                print("not in d is %s %s. %s" % (p, [p], "".join(ps)))
                flag = True
                break
        if flag:
            continue
        duration = get_duration_wave(i)
        if duration > 16:
            continue
        line = "{\"key\":\"" + i + "\", \"duration\": " + str(
            duration) + ", \"text\":\"" + " ".join(ps) + "\"}"
        out_file.write(line + "\n")
    out_file.close()
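
Several of these functions call get_duration_wave, which is defined elsewhere in the project. A sketch consistent with the commented-out code in Example #8 (getnframes / getframerate from the standard wave module); the helper name here is illustrative:

import wave

def wav_duration(wav_path):
    # Duration in seconds read from the WAV header.
    audio = wave.open(wav_path, "rb")
    try:
        return float(audio.getnframes()) / audio.getframerate()
    finally:
        audio.close()
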
Example #3
def client_2_word():
    # Pair the audio paths in wav.txt with the transcripts in label.txt and
    # write a JSON-lines manifest.
    DIR = "/export/aiplatform/client_files4/"

    def compare(x, y):
        stat_x = os.stat(DIR + "/" + x)
        stat_y = os.stat(DIR + "/" + y)
        if stat_x.st_ctime < stat_y.st_ctime:
            return -1
        elif stat_x.st_ctime > stat_y.st_ctime:
            return 1
        else:
            return 0

    # iterms = os.listdir(DIR)

    # iterms.sort(compare)

    # for iterm in iterms:
    #    print(iterm)
    wavs = open(DIR + 'wav.txt').readlines()
    labels = open(DIR + 'label.txt').readlines()
    out_file = open(DIR + 'client4.json', 'w')
    for i, (path, txt) in enumerate(zip(wavs, labels)):
        ps = generate_zi_label(
            txt.replace(",", "").replace("。", "").replace(",", "").strip())
        audio_path = DIR + path.strip()
        duration = get_duration_wave(audio_path)
        line = "{\"key\":\"" + audio_path + "\", \"duration\": " + str(
            duration) + ", \"text\":\"" + " ".join(ps) + "\"}"
        out_file.write(line + "\n")
    out_file.close()
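
The commented-out iterms.sort(compare) uses the Python 2 comparator API. Under Python 3 the same ordering by creation time can be written with a key function; a sketch assuming the same DIR and a comparator like the one above:

import os
from functools import cmp_to_key

DIR = "/export/aiplatform/client_files4/"

def compare(x, y):
    # Python 2-style comparator ordering directory entries by creation time.
    cx = os.stat(os.path.join(DIR, x)).st_ctime
    cy = os.stat(os.path.join(DIR, y)).st_ctime
    return (cx > cy) - (cx < cy)

iterms = sorted(os.listdir(DIR), key=cmp_to_key(compare))
# or, without a comparator at all:
iterms = sorted(os.listdir(DIR), key=lambda n: os.stat(os.path.join(DIR, n)).st_ctime)
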
Example #4
def gen_label():
    f = open(_data_path + "001_baidu.txt")
    # wfobj = codecs.open(_data_path + "001_baidu_2.txt", 'w', encoding="utf-8")
    wfobj = open(_data_path + "001_baidu_2.txt", 'w')
    for i, line in enumerate(f.readlines()):
        if not line.strip():
            continue
        newi = generate_zi_label(deletePunc(line.strip()))
        wfobj.write(" ".join(newi) + "\n")
        if i % 10 == 0:
            wfobj.flush()
    wfobj.close()
Example #5
def search_2_word():
    tran = open("/export/aiplatform/search/transcript").readlines()
    out_file = open('resources/search.json', 'w')
    for t in tran:
        path, d, txt = t.split(" ", 2)
        ps = generate_zi_label(txt.strip())
        audio_path = "/export/aiplatform/search/" + "wav/" + path
        duration = get_duration_wave(audio_path)
        line = "{\"key\":\"" + audio_path + "\", \"duration\": " + str(
            duration) + ", \"text\":\"" + " ".join(ps) + "\"}"
        out_file.write(line + "\n")
    out_file.close()
Example #6
def aia_2_word(DIR):
    # Walk the .scp transcript lists under DIR, write manifest entries for wavs
    # that exist and are at most 16 s long, and divert utterances containing
    # out-of-vocabulary characters to a separate .miss.json file.
    scp = [i for i in glob.glob(DIR + "/*/*.scp") if "noise" not in i]
    dir_name = DIR.rsplit("/", 1)[1]
    out_file = codecs.open(_data_path + 'fanlu/' + dir_name + '.json',
                           'w',
                           encoding="utf-8")
    d = set()
    e = set()
    for i, line in enumerate(open("resources/unicodemap_zi.csv").readlines()):
        d.add(line.rsplit(",", 1)[0])
    out_file2 = codecs.open(_data_path + 'fanlu/' + dir_name + '.miss.json',
                            'w',
                            encoding="utf-8")
    for j in scp:
        for m, line in enumerate(open(j).readlines()):
            # print(line)
            file_name, txt = line.strip().split("\t", 1)
            path = "/export/fanlu/" + '16k/' + dir_name + "/" + j.rsplit(
                "/", 1)[1].replace(".scp", "") + "/" + file_name + ".wav"
            if not os.path.exists(path):
                print("%s not exist" % path)
                continue
            duration = get_duration_wave(path)
            if duration > 16:
                print("%s longer than 16s" % path)
                continue
            txt = strQ2B(txt.strip().decode("utf8")).encode("utf8")
            ps = generate_zi_label(deletePunc(txt))
            if len(ps) == 0:
                continue
            line = "{\"key\":\"" + path.replace(
                "fanlu", "aiplatform") + "\", \"duration\": " + str(
                    duration) + ", \"text\":\"" + " ".join(
                        [p.decode("utf-8") for p in ps]) + "\"}"
            flag = False
            for p in ps:
                if p not in d:
                    e.add(p)
                    print("not in d is %s %s. %s" % (p, [p], "".join(ps)))
                    flag = True
                    break
            if flag:
                out_file2.write(line + "\n")
            else:
                out_file.write(line + "\n")
    out_file.close()
    out_file2.close()
    out_file1 = open(_data_path + 'fanlu/' + dir_name + '.miss', 'w')
    for i in e:
        out_file1.write(i + "\n")
    out_file1.close()
Example #7
def check_biaozhu():
    # Compare manual and machine transcripts (skipping manual ones that contain
    # the letters A-E), accumulate the character error rate, and export the
    # clean manual labels as a JSON-lines manifest.
    f = _data_path + "bdp1.txt"
    import json
    count = 0
    all = 0
    amount = 0
    import codecs
    wfobj = codecs.open(_data_path + "bdp2.txt", 'w', encoding="utf-8")
    # f2 = open(_data_path + "bdp2.txt", "w")
    for i in open(f).readlines():
        d = json.loads(i.strip())
        manual = d.get("manual", "").encode("utf-8").replace(",", "").replace(
            "。", "").replace(",", "").replace(".", "")
        machine = d.get("machine", "").encode("utf-8").replace(
            ",", "").replace("。", "").replace(",", "").replace(".", "")
        if "A" not in manual and "B" not in manual and "C" not in manual and "D" not in manual and "E" not in manual:
            manuals = generate_zi_label(manual)
            machines = generate_zi_label(machine)
            l_distance = levenshtein_distance(manuals, machines)
            count += l_distance
            all += len(manuals)
            amount += 1
            wav_file = "/export/aiplatform/data_label/task0/" + d.get(
                "name", "")
            duration = get_duration_wave(wav_file)
            if duration > 16:
                continue
            c = {
                "key": wav_file,
                "duration": str(duration),
                "text": " ".join([m.decode("utf-8") for m in manuals])
            }
            # line = "{\"key\":\"" + wav_file + "\", \"duration\": " + str(1) + ", \"text\":\"" + " ".join([m.decode("utf-8") for m in manuals]) + "\"}"
            wfobj.write(json.dumps(c, ensure_ascii=False) + "\n")
    wfobj.close()
    print("amount: %d, error: %d, all: %d, cer: %.4f" %
          (amount, count, all, count / float(all)))
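
levenshtein_distance is imported from the surrounding project; for reference, a standard dynamic-programming edit distance over token lists looks like the sketch below (not necessarily the project's implementation). The printed CER is then the edit distance divided by the reference length, as in the line above.

def edit_distance(ref, hyp):
    # Classic Levenshtein distance between two token sequences.
    prev = list(range(len(hyp) + 1))
    for i, r in enumerate(ref, 1):
        cur = [i] + [0] * len(hyp)
        for j, h in enumerate(hyp, 1):
            cur[j] = min(prev[j] + 1,             # deletion
                         cur[j - 1] + 1,          # insertion
                         prev[j - 1] + (r != h))  # substitution
        prev = cur
    return prev[-1]
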
Example #8
def ai_thchs30_2_word():
    ori_wavs = glob.glob(_data_path + "/thchs30/data_thchs30/8k/data/*.wav")
    out_file = open(_data_path + "/thchs30/data_thchs30/8k/thchs30_data.json",
                    'w')
    for w in ori_wavs:
        path, name = w.rsplit("/", 1)
        rs = open(w.replace("8k", "") + ".trn").readlines()[0].strip()
        ps = generate_zi_label(rs)
        duration = get_duration_wave(w)
        line = "{\"key\":\"" + w + "\", \"duration\": " + str(
            duration) + ", \"text\":\"" + " ".join(ps) + "\"}"
        out_file.write(line + "\n")
        # for w2 in glob.glob(path.replace("data","data_aug")+"/" + name.split(".")[0] + "*.wav"):
        #  audio = wave.open(w2)
        #  duration = float(audio.getnframes()) / audio.getframerate()
        #  audio.close()
        #  line = "{\"key\":\"" + w2 + "\", \"duration\": " + str(duration) + ", \"text\":\"" + " ".join(ps) + "\"}"
        #  out_file.write(line + "\n")
        # print(ps)
    out_file.close()
Example #9
def ai_2_word():
    lines = open(
        _data_path +
        "data_aishell/transcript/aishell_transcript_v0.8.txt").readlines()
    out_file = open(_data_path + "data_aishell/wav8000/aishell_train_8k.json",
                    'w')
    out_file1 = open(
        _data_path + "data_aishell/wav8000/aishell_validation_8k.json", 'w')
    out_file2 = open(_data_path + "data_aishell/wav8000/aishell_test_8k.json",
                     'w')
    for line in lines:
        rs = line.strip().split(" ")
        ps = generate_zi_label("".join(rs[1:]))
        # AISHELL speaker split: <= S0723 train, <= S0763 dev, remainder test.
        if rs[0][6:11] <= "S0723":
            wav = _data_path + "data_aishell/wav8000/train/" + rs[0][
                6:11] + "/" + rs[0] + ".wav"
            # dir = _data_path + "data_aishell/wav/train_aug/" + rs[0][6:11] + "/" + rs[0]
            # for w in glob.glob(dir + "*.wav"):
            duration = get_duration_wave(wav)
            line = "{\"key\":\"" + wav + "\", \"duration\": " + str(
                duration) + ", \"text\":\"" + " ".join(ps) + "\"}"
            out_file.write(line + "\n")
        elif rs[0][6:11] <= "S0763":
            wav = _data_path + "data_aishell/wav8000/dev/" + rs[0][
                6:11] + "/" + rs[0] + ".wav"
            duration = get_duration_wave(wav)
            line = "{\"key\":\"" + wav + "\", \"duration\": " + str(
                duration) + ", \"text\":\"" + " ".join(ps) + "\"}"
            out_file1.write(line + "\n")
        else:
            wav = _data_path + "data_aishell/wav/test/" + rs[0][
                6:11] + "/" + rs[0] + ".wav"
            duration = get_duration_wave(wav)
            line = "{\"key\":\"" + wav + "\", \"duration\": " + str(
                duration) + ", \"text\":\"" + " ".join(ps) + "\"}"
            out_file2.write(line + "\n")
    out_file.close()
    out_file1.close()
    out_file2.close()
Example #10
def deal_wave():
    # Build a manifest from the SCRIPT/*.TXT transcripts and the matching
    # WAVE/SPEAKER*/SESSION0 recordings, keeping in-vocabulary Chinese
    # utterances no longer than 16 s.
    d = set()
    for i, line in enumerate(open("resources/unicodemap_zi.csv").readlines()):
        d.add(line.rsplit(",", 1)[0])
    out_file = open('/export/fanlu/WAVE.json', 'w')
    script = glob.glob("/export/fanlu/SCRIPT/*.TXT")
    for s in script:
        name = s.rsplit("/", 1)[1][1:-5]  # speaker id taken from the SCRIPT file name
        wav_path = "/export/file_server/WAVE/SPEAKER" + name + "/"
        lines = open(s).readlines()
        for i in range(0, len(lines), 2):
            wav_name = lines[i].split("\t")[0]
            txt = filter(is_chinese, lines[i + 1].strip().decode("utf-8"))
            ps = generate_zi_label(txt)
            if len(ps) == 0:
                continue
            flag = False
            for p in ps:
                if p not in d or p.isdigit():
                    print("not in d is %s %s. %s" % (p, [p], "".join(ps)))
                    flag = True
                    break
            if flag:
                continue
            wav = wav_path + "SESSION0/" + wav_name + ".WAV"
            if not os.path.exists(wav):
                print("%s not exist" % wav)
                continue
            duration = get_duration_wave(wav)
            if duration > 16:
                print("%s longer than 16s" % wav)
                continue
            line = "{\"key\":\"" + wav.replace(
                "file_server", "aiplatform") + "\", \"duration\": " + str(
                    duration) + ", \"text\":\"" + " ".join(ps) + "\"}"
            out_file.write(line + "\n")
    out_file.close()
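
is_chinese and strQ2B come from the surrounding project. A minimal predicate for the common case, keeping only characters in the CJK Unified Ideographs block (an assumption about what the project's helper does; the name is illustrative):

def is_chinese_char(ch):
    # True for characters in the CJK Unified Ideographs block (U+4E00-U+9FFF).
    return u'\u4e00' <= ch <= u'\u9fff'

# usage, mirroring the filter(...) call above:
# txt = "".join(c for c in line if is_chinese_char(c))
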
Example #11
        cfg = {
            'spd': random.randint(4, 6),  # speaking speed, 0-9, default 5 (medium)
            'pit': random.randint(4, 7),  # pitch, 0-9, default 5 (medium)
            'vol': random.randint(4, 7),  # volume, 0-15, default 5 (medium)
            'per': random.randint(
                0, 3)  # voice: 0 = female, 1 = male, 3 = emotional 度逍遥, 4 = emotional 度丫丫 (poor); default is the plain female voice
        }
        file1, d = line.strip().split(" ", 1)
        if file1 < 'BAC009S0208W0465':
            continue
        path = os.path.join(
            _data_path, "data_aishell/baidu/", file1[6:11],
            "%s_%s_%s_%s_%s.%s" % (file1, cfg.get("spd"), cfg.get("pit"),
                                   cfg.get("vol"), cfg.get("per"), "mp3"))
        gen_wav(d, cfg, path)
        AudioSegment.from_mp3(path).export(path[:-3] + "wav", format="wav")
        ps = generate_zi_label(d)
        lin = "{\"key\":\"" + path + "\", \"duration\": " + str(
            get_duration_wave(path[:-3] + "wav")) + ", \"text\":\"" + " ".join(
                ps).decode("utf-8") + "\"}"
        out_file2.write(lin + "\n")
        if (i + 1) % 100 == 0:
            print(i)
            out_file2.flush()
    out_file2.close()

    # Recognition of a file fetched from a URL
    # aipSpeech.asr('', 'pcm', 16000, {
    #     'url': 'http://121.40.195.233/res/16k_test.pcm',
    #     'callback': 'http://xxx.com/receive',
    # })
Example #12
 def prepare_minibatch_fbank(self,
                             audio_paths,
                             texts,
                             overwrite=False,
                             is_bi_graphemes=False,
                             seq_length=-1,
                             save_feature_as_csvfile=False,
                             language="en",
                             zh_type="zi",
                             noise_percent=0.4):
     """ Featurize a minibatch of audio, zero pad them and return a dictionary
     Params:
         audio_paths (list(str)): List of paths to audio files
         texts (list(str)): List of texts corresponding to the audio files
     Returns:
         dict: See below for contents
     """
     assert len(audio_paths) == len(texts), \
         "Inputs and outputs to the network must be of the same number"
     # `features` is a list of per-clip fbank arrays of shape
     # (channel(3), timesteps, feature_dim(41)); the spectrogram variant
     # uses (timesteps, feature_dim(161)) instead.
     # Compute the filterbank features for each audio clip.
     features = [
         self.featurize_fbank(
             a,
             overwrite=overwrite,
             save_feature_as_csvfile=save_feature_as_csvfile,
             noise_percent=noise_percent,
             seq_length=seq_length) for a in audio_paths
     ]
     input_lengths = [f.shape[1] for f in features]
     channel, timesteps, feature_dim = features[0].shape
     mb_size = len(features)
     # Pad all the inputs so that they are all the same length
     if seq_length == -1:
         x = np.zeros((mb_size, channel, self.max_seq_length, feature_dim))
     else:
         x = np.zeros((mb_size, channel, seq_length, feature_dim))
     y = np.zeros((mb_size, self.max_label_length))
     labelUtil = LabelUtil()
     label_lengths = []
     for i in range(mb_size):
         feat = features[i]
         feat = self.normalize_fbank(feat)  # Center using means and std
         x[i, :, :feat.shape[1], :] = feat  # zero-pad the remaining timesteps (or pad with noise?)
         if language == "en" and is_bi_graphemes:
             label = generate_bi_graphemes_label(texts[i])
             label = labelUtil.convert_bi_graphemes_to_num(label)
             y[i, :len(label)] = label
         elif language == "en" and not is_bi_graphemes:
             label = labelUtil.convert_word_to_num(texts[i])
             y[i, :len(texts[i])] = label
         elif language == "zh" and zh_type == "phone":
             label = generate_phone_label(texts[i])
             label = labelUtil.convert_bi_graphemes_to_num(label)
             y[i, :len(label)] = label
         elif language == "zh" and zh_type == "py":
             label = generate_py_label(texts[i])
             label = labelUtil.convert_bi_graphemes_to_num(label)
             y[i, :len(label)] = label
         elif language == "zh" and zh_type == "zi":
             label = generate_zi_label(texts[i])
             label = labelUtil.convert_bi_graphemes_to_num(label)
             y[i, :len(label)] = label
         label_lengths.append(len(label))
     return {
         'x': x,  # 0-padded features, shape (mb_size, channel, timesteps, feat_dim)
         'y': y,  # list(int) Flattened labels (integer sequences)
         'texts': texts,  # list(str) Original texts
         'input_lengths': input_lengths,  # list(int) Length of each input
         'label_lengths': label_lengths,  # list(int) Length of each label
     }
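
The zero-padding step is the core of the batching logic above: every clip's (channel, timesteps, feature_dim) array is copied into a fixed-size buffer and the remaining timesteps stay zero. A self-contained numpy sketch with illustrative shapes (3 channels, 41 filterbanks), not the project's configuration:

import numpy as np

def pad_fbank_batch(features, max_timesteps):
    # features: list of arrays shaped (channel, timesteps, feature_dim)
    channel, _, feature_dim = features[0].shape
    x = np.zeros((len(features), channel, max_timesteps, feature_dim))
    for i, feat in enumerate(features):
        x[i, :, :feat.shape[1], :] = feat  # timesteps beyond this clip stay zero
    return x

# two dummy clips of different lengths padded to 100 frames
batch = pad_fbank_batch([np.ones((3, 50, 41)), np.ones((3, 80, 41))], 100)
print(batch.shape)  # (2, 3, 100, 41)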