Example #1
def main(config, random_seed, dist, apply_normalization, n_pad):
    """
    构建 IRM(Ideal ratio mask)语音增强数据集
    数据集为语句级别,带噪语音和它相应纯净语音的频谱尺寸相同

    Steps:
        1. 加载纯净语音信号
        2. 加载噪声文件
        3. 在纯净语音信号上叠加噪声信号
        4. 计算频谱,mask等
        5. 分别存储带噪语音的频谱与 mask

    Args:
        config (dict): 配置信息
        random_seed (int): 随机种子
        dist (str): 输出结果的目录
        apply_normalization (bool): 是否对 mixture 语音进行规范化
        n_pad (int): mixture 语音中帧的拓展范围,拓展后中心帧对应 mask 中的一帧

    Dataset:
        dataset_1/
            mixture.npy
            mask.npy
        ...

        mixture.npy is {
            "0001_babble_-5": (257, T * (n_pad * 2 + 1)),
            "0001_babble_-10": (257, T * T * (n_pad * 2 + 1))
            ...
        }

        mask.npy is {
            "0001_babble_-5": (257, T),
            "0001_babble_-10": (257, T),
            ...
        }
    """
    np.random.seed(random_seed)
    dist_dir = Path(dist)

    # Iterate over the per-dataset configuration entries in config.json
    for dataset_itx, dataset_cfg in enumerate(config["dataset"], start=1):
        dataset_dir = dist_dir / dataset_cfg["name"]
        prepare_empty_dirs([dataset_dir])
        print("=" * 12 +
              f"Building set {dataset_itx}: {dataset_cfg['name']} set" +
              "=" * 12)

        # Load the clean speech signals into a list
        clean_cfg = dataset_cfg["clean"]
        clean_speech_paths = librosa.util.find_files(
            directory=clean_cfg["database"],
            ext=clean_cfg["ext"],
            recurse=clean_cfg["recurse"],
            limit=clean_cfg["limit"],
            offset=clean_cfg["offset"])
        random.shuffle(clean_speech_paths)
        clean_ys = load_wavs(
            file_paths=clean_speech_paths,
            sr=clean_cfg["sampling_rate"],
            min_sampling=clean_cfg["min_sampling"],
        )
        print("Loaded clean speeches.")

        # Load the noise signals into a dict
        noise_cfg = dataset_cfg["noise"]
        noise_database_dir = Path(noise_cfg["database"])
        noise_ys = {}
        for noise_type in tqdm(noise_cfg["types"], desc="Loading noise files"):
            noise_y, _ = librosa.load(
                (noise_database_dir / (noise_type + ".wav")).as_posix(),
                sr=noise_cfg["sampling_rate"])
            noise_ys[noise_type] = noise_y
        print("Loaded noise.")

        # Synthesize the noisy speech
        mixture_store = {}
        mask_store = {}
        for i, clean in tqdm(enumerate(clean_ys, start=1), desc="Synthesizing noisy speech"):
            num = str(i).zfill(4)
            for snr in dataset_cfg["snr"]:
                for noise_type in noise_ys.keys():
                    basename_text = f"{num}_{noise_type}_{snr}"

                    clean, noise = corrected_the_length_of_noise_and_clean_speech(
                        clean_y=clean, noise_y=noise_ys[noise_type])

                    mixture = add_noise_for_waveform(clean, noise, int(snr))

                    mixture_mag = mag(mixture)
                    clean_mag = mag(clean)
                    noise_mag = mag(noise)

                    if apply_normalization:
                        mixture_mag = input_normalization(mixture_mag)

                    mixture_mag = unfold_spectrum(mixture_mag, n_pad=n_pad)
                    mask = noise_mag / (noise_mag + clean_mag)

                    assert mixture_mag.shape[0] == mask.shape[0] == 257
                    mixture_store[basename_text] = mixture_mag
                    mask_store[basename_text] = mask

        print(f"Synthesize finished,storing file...")
        joblib.dump(mask_store, (dataset_dir / "mask.pkl").as_posix())
        joblib.dump(mixture_store, (dataset_dir / "mixture.pkl").as_posix())
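The helpers mag() and unfold_spectrum() used above are not shown in this example. Below is a minimal sketch of what they might look like, assuming a 512-point STFT (257 frequency bins) and edge padding; the real project helpers may differ.

import numpy as np
import librosa


def mag(y, n_fft=512, hop_length=256):
    # Magnitude spectrogram; n_fft = 512 yields 257 frequency bins (n_fft // 2 + 1).
    return np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length))


def unfold_spectrum(spec, n_pad):
    # Expand each frame to a (2 * n_pad + 1)-frame context by concatenating shifted
    # copies of the spectrogram along the time axis (edge-padded at the borders).
    # Input (257, T) -> output (257, T * (2 * n_pad + 1)), matching the docstring above.
    padded = np.pad(spec, ((0, 0), (n_pad, n_pad)), mode="edge")
    shifted = [padded[:, i:i + spec.shape[1]] for i in range(2 * n_pad + 1)]
    return np.concatenate(shifted, axis=1)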
Example #2
def main(config, random_seed, dist, n_pad):
    """
    构建*频域*上的语音增强数据集(Log Power Spectrum)
    每句带噪语音的时间步上都包含多帧,多帧的中心帧对应这个时间步上的一帧纯净语音
    中心帧前面的时间帧:
    中心帧后面的时间帧:
    TODO 文档等待进一步更新

    Steps:
        1. 加载纯净语音信号
        2. 加载噪声文件
        3. 在纯净语音信号上叠加噪声信号
        4. 分别计算 LPS 特征
        5. 将带噪语音的 LPS 特征进行拓展
        5. 分别存储带噪语音与纯净语音

    Args:
        config (dict): 配置信息
        random_seed (int): 随机种子
        dist (str): 输出结果的目录
        n_pad (int): 带噪语音的拓展大小

    Dataset:
        dataset_1/
            mixture.npy
            clean.npy
        ...

        mixture.npy is {
            "0001_babble_-5": (257 * 3 * , T),
            "0001_babble_-10": (257 * 3, T),
            ...
        }

        clean.npy is {
            "0001": (257, T),
            "0002": (257, T),
            ...
        }
    """
    global clean_lps
    np.random.seed(random_seed)
    dist_dir = Path(dist)

    # Iterate over the per-dataset configuration entries in config.json
    for dataset_itx, dataset_cfg in enumerate(config["dataset"], start=1):
        dataset_dir = dist_dir / dataset_cfg["name"]
        prepare_empty_dirs([dataset_dir])
        print("=" * 12 +
              f"Building set {dataset_itx}: {dataset_cfg['name']} set" +
              "=" * 12)

        # Load the clean speech signals into a list
        clean_cfg = dataset_cfg["clean"]
        clean_speech_paths = librosa.util.find_files(
            directory=clean_cfg["database"],
            ext=clean_cfg["ext"],
            recurse=clean_cfg["recurse"],
            limit=clean_cfg["limit"],
            offset=clean_cfg["offset"])
        random.shuffle(clean_speech_paths)
        clean_ys = load_wavs(
            file_paths=clean_speech_paths,
            sr=clean_cfg["sampling_rate"],
            min_sampling=clean_cfg["min_sampling"],
        )
        print("Loaded clean speeches.")

        # Load the noise signals into a dict
        noise_cfg = dataset_cfg["noise"]
        noise_database_dir = Path(noise_cfg["database"])
        noise_ys = {}
        for noise_type in tqdm(noise_cfg["types"], desc="Loading noise files"):
            noise_y, _ = librosa.load(
                (noise_database_dir / (noise_type + ".wav")).as_posix(),
                sr=noise_cfg["sampling_rate"])
            noise_ys[noise_type] = noise_y
        print("Loaded noise.")

        # Synthesize the noisy speech
        mixture_store = {}
        clean_store = {}
        for i, clean in tqdm(enumerate(clean_ys, start=1), desc="Synthesizing noisy speech"):
            num = str(i).zfill(4)
            for snr in dataset_cfg["snr"]:
                for noise_type in noise_ys.keys():
                    basename_text = f"{num}_{noise_type}_{snr}"

                    clean, noise = corrected_the_length_of_noise_and_clean_speech(
                        clean_y=clean, noise_y=noise_ys[noise_type])

                    mixture = add_noise_for_waveform(clean, noise, int(snr))
                    assert len(mixture) == len(clean) == len(noise)

                    mixture_lps = lps(mixture)
                    clean_lps = lps(clean)
                    mixture_lps = unfold_spectrum(mixture_lps, n_pad=n_pad)

                    assert mixture_lps.shape[0] == clean_lps.shape[0] == 257
                    mixture_store[basename_text] = mixture_lps

            clean_store[num] = clean_lps

        print(f"Synthesize finished,storing file...")
        joblib.dump(clean_store, (dataset_dir / "clean.pkl").as_posix())
        joblib.dump(mixture_store, (dataset_dir / "mixture.pkl").as_posix())
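The lps() feature extractor is also external to this example. A minimal sketch, assuming a 512-point STFT and a small epsilon to avoid log(0); the real helper may differ.

import numpy as np
import librosa


def lps(y, n_fft=512, hop_length=256, eps=1e-8):
    # Log power spectrum: log of the squared STFT magnitude, 257 bins per frame.
    spectrum = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length))
    return np.log(spectrum ** 2 + eps)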
Example #3
def main(config, random_seed, dist):
    """
    构建时域上的语音增强数据集

    Steps:
        1. 加载纯净语音信号
        2. 加载噪声文件
        3. 在纯净语音信号上叠加噪声信号
        4. 分别存储带噪语音与纯净语音

    Args:
        config (dict): 配置信息
        random_seed (int): 随机种子
        dist (str): 输出结果的目录

    Dataset:
        dataset_1/
            mixture.npy
            clean.npy
        ...

        mixture.npy is {
            "0001_babble_-5": [signals, ...],
            "0001_babble_-10": [signals, ...],
            ...
        }

        clean.npy is {
            "0001": [signals, ...],
            "0002": [signals, ...],
            ...
        }
    """
    np.random.seed(random_seed)
    dist_dir = Path(dist)

    # Iterate over the per-dataset configuration entries in config.json
    for dataset_itx, dataset_cfg in enumerate(config["dataset"], start=1):
        dataset_dir = dist_dir / dataset_cfg["name"]
        prepare_empty_dirs([dataset_dir, dataset_dir / "Clean", dataset_dir / "Noisy"])
        print("=" * 12 + f"Building set {dataset_itx}: {dataset_cfg['name']} set" + "=" * 12)

        # Load the clean speech signals into a list
        clean_cfg = dataset_cfg["clean"]
        clean_speech_paths = librosa.util.find_files(
            directory=clean_cfg["database"],
            ext=clean_cfg["ext"],
            recurse=clean_cfg["recurse"],
            limit=clean_cfg["limit"],
            offset=clean_cfg["offset"]
        )
        random.shuffle(clean_speech_paths)
        clean_ys = load_wavs(
            file_paths=clean_speech_paths,
            sr=clean_cfg["sampling_rate"],
            min_sampling=clean_cfg["min_sampling"],
        )
        print("Loaded clean speeches.")

        # Load the noise signals into a dict
        noise_cfg = dataset_cfg["noise"]
        noise_database_dir = Path(noise_cfg["database"])
        noise_ys = {}
        for noise_type in tqdm(noise_cfg["types"], desc="Loading noise files"):
            noise_y, _ = librosa.load(
                (noise_database_dir / (noise_type + ".wav")).as_posix(),
                sr=noise_cfg["sampling_rate"])
            noise_ys[noise_type] = noise_y
        print("Loaded noise.")

        # Synthesize the noisy speech
        n = 0
        for i, clean in tqdm(enumerate(clean_ys, start=1), desc="Synthesizing noisy speech"):
            for snr in dataset_cfg["snr"]:
                for noise_type in noise_ys.keys():
                    clean, noise = corrected_the_length_of_noise_and_clean_speech(
                        clean_y=clean,
                        noise_y=noise_ys[noise_type]
                    )

                    mixture = add_noise_for_waveform(clean, noise, int(snr))
                    assert len(mixture) == len(clean) == len(noise)

                    fname = f"{dataset_cfg['name']}_{n}.wav"
                    librosa.output.write_wav((dataset_dir / "Clean" / fname).as_posix(), clean, sr=16000)
                    librosa.output.write_wav((dataset_dir / "Noisy" / fname).as_posix(), mixture, sr=16000)
                    n += 1
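The mixing helpers shared by all of these scripts are not shown. A rough sketch of the two of them, assuming the noise is tiled when shorter than the speech and scaled to hit the requested SNR in dB; the actual implementations may differ.

import numpy as np


def corrected_the_length_of_noise_and_clean_speech(clean_y, noise_y):
    # Tile the noise if it is shorter than the clean speech, then cut it to the same length.
    if len(noise_y) < len(clean_y):
        noise_y = np.tile(noise_y, int(np.ceil(len(clean_y) / len(noise_y))))
    return clean_y, noise_y[:len(clean_y)]


def add_noise_for_waveform(clean_y, noise_y, snr):
    # Scale the noise so that 10 * log10(P_clean / P_noise) equals the requested SNR (dB),
    # then add it to the clean waveform.
    clean_power = np.mean(clean_y ** 2)
    noise_power = np.mean(noise_y ** 2)
    scale = np.sqrt(clean_power / (noise_power * 10 ** (snr / 10)))
    return clean_y + scale * noise_y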
Example #4
def main(config):
    OUTPUT_DIR = Path(config["output_dir"])
    SAMPLING_RATE = config["sampling_rate"]

    for j, dataset_cfg in enumerate(config["datasets"]):
        print(f"============ Building set {j + 1}: {dataset_cfg['name']} set ============")
        dataset_dir: Path = OUTPUT_DIR / dataset_cfg["name"]
        prepare_empty_dirs([dataset_dir])

        """============ clean speeches ============"""
        clean_meta = dataset_cfg["clean"]
        clean_speech_paths = librosa.util.find_files(
            directory=clean_meta["database"],
            ext=clean_meta["ext"],
            recurse=clean_meta["recurse"],
            limit=None,
            offset=clean_meta["offset"]
        )
        random.shuffle(clean_speech_paths)

        # When loading clean speech, the minimum_sampling parameter enforces a minimum number of samples per utterance.
        # There is no such parameter for noise: if a noise clip is shorter than the speech during synthesis, the noise is tiled before mixing.
        clean_ys = load_wavs(
            file_paths=clean_speech_paths,
            limit=clean_meta["limit"],
            sr=SAMPLING_RATE,
            minimum_sampling=clean_meta["minimum_sampling"],
        )
        print("Loaded clean speeches.")

        """============ noise speeches ============"""
        noise_meta = dataset_cfg["noise"]
        noise_database_dir = Path(noise_meta["database"])
        noise_ys = {}
        for noise_type in tqdm(noise_meta["types"], desc="Loading noise files"):
            noise_y, _ = librosa.load((noise_database_dir / (noise_type + ".wav")).as_posix(), sr=SAMPLING_RATE)
            noise_ys[noise_type] = noise_y

        print("Loaded noise.")

        """============ 合成 ============"""
        # Noisy mixtures
        for i, SNR in enumerate(dataset_cfg["SNRs"]):
            store = {}
            clean_store = {}
            for j, clean_y in tqdm(enumerate(clean_ys, 1), desc="Add noise for clean waveform"):
                for noise_type in noise_ys.keys():
                    output_wav_basename_text = f"{str(j).zfill(4)}_{noise_type}"
                    clean_y, noise_y = corrected_length(
                        clean_y=clean_y,
                        noise_y=noise_ys[noise_type]
                    )

                    noisy_y = add_noise_for_waveform(clean_y, noise_y, int(SNR))

                    assert len(noisy_y) == len(clean_y) == len(noise_y)

                    """
                    SNR == -5 是整个模型的输入,使用 7 帧
                    剩余的信噪比和纯净语音为模型训练的目标,使用 1 帧
                    """
                    if SNR == -5:
                        tmp_lps = torch.Tensor(lps(noisy_y, pad=3).T).unfold(0, 7, 1)
                        store[output_wav_basename_text] = tmp_lps.reshape(tmp_lps.shape[0], -1).numpy()
                    else:
                        store[output_wav_basename_text] = lps(noisy_y).T

                    if i == 0:
                        clean_store[output_wav_basename_text] = lps(clean_y).T

            print(f"Synthesize dB{SNR} finished,storing NPY file...")
            if clean_store:
                print("Saving clean NPY file...")
                np.save((dataset_dir / "clean.npy").as_posix(), clean_store)

            np.save((dataset_dir / f"dB{SNR}.npy").as_posix(), store)
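The -5 dB branch above unfolds the padded LPS into 7-frame context windows with torch. A toy shape check (hypothetical sizes) of what .unfold(0, 7, 1) followed by the reshape produces:

import torch

T, n_bins = 10, 257                   # hypothetical frame count and frequency bins
x = torch.randn(T + 2 * 3, n_bins)    # like lps(noisy_y, pad=3).T: 3 padding frames per side
windows = x.unfold(0, 7, 1)           # (T, 257, 7): one 7-frame window per original frame
flat = windows.reshape(windows.shape[0], -1)
print(windows.shape, flat.shape)      # torch.Size([10, 257, 7]) torch.Size([10, 1799])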
Example #5
def main(config, random_seed, dist):
    """
    构建时域上的语音增强数据集

    Steps:
        1. 加载纯净语音信号
        2. 加载噪声文件
        3. 在纯净语音信号上叠加噪声信号
        4. 分别存储带噪语音与纯净语音

    Args:
        config (dict): 配置信息
        random_seed (int): 随机种子
        dist (str): 输出结果的目录

    Dataset:
        dataset_1/
            mixture.npy
            clean.npy
        ...

        mixture.npy is {
            "0001_babble_-5": [signals, ...],
            "0001_babble_-10": [signals, ...],
            ...
        }

        clean.npy is {
            "0001": [signals, ...],
            "0002": [signals, ...],
            ...
        }
    """
    np.random.seed(random_seed)
    dist_dir = Path(dist)

    # Iterate over the per-dataset configuration entries in config.json
    for dataset_itx, dataset_cfg in enumerate(config["dataset"], start=1):
        dataset_dir = dist_dir / dataset_cfg["name"]
        prepare_empty_dirs([dataset_dir])
        print("=" * 12 +
              f"Building set {dataset_itx}: {dataset_cfg['name']} set" +
              "=" * 12)

        # Load the clean speech signals into a list
        clean_cfg = dataset_cfg["clean"]
        clean_speech_paths = librosa.util.find_files(
            directory=clean_cfg["database"],
            ext=clean_cfg["ext"],
            recurse=clean_cfg["recurse"],
            limit=clean_cfg["limit"],
            offset=clean_cfg["offset"])
        random.shuffle(clean_speech_paths)
        clean_ys = load_wavs(
            file_paths=clean_speech_paths,
            sr=clean_cfg["sampling_rate"],
            min_sampling=clean_cfg["min_sampling"],
        )
        print("Loaded clean speeches.")

        # Load the noise signals into a dict
        noise_cfg = dataset_cfg["noise"]
        noise_database_dir = Path(noise_cfg["database"])
        noise_ys = {}
        for noise_type in tqdm(noise_cfg["types"], desc="Loading noise files"):
            noise_y, _ = librosa.load(
                (noise_database_dir / (noise_type + ".wav")).as_posix(),
                sr=noise_cfg["sampling_rate"])
            noise_ys[noise_type] = noise_y
        print("Loaded noise.")

        # Synthesize the noisy speech
        mixture_store = {}
        clean_store = {}
        for i, clean in tqdm(enumerate(clean_ys, start=1), desc="Synthesizing noisy speech"):
            num = str(i).zfill(4)
            for snr in dataset_cfg["snr"]:
                for noise_type in noise_ys.keys():
                    basename_text = f"{num}_{noise_type}_{snr}"

                    clean, noise = corrected_the_length_of_noise_and_clean_speech(
                        clean_y=clean, noise_y=noise_ys[noise_type])

                    mixture = add_noise_for_waveform(clean, noise, int(snr))
                    assert len(mixture) == len(clean) == len(noise)

                    mixture_store[basename_text] = mixture

            # Several kinds of noisy speech are synthesized from one clean utterance, but only one copy of the clean utterance is stored
            clean_store[num] = clean

        print(f"Synthesize finished,storing file...")
        joblib.dump(clean_store, (dataset_dir / "clean.pkl").as_posix())
        joblib.dump(mixture_store, (dataset_dir / "mixture.pkl").as_posix())
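A minimal sketch of reading the stored pickles back and pairing each mixture with its clean reference, assuming the "<num>_<noise>_<snr>" key format produced above and a hypothetical dataset directory.

import joblib

mixture_store = joblib.load("dataset_1/mixture.pkl")
clean_store = joblib.load("dataset_1/clean.pkl")

for key, mixture_y in mixture_store.items():
    num = key.split("_")[0]        # e.g. "0001" from "0001_babble_-5"
    clean_y = clean_store[num]     # one clean utterance shared by all of its mixtures
    # mixture_y and clean_y are raw waveforms ready for a time-domain model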