def test_drop_freq():
    """Compare every saved ``.flac`` result with its expected counterpart.

    Every file matching ``<output_folder>/save/*.flac`` is loaded and checked
    (with ``allclose``) against the file whose path is obtained by replacing
    ``"results"`` with ``"expected"`` in the result path.
    """
    from glob import glob

    pattern = os.path.join(output_folder, "save", "*.flac")
    for result_path in glob(pattern):
        reference_path = result_path.replace("results", "expected")
        produced = read_audio(result_path)
        reference = read_audio(reference_path)
        assert produced.allclose(reference)
def create_json(wav_list, json_file):
    """
    Writes a json manifest describing a list of wav files.

    Arguments
    ---------
    wav_list : list of str
        Paths of the wav files to describe.
    json_file : str
        Path of the json file to write.
    """
    entries = {}
    for wav_path in wav_list:
        # Length in seconds = number of samples / sample rate
        audio = read_audio(wav_path)
        length = audio.shape[0] / SAMPLERATE

        # The utterance id is the file name without its extension; the stored
        # path keeps the last five path components under "{data_root}".
        components = wav_path.split(os.path.sep)
        utt_id = os.path.splitext(components[-1])[0]
        entries[utt_id] = {
            "wav": os.path.join("{data_root}", *components[-5:]),
            "length": length,
        }

    with open(json_file, mode="w") as out_f:
        json.dump(entries, out_f, indent=2)

    logger.info(f"{json_file} successfully created!")
def create_json(wav_lst, json_file, clean_folder, txt_folder, lexicon):
    """
    Writes a json manifest with noisy/clean paths, words and phones.

    Arguments
    ---------
    wav_lst : list
        The list of (noisy) wav files.
    json_file : str
        The path of the output json file.
    clean_folder : str
        The location of parallel clean samples.
    txt_folder : str
        The location of the transcript files (``<snt_id>.txt``).
    lexicon : mapping
        Indexed with each upper-cased transcript word; every value is split
        on whitespace to obtain the phones of that word.
    """
    logger.debug(f"Creating json lists in {json_file}")

    entries = {}
    for noisy_file in wav_lst:
        # Example noisy_file: <...>/<noisy_dir>/p232_001.wav
        noisy_parent, base_name = os.path.split(noisy_file)
        noisy_dir_name = os.path.split(noisy_parent)[1]
        clean_dir_name = os.path.split(clean_folder)[1]

        # Length in seconds = number of samples / sample rate
        audio = read_audio(noisy_file)
        length = audio.shape[0] / SAMPLERATE

        # Transcript: punctuation removed, stripped, upper-cased
        utt_id = base_name.replace(".wav", "")
        with open(os.path.join(txt_folder, utt_id + ".txt")) as txt_f:
            raw_text = txt_f.read()
        text = remove_punctuation(raw_text).strip().upper()

        # Expand every word into its phones, then collapse runs of
        # identical consecutive phones into a single phone.
        expanded = []
        for word in text.split():
            expanded.extend(lexicon[word].split())
        collapsed = []
        for phone in expanded:
            if not collapsed or collapsed[-1] != phone:
                collapsed.append(phone)

        entries[utt_id] = {
            "noisy_wav": os.path.join("{data_root}", noisy_dir_name, base_name),
            "clean_wav": os.path.join("{data_root}", clean_dir_name, base_name),
            "length": length,
            "words": text,
            "phones": " ".join(collapsed),
        }

    with open(json_file, mode="w") as out_f:
        json.dump(entries, out_f, indent=2)

    logger.info(f"{json_file} successfully created!")
def generate_silence_data(
    num_known_samples_per_split, splits, data_folder, percentage_silence=26
):
    """Generates silence samples.

    For every split, random 16000-sample windows are drawn from each wav file
    found in ``<data_folder>/_background_noise_`` and appended, as "silence"
    entries, to the lists stored in ``splits[split]``.

    Arguments
    ---------
    num_known_samples_per_split : dict
        Indexed by split name; total number of samples of known words for
        each split (i.e. set).
    splits : dict
        Indexed by split name (training, validation and test sets). Each value
        holds the lists "ID", "duration", "start", "stop", "wav", "spk_id",
        "command" and "transcript", which are extended in place.
    data_folder : str
        Path to dataset.
    percentage_silence : int
        How many silence samples to generate; relative to the total number of
        known words.

    Raises
    ------
    ZeroDivisionError
        If ``splits`` is not empty and no background-noise wav file is found.
    """
    # The set of background-noise files does not depend on the split, so the
    # file system is searched only once.
    search_path = os.path.join(data_folder, "_background_noise_", "*.wav")
    silence_paths = glob.glob(search_path)

    for split in splits:
        num_silence_samples = int(
            (percentage_silence / 100.0) * num_known_samples_per_split[split]
        )

        # Generate random silence samples
        # Assumes that the pytorch seed has been defined in the HyperPyYaml file
        num_silence_samples_per_path = int(
            num_silence_samples / len(silence_paths)
        )
        for silence_path in silence_paths:
            signal = read_audio(silence_path)
            random_starts = (
                (
                    torch.rand(num_silence_samples_per_path)
                    * (signal.shape[0] - 16001)
                )
                .type(torch.int)
                .tolist()
            )

            # Path starting at "_background_noise_" (everything before it is
            # dropped); it is the same for all samples drawn from this file.
            relative_path = re.sub(
                r".+?(?=_background_noise_)", "", silence_path
            )

            target = splits[split]
            for i, random_start in enumerate(random_starts):
                # Only the trailing ".wav" extension is replaced: the dot is
                # escaped and the pattern anchored so that no other part of
                # the path can match.
                target["ID"].append(
                    re.sub(
                        r"\.wav$",
                        "/" + str(random_start) + "_" + str(i),
                        relative_path,
                    )
                )
                target["duration"].append(1.0)
                target["start"].append(random_start)
                target["stop"].append(random_start + 16000)
                target["wav"].append(silence_path)
                target["spk_id"].append(None)
                target["command"].append("silence")
                target["transcript"].append(None)
def test_read_audio(tmpdir):
    """Check that ``read_audio`` returns the requested segment of a file.

    A random 16000-sample waveform is written to ``wave.wav`` in ``tmpdir``;
    three random ``start``/``stop`` annotations are then read back and
    compared with the matching slice of the waveform.
    """
    from speechbrain.dataio.dataio import read_audio, write_audio

    reference = torch.rand(16000)
    wav_path = os.path.join(tmpdir, "wave.wav")
    write_audio(wav_path, reference, 16000)

    # dummy annotations
    for _ in range(3):
        begin = torch.randint(0, 8000, (1,)).item()
        end = begin + torch.randint(500, 1000, (1,)).item()
        annotation = {"wav": {"file": wav_path, "start": begin, "stop": end}}
        segment = read_audio(annotation["wav"])
        assert segment.allclose(reference[begin:end], atol=1e-4)
def generalized_eigenvalue(audio_file, diffuse=True, show_plots=False):
    """Run the STFT -> Covariance -> Gev -> ISTFT chain on an audio file.

    Arguments
    ---------
    audio_file : str
        Path of the audio file, loaded with ``read_audio``.
    diffuse : bool
        Accepted for interface compatibility; not read by this function.
    show_plots : bool
        If True, four matplotlib figures (input/output spectrograms and
        waveforms) are drawn and ``plt.show()`` is called.

    Returns
    -------
    The output of the ISTFT, squeezed.
    """
    waveform = read_audio(audio_file).unsqueeze(0)

    stft = STFT(sample_rate=fs)
    cov = Covariance()
    gev = Gev()
    istft = ISTFT(sample_rate=fs)

    spectrum = stft(waveform)
    # Both covariance arguments of Gev are computed from the same STFT.
    speech_cov = cov(spectrum)
    noise_cov = cov(spectrum)
    beamformed_spectrum = gev(spectrum, speech_cov, noise_cov)
    beamformed = istft(beamformed_spectrum)

    if show_plots:

        def _log_power_image(spec):
            # Log of (index 0)^2 + (index 1)^2 along the 4th axis, first item
            # of the first and last axes, transposed for display.
            power = spec[0, :, :, 0, 0] ** 2 + spec[0, :, :, 1, 0] ** 2
            return torch.transpose(torch.log(power), 1, 0)

        plt.figure(1)
        plt.title("Noisy signal at microphone 1")
        plt.imshow(_log_power_image(spectrum), origin="lower")

        plt.figure(2)
        plt.title("Noisy signal at microphone 1")
        plt.plot(waveform.squeeze()[:, 0])

        plt.figure(3)
        plt.title("Beamformed signal")
        plt.imshow(_log_power_image(beamformed_spectrum), origin="lower")

        plt.figure(4)
        plt.title("Beamformed signal")
        plt.plot(beamformed.squeeze())

        plt.show()

    return beamformed.squeeze()
def create_json(wav_lst, json_file, clean_folder):
    """
    Writes a json manifest pairing each noisy wav with its clean version.

    Arguments
    ---------
    wav_lst : list
        The list of (noisy) wav files.
    json_file : str
        The path of the output json file.
    clean_folder : str
        The location of parallel clean samples.
    """
    logger.debug(f"Creating json lists in {json_file}")

    entries = {}
    for noisy_file in wav_lst:
        # Example noisy_file: <...>/<noisy_dir>/p232_001.wav
        noisy_parent, base_name = os.path.split(noisy_file)
        noisy_dir_name = os.path.split(noisy_parent)[1]
        clean_dir_name = os.path.split(clean_folder)[1]

        # Length in seconds = number of samples / sample rate
        audio = read_audio(noisy_file)
        length = audio.shape[0] / SAMPLERATE

        # The sentence id is the file name without ".wav"
        utt_id = base_name.replace(".wav", "")
        entries[utt_id] = {
            "noisy_wav": os.path.join("{data_root}", noisy_dir_name, base_name),
            "clean_wav": os.path.join("{data_root}", clean_dir_name, base_name),
            "length": length,
        }

    with open(json_file, mode="w") as out_f:
        json.dump(entries, out_f, indent=2)

    logger.info(f"{json_file} successfully created!")
def delay_and_sum(audio_file, show_plots=False):
    """Run the STFT -> Covariance -> GccPhat -> DelaySum -> ISTFT chain.

    Arguments
    ---------
    audio_file : str
        Path of the audio file, loaded with ``read_audio``.
    show_plots : bool
        If True, four matplotlib figures (input/output spectrograms and
        waveforms) are drawn. This function does not call ``plt.show()``.

    Returns
    -------
    The output of the ISTFT, squeezed.
    """
    waveform = read_audio(audio_file).unsqueeze(0)

    stft = STFT(sample_rate=fs)
    cov = Covariance()
    gccphat = GccPhat()
    delaysum = DelaySum()
    istft = ISTFT(sample_rate=fs)

    spectrum = stft(waveform)
    covariance = cov(spectrum)
    tdoas = gccphat(covariance)
    beamformed_spectrum = delaysum(spectrum, tdoas)
    beamformed = istft(beamformed_spectrum)

    if show_plots:

        def _log_power_image(spec):
            # Log of (index 0)^2 + (index 1)^2 along the 4th axis, first item
            # of the first and last axes, transposed for display.
            power = spec[0, :, :, 0, 0] ** 2 + spec[0, :, :, 1, 0] ** 2
            return torch.transpose(torch.log(power), 1, 0)

        plt.figure(1)
        plt.title("Noisy signal at microphone 1")
        plt.imshow(_log_power_image(spectrum), origin="lower")

        plt.figure(2)
        plt.title("Noisy signal at microphone 1")
        plt.plot(waveform.squeeze()[:, 0])

        plt.figure(3)
        plt.title("Beamformed signal")
        plt.imshow(_log_power_image(beamformed_spectrum), origin="lower")

        plt.figure(4)
        plt.title("Beamformed signal")
        plt.plot(beamformed.squeeze())

    return beamformed.squeeze()
def create_json(
    wav_lst, json_file, uppercase, phn_set,
):
    """
    Creates the json file given a list of wav files.

    Arguments
    ---------
    wav_lst : list
        The list of wav files of a given data split.
    json_file : str
        The path of the output json file.
    uppercase : bool
        Whether this is the uppercase version of timit (file extensions are
        then ".WAV", ".WRD" and ".PHN").
    phn_set : {60, 48, 39}
        The phoneme set to use in the phn label; forwarded to
        ``get_phoneme_lists``.

    Raises
    ------
    FileNotFoundError
        If the word (.wrd) or phoneme (.phn) file of an utterance is missing.
    """
    logger.info(f"Creating {json_file}...")

    # Extensions of the audio, word and phoneme files for this TIMIT flavour
    if uppercase:
        wav_ext, wrd_ext, phn_ext = ".WAV", ".WRD", ".PHN"
    else:
        wav_ext, wrd_ext, phn_ext = ".wav", ".wrd", ".phn"

    json_dict = {}
    for wav_file in wav_lst:

        # Getting sentence and speaker ids
        # NOTE(review): only a lowercase ".wav" is stripped here, so with
        # uppercase=True the id keeps its ".WAV" suffix — confirm whether
        # downstream code relies on that before changing it.
        path_parts = wav_file.split("/")
        spk_id = path_parts[-2]
        snt_id = spk_id + "_" + path_parts[-1].replace(".wav", "")

        # Reading the signal (to retrieve duration in seconds)
        signal = read_audio(wav_file)
        duration = len(signal) / SAMPLERATE

        # Retrieving words (third space-separated field of every line).
        # The file itself is checked, not just its directory, and it is
        # closed as soon as it has been read.
        wrd_file = wav_file.replace(wav_ext, wrd_ext)
        if not os.path.exists(wrd_file):
            err_msg = "the wrd file %s does not exists!" % (wrd_file)
            raise FileNotFoundError(err_msg)

        with open(wrd_file) as wrd_f:
            words = " ".join(
                line.rstrip("\n").split(" ")[2] for line in wrd_f
            )

        # Retrieving phonemes
        phn_file = wav_file.replace(wav_ext, phn_ext)
        if not os.path.exists(phn_file):
            err_msg = "the phn file %s does not exists!" % (phn_file)
            raise FileNotFoundError(err_msg)

        # Getting the phoneme and ground truth ends lists from the phn files
        phonemes, ends = get_phoneme_lists(phn_file, phn_set)

        json_dict[snt_id] = {
            "wav": wav_file,
            "duration": duration,
            "spk_id": spk_id,
            "phn": phonemes,
            "wrd": words,
            "ground_truth_phn_ends": ends,
        }

    # Writing the dictionary to the json file
    with open(json_file, mode="w") as json_f:
        json.dump(json_dict, json_f, indent=2)

    logger.info(f"{json_file} successfully created!")
def create_json(metadata, audio_data_folder, folds_list, json_file):
    """
    Creates the json file for the samples belonging to the given folds.

    Arguments
    ---------
    metadata : dict
        A dictionary containing the UrbanSound8k metadata file modified for the
        SpeechBrain, such that keys are IDs (which are the .wav file names
        without the file extension).
    audio_data_folder : str
        Folder containing the ``fold<N>`` sub-folders with the audio files.
    folds_list : list of int
        The list of folds [1,10] to include in this batch
    json_file : str
        The path of the output json file. Missing parent directories are
        created.
    """
    audio_root = os.path.abspath(audio_data_folder)

    # Processing all the wav files in the list
    json_dict = {}
    for ID, sample_metadata in metadata.items():
        fold_num = int(sample_metadata["fold"])
        if fold_num not in folds_list:
            continue

        wav_file = os.path.join(
            audio_root, f"fold{fold_num}", sample_metadata["slice_file_name"],
        )
        try:
            # Reading the signal (to retrieve duration in seconds)
            signal = read_audio(wav_file)
            file_info = torchaudio.info(wav_file)

            # If we're using sox/soundfile backend, file_info will have the old type
            if isinstance(file_info, torchaudio.backend.common.AudioMetaData):
                duration = signal.shape[0] / file_info.sample_rate
            else:
                duration = signal.shape[0] / file_info[0].rate

            # Create entry for this sample ONLY if we have successfully
            # read-in the file using SpeechBrain/torchaudio
            json_dict[ID] = {
                "wav": sample_metadata["slice_file_name"],
                "classID": int(sample_metadata["classID"]),
                "class_string": sample_metadata["class_string"],
                "salience": int(sample_metadata["salience"]),
                "fold": sample_metadata["fold"],
                "duration": duration,
            }
        except Exception:
            # Deliberate best-effort: a sample that cannot be processed is
            # left out of the manifest altogether (no entry is written).
            print(
                f"There was a problem reading the file:{wav_file}. Skipping it."
            )
            logger.exception(
                f"There was a problem reading the file:{wav_file}. Skipping it."
            )

    # Writing the dictionary to the json file
    # Need to make sure sub folder "manifest" exists, if not create it.
    # dirname is "" when json_file has no directory part; nothing has to be
    # created then (os.mkdir("") would raise).
    parent_dir = os.path.dirname(json_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(json_file, mode="w") as json_f:
        json.dump(json_dict, json_f, indent=2)

    logger.info(f"{json_file} successfully created!")
def create_csv(
    wav_lst,
    csv_file,
    uppercase,
    data_folder,
    phn_set,
    kaldi_lab=None,
    kaldi_lab_opts=None,
    kaldi_lab_dir=None,
):
    """
    Creates the csv file given a list of wav files.

    Arguments
    ---------
    wav_lst : list
        The list of wav files of a given data split.
    csv_file : str
        The path of the output csv file
    uppercase : bool
        Whether this is the uppercase version of timit.
    data_folder : str
        The location of the data (only used in error messages).
    phn_set : {60, 48, 39}
        The phoneme set to use; forwarded to ``get_phoneme_lists``.
    kaldi_lab : str, optional
        Default: None
        The path of the kaldi labels (optional).
    kaldi_lab_opts : str, optional
        Default: None
        A string containing the options used to compute the labels.
    kaldi_lab_dir : str, optional
        Default: None
        Folder where one ``<snt_id>.pkl`` label file per sentence is saved
        when ``kaldi_lab`` is given.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If more than 5% of the sentences have no kaldi label.
    FileNotFoundError
        If the word (.wrd) or phoneme (.phn) file of an utterance is missing.
    """
    # Adding some Prints
    msg = "Creating csv lists in %s..." % (csv_file)
    logger.info(msg)

    # Reading kaldi labels if needed:
    snt_no_lab = 0
    if kaldi_lab is not None:
        lab = read_kaldi_lab(kaldi_lab, kaldi_lab_opts,)
        os.makedirs(kaldi_lab_dir, exist_ok=True)

    header = [
        "ID",
        "duration",
        "wav",
        "wav_format",
        "wav_opts",
        "spk_id",
        "spk_id_format",
        "spk_id_opts",
        "phn",
        "phn_format",
        "phn_opts",
        "wrd",
        "wrd_format",
        "wrd_opts",
        "ground_truth_phn_ends",
        "ground_truth_phn_ends_format",
        "ground_truth_phn_ends_opts",
    ]
    if kaldi_lab is not None:
        header.extend(["kaldi_lab", "kaldi_lab_format", "kaldi_lab_opts"])
    csv_lines = [header]

    # Extensions of the audio, word and phoneme files for this TIMIT flavour
    if uppercase:
        wav_ext, wrd_ext, phn_ext = ".WAV", ".WRD", ".PHN"
    else:
        wav_ext, wrd_ext, phn_ext = ".wav", ".wrd", ".phn"

    # Processing all the wav files in the list
    for wav_file in wav_lst:

        # Getting sentence and speaker ids
        path_parts = wav_file.split("/")
        spk_id = path_parts[-2]
        snt_id = spk_id + "_" + path_parts[-1].replace(".wav", "")

        if kaldi_lab is not None:
            if snt_id not in lab:
                msg = (
                    "The sentence %s does not have a corresponding "
                    "kaldi label" % (snt_id)
                )
                logger.info(msg)
                snt_no_lab = snt_no_lab + 1

                # If too many kaldi labels are missing raise an error
                if snt_no_lab / len(wav_lst) > 0.05:
                    err_msg = (
                        "Too many sentences do not have the "
                        "corresponding kaldi label. Please check data and "
                        "kaldi labels (check %s and %s)."
                        % (data_folder, kaldi_lab)
                    )
                    logger.error(err_msg)
                    raise ValueError(err_msg)

                # A sentence without label cannot get a csv line: skip it.
                # The decision is taken per sentence, so later sentences
                # that do have a label are still processed.
                continue

            snt_lab_path = os.path.join(kaldi_lab_dir, snt_id + ".pkl")
            save_pkl(lab[snt_id], snt_lab_path)

        # Reading the signal (to retrieve duration in seconds)
        signal = read_audio(wav_file)
        duration = len(signal) / SAMPLERATE

        # Retrieving words (third space-separated field of every line).
        # The file itself is checked, not just its directory, and it is
        # closed as soon as it has been read.
        wrd_file = wav_file.replace(wav_ext, wrd_ext)
        if not os.path.exists(wrd_file):
            err_msg = "the wrd file %s does not exists!" % (wrd_file)
            raise FileNotFoundError(err_msg)

        with open(wrd_file) as wrd_f:
            words = " ".join(
                line.rstrip("\n").split(" ")[2] for line in wrd_f
            )

        # Retrieving phonemes
        phn_file = wav_file.replace(wav_ext, phn_ext)
        if not os.path.exists(phn_file):
            err_msg = "the phn file %s does not exists!" % (phn_file)
            raise FileNotFoundError(err_msg)

        # Getting the phoneme and ground truth ends lists from the phn files
        phonemes, ends = get_phoneme_lists(phn_file, phn_set)

        # Composition of the csv_line
        csv_line = [
            snt_id,
            str(duration),
            wav_file,
            "wav",
            "",
            spk_id,
            "string",
            "",
            str(phonemes),
            "string",
            "",
            str(words),
            "string",
            "label:False",
            str(ends),
            "string",
            "label:False",
        ]

        if kaldi_lab is not None:
            csv_line.extend([snt_lab_path, "pkl", ""])

        # Adding this line to the csv_lines list
        csv_lines.append(csv_line)

    # Writing the csv lines
    _write_csv(csv_lines, csv_file)
    msg = "%s sucessfully created!" % (csv_file)
    logger.info(msg)
def get_wsj_files(wsj0root, output_dir, save_fs="wav8k", min_maxs=["min"]):
    """
    This function constructs the wsj0-2mix dataset out of wsj0 dataset.
    (We are assuming that we have the wav files and not the sphere format)

    Arguments
    ---------
    wsj0root : str
        This string specifies the root folder for the wsj0 dataset.
    output_dir : str
        The string that species the save folder.
    save_fs : str
        The string that specifies the saving sampling frequency,
        in ['wav8k', 'wav16k'].
    min_maxs : list
        The list that contains the specification on whether we take min. or
        max. of signals to construct the mixtures. example: ["min", "max"].
        The list is only read, never modified.

    Raises
    ------
    ValueError
        If ``save_fs`` is not one of 'wav8k' / 'wav16k'. The check is done
        before any folder is created or any file is downloaded.
    """
    if save_fs not in ("wav8k", "wav16k"):
        raise ValueError("Incorrect sampling frequency for saving")

    data_types = ["tr", "cv", "tt"]  # train, valid and test sets

    from oct2py import octave

    filedir = os.path.dirname(os.path.realpath(__file__))
    octave.addpath(
        filedir + "/meta"
    )  # add the matlab functions to octave dir here

    fs_read = 8000 if save_fs == "wav8k" else 16000

    # Creates output_dir, output_dir/save_fs and the log folder in one go
    log_dir = os.path.join(output_dir, save_fs + "/mixture_definitions_log")
    os.makedirs(log_dir, exist_ok=True)

    # get the the text files in the current working directory
    filelinks = [
        "https://www.dropbox.com/s/u5gk5h3htzw4cgo/mix_2_spk_tr.txt?dl=1",
        "https://www.dropbox.com/s/s3s6311d95n4sip/mix_2_spk_cv.txt?dl=1",
        "https://www.dropbox.com/s/9kdxb2uz18a5k9d/mix_2_spk_tt.txt?dl=1",
    ]
    for filelink, data_type in zip(filelinks, data_types):
        filepath = os.path.join(
            filedir, "meta", "mix_2_spk_" + data_type + ".txt"
        )
        if not os.path.exists(filepath):
            download_file(filelink, filepath)

    inner_folders = ["s1", "s2", "mix"]
    for min_max in min_maxs:
        for data_type in data_types:
            save_dir = os.path.join(
                output_dir, save_fs + "/" + min_max + "/" + data_type
            )
            for inner_folder in inner_folders:
                os.makedirs(
                    os.path.join(save_dir, inner_folder), exist_ok=True
                )

            TaskFile = os.path.join(
                filedir, "meta", "mix_2_spk_" + data_type + ".txt"
            )
            Source1File, Source2File, MixFile, C = arrange_task_files(
                TaskFile, min_max, data_type, log_dir
            )

            num_files = len(C)
            print("{} \n".format(min_max + "_" + data_type))

            # NOTE(review): the scaling values are overwritten for every
            # mixture, so the pickle written below only holds the values of
            # the last processed mixture.
            scaling = scaling16bit = None

            # The three log files are closed when the block exits, even if
            # processing a mixture fails.
            with open(Source1File, "w") as fid_s1, open(
                Source2File, "w"
            ) as fid_s2, open(MixFile, "w") as fid_m:
                for i, line in tqdm(enumerate(C)):

                    # Each path must have exactly four "/"-separated parts
                    _, _, _, inwav1_name = line[0].split("/")
                    _, _, _, inwav2_name = line[2].split("/")

                    # write the log data to the log files
                    fid_s1.write("{}\n".format(line[0]))
                    fid_s2.write("{}\n".format(line[2]))

                    inwav1_snr = line[1]
                    inwav2_snr = line[3]

                    mix_name = (
                        inwav1_name
                        + "_"
                        + str(inwav1_snr)
                        + "_"
                        + inwav2_name
                        + "_"
                        + str(inwav2_snr)
                    )
                    fid_m.write("{}\n".format(mix_name))

                    source1_path = os.path.join(wsj0root, line[0])
                    source2_path = os.path.join(wsj0root, line[2])

                    # Sampling rate of the input, taken from the first source
                    fs, _ = wavfile.read(source1_path)
                    s1 = read_audio(source1_path)
                    s2 = read_audio(source2_path)

                    # resample, determine levels for source 1
                    s1_8k = signal.resample(s1, int((fs_read / fs) * len(s1)))
                    out = octave.activlev(s1_8k, fs_read, "n")
                    s1_8k, lev1 = out[:-1].squeeze(), out[-1]

                    # resample, determine levels for source 2
                    s2_8k = signal.resample(s2, int((fs_read / fs) * len(s2)))
                    out = octave.activlev(s2_8k, fs_read, "n")
                    s2_8k, lev2 = out[:-1].squeeze(), out[-1]

                    weight_1 = 10 ** (float(inwav1_snr) / 20)
                    weight_2 = 10 ** (float(inwav2_snr) / 20)

                    if save_fs == "wav8k":
                        source_1 = weight_1 * s1_8k
                        source_2 = weight_2 * s2_8k
                    else:
                        # save_fs == "wav16k": apply same gain to 16 kHz file
                        source_1 = weight_1 * s1 / np.sqrt(lev1)
                        source_2 = weight_2 * s2 / np.sqrt(lev2)

                    scaling, scaling16bit = save_mixture(
                        source_1,
                        source_2,
                        min_max,
                        weight_1,
                        weight_2,
                        num_files,
                        lev1,
                        lev2,
                        save_fs,
                        output_dir,
                        data_type,
                        mix_name,
                        i,
                    )

            # Nothing to store when no mixture was processed
            if num_files:
                # Key names are kept exactly as consumers may expect them
                if save_fs == "wav8k":
                    scaling_dict = {
                        "scaling_8k": scaling,
                        "scaling8bit_8k": scaling16bit,
                    }
                else:
                    scaling_dict = {
                        "scaling_16k": scaling,
                        "scaling16bit_16k": scaling16bit,
                    }

                scaling_path = (
                    output_dir
                    + "/"
                    + save_fs
                    + "/"
                    + min_max
                    + "/"
                    + data_type
                    + "/scaling.pkl"
                )
                with open(scaling_path, "wb") as scaling_f:
                    pickle.dump(scaling_dict, scaling_f)
def create_csv(
    csv_file,
    wav_lst,
    seg_size=None,
    has_target=False,
    noise_csv=None,
    noisy_folder=None,
    noise_snr_low=0,
    noise_snr_high=0,
):
    """
    Builds a csv manifest from a list of wav files.

    Arguments
    ---------
    csv_file : str
        The path of the output csv file.
    wav_lst : list
        The list of wav files of a given data split.
    seg_size : int
        When set, files at least this long are listed as consecutive
        fixed-length segments. It is compared with the duration computed as
        ``samples / SAMPLERATE`` and multiplied by ``SAMPLERATE`` to obtain
        sample indexes, i.e. it is expressed in the same unit as the duration.
    has_target : bool
        Whether clean utterances are present in a sibling "clean" directory.
    noise_csv : str
        A set of noise files to mix with the signals in `wav_lst`.
    noisy_folder : str
        A location for storing the mixed samples, if `noise_csv` is provided.
    noise_snr_low : float
        Passed to ``AddNoise`` as ``snr_low``.
    noise_snr_high : float
        Passed to ``AddNoise`` as ``snr_high``.

    Raises
    ------
    ValueError
        If both `noise_csv` and `has_target` are given.
    """
    if noise_csv and has_target:
        raise ValueError("Expected only one of `noise_csv` and `has_target`")

    logger.info("Creating csv list: %s" % csv_file)

    header = ["ID", "duration", "wav", "wav_format", "wav_opts"]
    if noise_csv or has_target:
        header.extend(["target", "target_format", "target_opts"])
    csv_lines = [header]

    if noise_csv:
        if not os.path.exists(noisy_folder):
            os.makedirs(noisy_folder)

        noise_adder = AddNoise(
            csv_file=noise_csv,
            snr_low=noise_snr_low,
            snr_high=noise_snr_high,
            pad_noise=True,
            normalize=True,
        )

    for wav_file in wav_lst:
        full_file_name = os.path.basename(wav_file)

        if has_target:
            # Clean file: <grandparent of wav>/clean/clean_fileid_<suffix>,
            # where <suffix> is what follows the last "_" of the file name.
            suffix = full_file_name.split("_")[-1]
            clean_dir = os.path.join(
                os.path.dirname(os.path.dirname(wav_file)), "clean"
            )
            target_file = os.path.join(clean_dir, "clean_fileid_" + suffix)

        # Duration of the original signal
        audio = read_audio(wav_file)
        duration = audio.shape[0] / SAMPLERATE

        if noise_csv:
            # The original file becomes the target; the noisy mixture is
            # saved in noisy_folder and becomes the listed wav.
            clean = torch.Tensor(audio).unsqueeze(0)
            audio = noise_adder(clean, torch.ones(1))
            noisy_path = os.path.join(noisy_folder, full_file_name)
            torchaudio.save(noisy_path, audio, SAMPLERATE)
            target_file = wav_file
            wav_file = noisy_path

        if seg_size and duration >= seg_size:
            # One line per complete segment; a trailing partial segment is
            # not listed.
            for idx in range(int(duration // seg_size)):
                start = int(idx * seg_size * SAMPLERATE)
                stop = int((idx + 1) * seg_size * SAMPLERATE)
                span = "start:{} stop:{}".format(start, stop)

                row = [
                    full_file_name + str(idx),
                    str(seg_size),
                    wav_file,
                    "wav",
                    span,
                ]
                if noise_csv or has_target:
                    row.extend([target_file, "wav", span])
                csv_lines.append(row)
        else:
            # Whole file on a single line
            row = [full_file_name, str(duration), wav_file, "wav", ""]
            if noise_csv or has_target:
                row.extend([target_file, "wav", ""])
            csv_lines.append(row)

    _write_csv(csv_lines, csv_file)
    logger.info("%s successfully created!" % csv_file)
def prepare_TAS(data_folder, save_folder, type, train_splits, skip_prep=False):
    """
    This function prepares the Timers and Such dataset.
    If the folder does not exist, the zip file will be extracted. If the zip
    file does not exist, it will be downloaded.

    Arguments
    ---------
    data_folder : str
        Path to Timers and Such dataset.
    save_folder : str
        Path where to save the csv manifest files.
    type : str
        One of the following:
          "direct":{input=audio, output=semantics}
          "multistage":{input=audio, output=semantics} (using ASR transcripts
          in the middle)
          "decoupled":{input=transcript, output=semantics} (using ground-truth
          transcripts)
        "joint-transcript-semantics" and "joint-semantics-transcript" are also
        handled: the transcript is then embedded in the semantics string.
    train_splits : list
        Splits to be joined to form the train .csv
    skip_prep : bool
        If True, skip data preparation.

    Raises
    ------
    ModuleNotFoundError
        If ``type`` is "decoupled" and the inflect module is not installed.
    """
    if skip_prep:
        return
    if type == "decoupled":
        try:
            import inflect

            p = inflect.engine()
        except ModuleNotFoundError:
            logger.info(
                'Error: the inflect module must be installed to run the "decoupled" SLU recipe.'
            )
            logger.info("Install using `pip install inflect`.")
            raise

    # If the data folders do not exist, we need to extract the data
    if not os.path.isdir(os.path.join(data_folder, "train-synth")):
        # Check for zip file and download if it doesn't exist
        zip_location = os.path.join(data_folder, "timers-and-such.zip")
        if not os.path.exists(zip_location):
            url = "https://zenodo.org/record/4623772/files/timers-and-such-v1.0.zip?download=1"
            download_file(url, zip_location, unpack=True)
        else:
            logger.info("Extracting timers-and-such.zip...")
            shutil.unpack_archive(zip_location, data_folder)

    splits = [
        "train-real",
        "dev-real",
        "test-real",
        "train-synth",
        "dev-synth",
        "test-synth",
    ]
    ID_start = 0  # needed to have a unique ID for each audio
    for split in splits:
        new_filename = os.path.join(save_folder, split) + "-type=%s.csv" % type
        if os.path.exists(new_filename):
            # The manifest has one row per utterance: advance the ID offset
            # as if the split had just been written, so that the IDs of the
            # splits prepared now cannot collide with the existing ones.
            ID_start += len(pd.read_csv(new_filename))
            continue
        logger.info("Preparing %s..." % new_filename)

        ID = []
        duration = []
        wav = []
        spk_id = []
        semantics = []
        transcript = []

        df = pd.read_csv(os.path.join(data_folder, split) + ".csv")
        rows = zip(df.path, df.speakerId, df.transcription, df.semantics)
        for i, (rel_path, speaker, transcript_, raw_semantics) in enumerate(
            rows
        ):
            ID.append(ID_start + i)

            wav_path = os.path.join(data_folder, rel_path)
            signal = read_audio(wav_path)
            duration.append(signal.shape[0] / 16000)
            wav.append(wav_path)

            spk_id.append(speaker)

            if type == "decoupled":
                # Upper-case and spell out numbers, to match what the ASR
                # would output.
                words = [
                    _tas_spell_out_word(word, p)
                    for word in transcript_.split()
                ]
                transcript_ = " ".join(words).replace("-", " ")
            transcript.append(transcript_)

            semantics_ = raw_semantics.replace(
                ".3333333333333333", ".33"
            )  # Fix formatting error in some labels
            if type in ("direct", "multistage", "decoupled"):
                semantics.append(semantics_)
            if type == "joint-transcript-semantics":
                semantics.append(
                    "{'transcript': '" + transcript_ + "'| " + semantics_[1:]
                )
            if type == "joint-semantics-transcript":
                semantics.append(
                    semantics_[:-1] + "| 'transcript': '" + transcript_ + "'}"
                )

        new_df = pd.DataFrame(
            {
                "ID": ID,
                "duration": duration,
                "wav": wav,
                "spk_id": spk_id,
                "semantics": semantics,
                "transcript": transcript,
            }
        )
        new_df.to_csv(new_filename, index=False)
        ID_start += len(df)

    # Merge train splits
    train_csvs = [split + "-type=%s.csv" % type for split in train_splits]
    merge_csvs(save_folder, train_csvs, "train-type=%s.csv" % type)

    # Create "all-real" split
    real_csvs = [
        split + "-type=%s.csv" % type
        for split in ["train-real", "dev-real", "test-real"]
    ]
    merge_csvs(save_folder, real_csvs, "all-real-type=%s.csv" % type)


def _tas_spell_out_word(word, engine):
    """Upper-case one transcript word, spelling out the numbers it contains.

    ``engine`` must provide ``number_to_words`` (an ``inflect`` engine).
    Words without digits are only upper-cased. Words with digits and an
    "AM"/"PM" marker become "<hour> [<minute>] A M" / "... P M"; any other
    word with digits is passed to ``number_to_words`` as a whole.
    """
    word = word.upper()
    if not any(c.isdigit() for c in word):
        return word

    if "AM" in word:
        marker, spoken_marker = "AM", "A M"
    elif "PM" in word:
        marker, spoken_marker = "PM", "P M"
    else:
        return engine.number_to_words(word).upper()

    if ":" in word:
        pieces = word.split(":")
        hour = pieces[0]
        minute = pieces[1].split(marker)[0]
        return (
            engine.number_to_words(hour).upper()
            + " "
            + engine.number_to_words(minute).upper()
            + " "
            + spoken_marker
        )

    hour = word.split(marker)[0]
    return engine.number_to_words(hour).upper() + " " + spoken_marker
def create_csv(wav_lst, csv_file, clean_folder, txt_folder, lexicon):
    """
    Creates the csv file given a list of wav files.

    Arguments
    ---------
    wav_lst : list
        The list of wav files.
    csv_file : str
        The path of the output csv file
    clean_folder : str
        The location of parallel clean samples.
    txt_folder : str
        The location of the transcript files.
    lexicon : mapping
        Indexed with each upper-cased transcript word; the values are joined
        with spaces to form the phone string of the sentence.
    """
    logger.debug(f"Creating csv lists in {csv_file}")

    header = ["ID", "duration"]
    for field in ("noisy_wav", "clean_wav", "wrd", "phn", "biphn"):
        header.extend([field, field + "_format", field + "_opts"])
    csv_lines = [header]

    # Processing all the wav files in the list
    for wav_file in wav_lst:  # ex:p203_122.wav

        # Example wav_file: p232_001.wav
        snt_id = os.path.basename(wav_file).replace(".wav", "")
        clean_wav = os.path.join(clean_folder, snt_id + ".wav")

        # Reading the signal (to retrieve duration in seconds)
        signal = read_audio(wav_file)
        duration = signal.shape[0] / SAMPLERATE

        # Read text
        with open(os.path.join(txt_folder, snt_id + ".txt")) as f:
            words = f.read()

        words = remove_punctuation(words).strip().upper()
        phones = " ".join([lexicon[word] for word in words.split()])

        # Each phone is paired with its successor; "<B>" / "<E>" mark the
        # beginning and the end of the sentence.
        phone_list = phones.split()
        biphones = [
            phn1 + phn2
            for phn1, phn2 in zip(["<B>"] + phone_list, phone_list + ["<E>"])
        ]

        # Composition of the csv_line
        # NOTE(review): biphones is a list, so the csv writer stores its
        # Python representation, unlike the space-joined "phn" field —
        # confirm that this is what the readers of the file expect.
        csv_line = [snt_id, str(duration)]
        csv_line.extend([wav_file, "wav", ""])
        csv_line.extend([clean_wav, "wav", ""])
        csv_line.extend([words, "string", ""])
        csv_line.extend([phones, "string", ""])
        csv_line.extend([biphones, "string", ""])

        # Adding this line to the csv_lines list
        csv_lines.append(csv_line)

    # Writing the csv lines. newline="" lets the csv module control the line
    # endings itself, as its documentation requires (otherwise an extra "\r"
    # is written on platforms that translate "\n").
    with open(csv_file, mode="w", newline="") as csv_f:
        csv_writer = csv.writer(
            csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL
        )
        csv_writer.writerows(csv_lines)

    print(f"{csv_file} successfully created!")
def _parse_slurp_entities(sentence_annotation):
    """Return the ``[type : filler]`` slots of a SLURP sentence annotation.

    Each slot is the text between a "[" and the next "]"; the part before
    the first ":" is the entity type and the rest is the filler. Both are
    stripped of surrounding whitespace. A slot without ":" raises IndexError.
    """
    entities = []
    # The chunk before the first "[" is plain text; every later chunk
    # starts with the content of one slot.
    for chunk in sentence_annotation.split("[")[1:]:
        slot = chunk.split("]")[0]
        # Split on the first colon only so a filler containing ":" is kept whole.
        fields = slot.split(":", 1)
        entities.append(
            {"type": fields[0].strip(), "filler": fields[1].strip()}
        )
    return entities


def prepare_SLURP(data_folder, save_folder, slu_type, train_splits, skip_prep=False):
    """
    This function prepares the SLURP dataset.
    If the folder does not exist, the zip file will be extracted. If the zip file does not exist, it will be downloaded.

    data_folder : path to SLURP dataset.
    save_folder: path where to save the csv manifest files.
    slu_type : one of the following:

      "direct":{input=audio, output=semantics}
      "multistage":{input=audio, output=semantics} (using ASR transcripts in the middle)
      "decoupled":{input=transcript, output=semantics} (using ground-truth transcripts)

    train_splits : list of splits to be joined to form train .csv
    skip_prep: If True, data preparation is skipped.
    """
    if skip_prep:
        return

    # If the data folders do not exist, we need to download/extract the data
    if not os.path.isdir(os.path.join(data_folder, "slurp_synth")):
        # Check for zip file and download if it doesn't exist
        zip_location = os.path.join(data_folder, "slurp_synth.tar.gz")
        if not os.path.exists(zip_location):
            url = "https://zenodo.org/record/4274930/files/slurp_synth.tar.gz?download=1"
            download_file(url, zip_location, unpack=True)
        else:
            print("Extracting slurp_synth...")
            shutil.unpack_archive(zip_location, data_folder)

    if not os.path.isdir(os.path.join(data_folder, "slurp_real")):
        # Check for zip file and download if it doesn't exist
        zip_location = os.path.join(data_folder, "slurp_real.tar.gz")
        if not os.path.exists(zip_location):
            url = "https://zenodo.org/record/4274930/files/slurp_real.tar.gz?download=1"
            download_file(url, zip_location, unpack=True)
        else:
            print("Extracting slurp_real...")
            shutil.unpack_archive(zip_location, data_folder)

    splits = [
        "train_real",
        "train_synthetic",
        "devel",
        "test",
    ]
    next_id = 0  # running counter so every recording gets a unique ID
    for split in splits:
        new_filename = (
            os.path.join(save_folder, split) + "-type=%s.csv" % slu_type
        )
        if os.path.exists(new_filename):
            # Advance the counter past the rows already written for this
            # split; otherwise splits prepared later would restart at 0 and
            # collide with it (e.g. when the train csvs are merged below).
            next_id += len(pd.read_csv(new_filename, usecols=["ID"]))
            continue
        print("Preparing %s..." % new_filename)

        IDs = []
        duration = []
        wav = []
        semantics = []
        transcript = []

        jsonl_path = os.path.join(data_folder, split + ".jsonl")
        if not os.path.isfile(jsonl_path):
            # The real training annotations are published as "train.jsonl".
            url_split = "train" if split == "train_real" else split
            url = (
                "https://github.com/pswietojanski/slurp/raw/master/dataset/slurp/"
                + url_split
                + ".jsonl"
            )
            download_file(url, jsonl_path, unpack=False)

        # The audio sub-folder depends only on the split name.
        audio_folder = "slurp_synth/" if "synthetic" in split else "slurp_real/"

        with jsonlines.open(jsonl_path) as reader:
            for obj in reader:
                scenario = obj["scenario"]
                action = obj["action"]
                entities = _parse_slurp_entities(obj["sentence_annotation"])

                for recording in obj["recordings"]:
                    IDs.append(next_id)
                    path = os.path.join(
                        data_folder, audio_folder, recording["file"]
                    )
                    signal = read_audio(path)
                    duration.append(signal.shape[0] / 16000)
                    wav.append(path)

                    transcript_ = obj["sentence"]
                    if slu_type == "decoupled":
                        transcript_ = transcript_.upper()
                    transcript.append(transcript_)

                    semantics_dict = {
                        "scenario": scenario,
                        "action": action,
                        "entities": entities,
                    }
                    # Commas in dict will make using csv files tricky;
                    # replace with pipe.
                    semantics.append(str(semantics_dict).replace(",", "|"))
                    next_id += 1

        df = pd.DataFrame(
            {
                "ID": IDs,
                "duration": duration,
                "wav": wav,
                "semantics": semantics,
                "transcript": transcript,
            }
        )
        df.to_csv(new_filename, index=False)

    # Merge train splits
    train_splits = [
        split + "-type=%s.csv" % slu_type for split in train_splits
    ]
    merge_csvs(save_folder, train_splits, "train-type=%s.csv" % slu_type)
# Fragment of a script that merges single-channel recordings under
# TIMIT_4_channels/ into 4-channel files under TIMIT_combined/.
# NOTE(review): dir_test, train_df, test_df and fs are defined outside this view.

# De-duplicate the test directory entries, strip the
# "TIMIT_4_channels/test/" prefix and keep only remainders longer than
# 3 characters before mirroring them under TIMIT_combined/test/.
dir_test = set(dir_test)
dir_test = [i[len("TIMIT_4_channels/test/") :] for i in dir_test]
dir_test = [i for i in dir_test if len(i) > 3]
for i in dir_test:
    # No exist_ok: this raises FileExistsError if the directory already exists.
    os.makedirs("TIMIT_combined/test/" + i)

# Every fourth row starts a group: rows i..i+3 are read as the four
# microphone signals of one utterance.
# Assumes the table lists the four channels consecutively and that its
# length is a multiple of 4 — TODO confirm.
for i in range(len(train_df)):
    if i % 4 == 0:
        # Output name: path relative to the train root, cut at the first
        # "." and then at the first "_", with ".wav" appended.
        fname = (
            train_df["location"][i][len("TIMIT_4_channels/train/") :]
            .split(".")[0]
            .split("_")[0]
            + ".wav"
        )
        mic1 = read_audio(train_df["location"][i])
        mic2 = read_audio(train_df["location"][i + 1])
        mic3 = read_audio(train_df["location"][i + 2])
        mic4 = read_audio(train_df["location"][i + 3])
        # Stack the four signals on a new leading axis, then swap axes 0 and 1.
        sa = torch.stack((mic1, mic2, mic3, mic4)).transpose(0, 1)
        write_audio("TIMIT_combined/train/" + fname, sa, samplerate=fs)

# Same grouping for the test table.
# NOTE(review): this loop appears truncated in this view; only the first
# microphone is read and nothing is written.
for i in range(len(test_df)):
    if i % 4 == 0:
        fname = (
            test_df["location"][i][len("TIMIT_4_channels/test/") :]
            .split(".")[0]
            .split("_")[0]
            + ".wav"
        )
        mic1 = read_audio(test_df["location"][i])
def prepare_aishell(data_folder, save_folder, skip_prep=False):
    """
    This function prepares the AISHELL-1 dataset.
    If the folder does not exist, the zip file will be extracted. If the zip file does not exist, it will be downloaded.

    data_folder : path to AISHELL-1 dataset.
    save_folder: path where to store the manifest csv files.
    skip_prep: If True, skip data preparation.
    """
    if skip_prep:
        return

    wav_dir = os.path.join(data_folder, "data_aishell/wav")

    # If the data folders do not exist, we need to extract the data
    if not os.path.isdir(wav_dir):
        # Check for zip file and download if it doesn't exist
        zip_location = os.path.join(data_folder, "data_aishell.tgz")
        if not os.path.exists(zip_location):
            url = "https://www.openslr.org/resources/33/data_aishell.tgz"
            # Download only: the archive is unpacked exactly once just below,
            # whether it was freshly downloaded or already present.
            download_file(url, zip_location, unpack=False)
        logger.info("Extracting data_aishell.tgz...")
        shutil.unpack_archive(zip_location, data_folder)

        # The main archive contains further per-speaker archives; unpack
        # each in place and delete it afterwards.
        for tgz in glob.glob(wav_dir + "/*.tar.gz"):
            shutil.unpack_archive(tgz, wav_dir)
            os.remove(tgz)

    # Create filename-to-transcript dictionary
    filename2transcript = {}
    transcript_path = os.path.join(
        data_folder, "data_aishell/transcript/aishell_transcript_v0.8.txt"
    )
    # Explicit UTF-8: the transcripts are Chinese text and must not depend
    # on the platform's default encoding.
    with open(transcript_path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Tolerate blank lines instead of failing on fields[0].
                continue
            filename2transcript[fields[0]] = " ".join(fields[1:])

    splits = [
        "train",
        "dev",
        "test",
    ]
    ID_start = 0  # needed to have a unique ID for each audio
    for split in splits:
        new_filename = os.path.join(save_folder, split) + ".csv"
        all_wavs = glob.glob(wav_dir + "/" + split + "/*/*.wav")

        if os.path.exists(new_filename):
            # Still reserve this split's ID range so that splits prepared
            # later do not reuse IDs already present in the existing csv.
            ID_start += len(all_wavs)
            continue
        logger.info("Preparing %s..." % new_filename)

        ID = []
        duration = []
        wav = []
        transcript = []

        for i, wav_path in enumerate(all_wavs):
            # Utterance key = file name without directory and extension.
            filename = os.path.splitext(os.path.basename(wav_path))[0]
            if filename not in filename2transcript:
                # Recordings without a transcript are left out of the csv.
                continue

            transcript.append(filename2transcript[filename])
            ID.append(ID_start + i)

            signal = read_audio(wav_path)
            duration.append(signal.shape[0] / 16000)
            wav.append(wav_path)

        new_df = pd.DataFrame(
            {
                "ID": ID,
                "duration": duration,
                "wav": wav,
                "transcript": transcript,
            }
        )
        new_df.to_csv(new_filename, index=False)
        ID_start += len(all_wavs)
def prepare_FSC(data_folder, skip_prep=False):
    """
    This function prepares the Fluent Speech Commands dataset.

    For each of the "train", "valid" and "test" splits it reads
    ``<data_folder>/data/<split>_data.csv`` and writes the manifest
    ``<data_folder>/<split>.csv``. Splits whose manifest already exists
    are left untouched.

    data_folder : path to dataset.
    skip_prep: If True, skip data preparation
    """
    if skip_prep:
        return

    splits = [
        "train",
        "valid",
        "test",
    ]
    ID_start = 0  # needed to have a unique ID for each audio
    for split in splits:
        new_filename = os.path.join(data_folder, split) + ".csv"
        if os.path.exists(new_filename):
            # Skip past the IDs already used by this manifest so that splits
            # prepared later do not restart from the same offset.
            ID_start += len(pd.read_csv(new_filename, usecols=["ID"]))
            continue
        logger.info("Preparing %s..." % new_filename)

        df = pd.read_csv(os.path.join(data_folder, "data", split) + "_data.csv")
        num_rows = len(df)

        ID = []
        duration = []
        wav = []
        spk_id = []
        semantics = []
        transcript = []

        rows = zip(
            df["path"],
            df["speakerId"],
            df["transcription"],
            df["action"],
            df["object"],
            df["location"],
        )
        for i, (rel_path, speaker, text, action, obj, location) in enumerate(rows):
            ID.append(ID_start + i)

            wav_path = os.path.join(data_folder, rel_path)
            signal = read_audio(wav_path)
            duration.append(signal.shape[0] / 16000)
            wav.append(wav_path)

            spk_id.append(speaker)
            transcript.append(text)

            # Dict-like string with "|" instead of "," so the value does not
            # clash with the csv delimiter.
            # NOTE: the "action" key used to be emitted as `"action:" "..."`
            # (colon inside the quotes); manifests generated before this fix
            # carry the old spelling and should be regenerated.
            semantics.append(
                '{"action": "'
                + action
                + '"| "object": "'
                + obj
                + '"| "location": "'
                + location
                + '"}'
            )

        new_df = pd.DataFrame(
            {
                "ID": ID,
                "duration": duration,
                "wav": wav,
                "wav_format": ["wav"] * num_rows,
                "wav_opts": [None] * num_rows,
                "spk_id": spk_id,
                "spk_id_format": ["string"] * num_rows,
                "spk_id_opts": [None] * num_rows,
                "semantics": semantics,
                "semantics_format": ["string"] * num_rows,
                "semantics_opts": [None] * num_rows,
                "transcript": transcript,
                "transcript_format": ["string"] * num_rows,
                "transcript_opts": [None] * num_rows,
            }
        )
        new_df.to_csv(new_filename, index=False)
        ID_start += num_rows
def audio_pipeline(wav):
    """Load the audio referenced by ``wav`` via ``read_audio`` and return it."""
    return read_audio(wav)