def _convert_audio_and_split_sentences(extracted_dir, data_set, dest_dir):
    """Convert a data set's FLAC files to WAV and collect their transcripts.

    Every ``*.trans.txt`` file found below ``extracted_dir/data_set`` is read.
    The format of a file such as ``1-2.trans.txt`` is one utterance per line::

        1-2-0 transcription of 1-2-0.flac
        1-2-1 transcription of 1-2-1.flac
        ...

    For each line, the FLAC file named after the utterance id (located next to
    the transcription file) is converted to a WAV file in
    ``extracted_dir/dest_dir`` unless that WAV file already exists, and the
    transcript is normalized to lower-case ASCII. No per-utterance text files
    are written; the transcripts are only returned in the result.

    Args:
        extracted_dir: Root directory containing the extracted data.
        data_set: Sub-directory of ``extracted_dir`` that is searched.
        dest_dir: Sub-directory of ``extracted_dir`` receiving the WAV files;
            created if missing.

    Returns:
        A ``pandas.DataFrame`` with the columns ``wav_filename`` (absolute
        path), ``wav_filesize`` (bytes, from ``os.path.getsize``) and
        ``transcript``, one row per utterance line.
    """
    source_dir = os.path.join(extracted_dir, data_set)
    target_dir = os.path.join(extracted_dir, dest_dir)

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(target_dir, exist_ok=True)

    files = []
    for root, _dirnames, filenames in os.walk(source_dir):
        for filename in fnmatch.filter(filenames, "*.trans.txt"):
            trans_filename = os.path.join(root, filename)
            with codecs.open(trans_filename, "r", "utf-8") as fin:
                for line in fin:
                    line = line.strip()
                    if not line:
                        # Blank lines carry no utterance; previously they
                        # produced an empty id and a bogus ".flac" lookup.
                        continue

                    # Split "<seqid> <transcript>" on the first space. A line
                    # without any space yields an empty transcript instead of
                    # reusing the id as the transcript text.
                    seqid, _, transcript = line.partition(" ")

                    # We need to do the encode-decode dance here because encode
                    # returns a bytes() object on Python 3, and downstream code
                    # expects a string.
                    transcript = (
                        unicodedata.normalize("NFKD", transcript)
                        .encode("ascii", "ignore")
                        .decode("ascii", "ignore")
                    )
                    transcript = transcript.lower().strip()

                    # Convert the corresponding FLAC to a WAV (skipped when the
                    # WAV is already present from an earlier run).
                    flac_file = os.path.join(root, seqid + ".flac")
                    wav_file = os.path.join(target_dir, seqid + ".wav")
                    if not os.path.exists(wav_file):
                        tfm = Transformer()
                        tfm.set_output_format(rate=SAMPLE_RATE)
                        tfm.build(flac_file, wav_file)
                    wav_filesize = os.path.getsize(wav_file)

                    files.append(
                        (os.path.abspath(wav_file), wav_filesize, transcript)
                    )

    return pandas.DataFrame(
        data=files, columns=["wav_filename", "wav_filesize", "transcript"]
    )
def preprocess_wav(cls, fpath: Union[str, Path]) -> np.ndarray:
    """Read an audio file through sox and return it as a float32 array.

    The sox chain applies ``norm()``, then ``silence()`` with a threshold of 1
    and a minimum silence duration of 0.1, and requests ``cls.sample_rate`` Hz,
    16-bit, single-channel output.

    Args:
        fpath: Path of the audio file to load.

    Returns:
        The decoded samples divided by ``2**15`` and cast to ``np.float32``.
    """
    sox_tfm = Transformer()
    sox_tfm.norm()
    sox_tfm.silence(silence_threshold=1, min_silence_duration=0.1)
    sox_tfm.set_output_format(rate=cls.sample_rate, bits=16, channels=1)

    samples = sox_tfm.build_array(input_filepath=str(fpath))
    # Presumably maps 16-bit integer samples into [-1, 1) -- confirm the dtype
    # returned by build_array for bits=16.
    scaled = samples / (2**15)
    return scaled.astype(np.float32)
def loadFile(data, max_timestep):
    """Load one audio file as a float tensor, keeping at most the leading samples.

    The file is read through sox with ``norm()`` and converted to 16 kHz,
    16-bit, single-channel output, then divided by ``2**15``.

    Args:
        data: Path of the audio file (converted with ``str``).
        max_timestep: Maximum number of samples to keep; longer waveforms are
            cut to their first ``max_timestep`` samples.

    Returns:
        A ``(wav, length)`` tuple: the float tensor and its (possibly
        truncated) sample count as a long tensor.
    """
    sox_tfm = Transformer()
    sox_tfm.norm()
    # Silence trimming is disabled in this loader; call kept for reference:
    # sox_tfm.silence(silence_threshold=1, min_silence_duration=0.1)
    sox_tfm.set_output_format(rate=16000, bits=16, channels=1)

    samples = sox_tfm.build_array(input_filepath=str(data))
    waveform = torch.tensor(samples / (2**15)).float()

    n_samples = len(waveform)
    if n_samples > max_timestep:
        # Deterministic crop: always keep the beginning of the signal.
        waveform = waveform[:max_timestep]
        n_samples = max_timestep

    return waveform, torch.tensor(n_samples).long()
def sph_to_wav(source_dir, target_dir):
    """Convert every ``*.sph`` file in ``source_dir`` to a ``.wav`` in ``target_dir``.

    ``source_dir`` must exist (checked with ``assert``); ``target_dir`` is
    created when missing. Each output keeps the input's base name with a
    ``.wav`` extension.
    """
    assert path.exists(source_dir)

    if not path.exists(target_dir):
        makedirs(target_dir)

    for sph_path in glob(path.join(source_dir, "*.sph")):
        converter = Transformer()

        # An explicit output format (signed-integer, mono, configured rate) is
        # requested only when the configured rate differs from 16000.
        if hp.tedlium_rate != 16000:
            converter.set_output_format(
                encoding='signed-integer',
                channels=1,
                rate=hp.tedlium_rate,
            )

        stem, _ext = path.splitext(path.basename(sph_path))
        converter.build(sph_path, path.join(target_dir, stem + ".wav"))
def compressed_wav_to_full(source_dir, target_dir):
    """Re-encode every ``*.wav`` in ``source_dir`` into ``target_dir``.

    ``source_dir`` must exist (checked with ``assert``); ``target_dir`` is
    created when missing. Output files keep the input's base name and are
    written as signed-integer, single-channel audio.
    """
    assert path.exists(source_dir)

    if not path.exists(target_dir):
        makedirs(target_dir)

    for src_wav in glob(path.join(source_dir, "*.wav")):
        converter = Transformer()

        # Always force signed-integer mono output; additionally resample when
        # the configured rate is anything other than 8000.
        format_options = {"encoding": 'signed-integer', "channels": 1}
        if hp.callhome_rate != 8000:
            format_options["rate"] = hp.callhome_rate
        converter.set_output_format(**format_options)

        converter.build(src_wav, path.join(target_dir, path.basename(src_wav)))
def loadFile_thread_exec(data):
    """Load a batch of audio files as float tensors with random cropping.

    Each entry of ``data`` is read through sox with ``norm()`` and
    ``silence()`` (threshold 1, minimum duration 0.1), converted to 16 kHz,
    16-bit, single-channel output, and divided by ``2**15``. Waveforms longer
    than ``max_timestep`` (a name resolved outside this function) are cut to a
    randomly positioned window of ``max_timestep`` samples.

    Args:
        data: Indexable collection of audio file paths.

    Returns:
        A ``(wavs, lengths)`` tuple of two lists: the float tensors and their
        (possibly cropped) sample counts as long tensors.
    """
    wavs, lengths = [], []

    for idx in range(len(data)):
        clip_path = data[idx]

        sox_tfm = Transformer()
        sox_tfm.norm()
        sox_tfm.silence(silence_threshold=1, min_silence_duration=0.1)
        sox_tfm.set_output_format(rate=16000, bits=16, channels=1)

        samples = sox_tfm.build_array(input_filepath=str(clip_path))
        waveform = torch.tensor(samples / (2**15)).float()

        n_samples = len(waveform)
        if n_samples > max_timestep:
            # Pick a random window of exactly max_timestep samples.
            offset = random.randint(0, int(n_samples - max_timestep))
            waveform = waveform[offset:offset + max_timestep]
            n_samples = max_timestep

        wavs.append(waveform)
        lengths.append(torch.tensor(n_samples).long())

    return wavs, lengths
# !/usr/bin/python3
# Walk a directory tree and convert every .flac file to a .wav file written
# next to it, optionally deleting the .flac afterwards.
import os

import tqdm
from sox import Transformer

SAMPLE_RATE = 16000  # output sample rate passed to sox
remove_flac = False  # when True, each .flac is deleted after it is handled
path = '/home/dsmolen/agh/LibriSpeech/'

i = 0  # number of .flac files visited so far (shown on the progress bar)
tq = tqdm.tqdm(os.walk(path, topdown=False))
for root, dirs, files in tq:
    for name in files:
        if not name.endswith('.flac'):
            continue

        # Count first, then display: updating the postfix before the
        # increment made the bar lag by one and never show the final total.
        i += 1
        tq.set_postfix(converted=i)

        name = os.path.splitext(name)[0]
        flac_file = os.path.join(root, name + ".flac")
        wav_file = os.path.join(root, name + ".wav")

        # Skip the conversion when a WAV from an earlier run already exists.
        if not os.path.exists(wav_file):
            tfm = Transformer()
            tfm.set_output_format(rate=SAMPLE_RATE)
            tfm.build(flac_file, wav_file)

        if remove_flac:
            os.remove(flac_file)
import tensorflow as tf import pandas as pd import os import unicodedata import tqdm import logging import librosa import numpy as np import soundfile from sklearn.model_selection import train_test_split logging.basicConfig(level=logging.NOTSET) logging.getLogger('sox').setLevel(logging.ERROR) FLAGS = tf.compat.v1.app.flags.FLAGS tfm = Transformer() tfm.set_output_format(rate=16000) def main(_): source_dir = FLAGS.source_dir data = [] df_details = pd.read_csv(os.path.join(source_dir, "validated.tsv"), sep="\t", header=0) with tqdm.tqdm(total=len(df_details.index)) as bar: for i in df_details.index: file_name = df_details["path"][i] source_file = os.path.join(source_dir, "clips/" + file_name) wav_file = os.path.join( os.path.dirname(__file__), "../data/common-voice-mozilla/Common-Voice-Mozilla/wav-files/"