def test_switch(self):
    """Switch torchaudio to ``self.backend`` and verify the switch took effect.

    After calling ``torchaudio.set_audio_backend``, the backend reported by
    ``torchaudio.get_audio_backend`` must match ``self.backend`` (identity
    check when it is ``None``), and the top-level ``load``, ``save`` and
    ``info`` must compare equal to the ones on ``self.backend_module``.
    """
    torchaudio.set_audio_backend(self.backend)

    active_backend = torchaudio.get_audio_backend()
    if self.backend is not None:
        assert active_backend == self.backend
    else:
        assert active_backend is None

    # The module-level I/O entry points must be the backend's own functions.
    for func_name in ("load", "save", "info"):
        assert getattr(torchaudio, func_name) == getattr(self.backend_module, func_name)
def AudioBackendScope(new_backend):
    """Generator that runs one ``yield`` with ``new_backend`` active.

    The backend that was active on entry is remembered and re-applied in a
    ``finally`` clause, so it is restored even if the code running at the
    ``yield`` point raises.

    NOTE(review): presumably wrapped with ``contextlib.contextmanager`` where
    it is defined; the decorator is not visible here -- confirm.

    Args:
        new_backend: Backend name passed to ``torchaudio.set_audio_backend``.
    """
    saved_backend = torchaudio.get_audio_backend()
    try:
        torchaudio.set_audio_backend(new_backend)
        yield
    finally:
        # Always put the original backend back, whatever happened above.
        torchaudio.set_audio_backend(saved_backend)
def __getitem__(self, index):
    """Load and return the example at position ``index``.

    ``self.files`` and ``self.num_examples`` are walked in parallel; the
    index is reduced by each file's example count until it falls inside a
    file. When ``self.length`` is set, the example is the window of
    ``self.length`` frames starting at ``self.stride * index`` within that
    file, right-padded with zeros up to ``self.length``. Otherwise the whole
    file is loaded.

    Args:
        index: Position of the example across all files.

    Returns:
        The loaded audio tensor, or ``(tensor, file)`` when
        ``self.with_path`` is truthy.

    Raises:
        RuntimeError: If ``self.sample_rate`` is set and the file has a
            different sample rate.
        IndexError: If ``index`` is past the last example. Previously the
            method fell off the end of the loop and returned ``None``.
    """
    requested_index = index
    for (file, _), examples in zip(self.files, self.num_examples):
        if index >= examples:
            # Not in this file: skip over its examples and keep looking.
            index -= examples
            continue

        num_frames = 0
        offset = 0
        if self.length is not None:
            offset = self.stride * index
            num_frames = self.length

        if torchaudio.get_audio_backend() in ['soundfile', 'sox_io']:
            # Newer loading API: the offset keyword is ``frame_offset`` and
            # ``-1`` (instead of 0) is passed when no fixed length is set.
            out, sr = torchaudio.load(str(file),
                                      frame_offset=offset,
                                      num_frames=num_frames or -1)
        else:
            out, sr = torchaudio.load(str(file), offset=offset, num_frames=num_frames)

        if self.sample_rate is not None and sr != self.sample_rate:
            raise RuntimeError(
                f"Expected {file} to have sample rate of "
                f"{self.sample_rate}, but got {sr}")

        if num_frames:
            # Zero-pad the last dimension up to the requested length.
            out = F.pad(out, (0, num_frames - out.shape[-1]))

        if self.with_path:
            return out, file
        return out

    raise IndexError(f"index {requested_index} is out of range")
def _load_audio(self, path: str, start_time: float, end_time: float, sample_rate: int = 16000) -> [Tensor, int]:
    """Default load function used in TEDLIUM dataset, you can overwrite this function to customize
    functionality and load individual sentences from a full ted audio talk file.

    The start and end times are converted to frame indices using
    ``sample_rate``, and only that span is requested from ``torchaudio.load``.
    The name of the offset keyword depends on the active backend.

    Args:
        path (str): Path to audio file
        start_time (float): Time in seconds where the sample sentence starts
        end_time (float): Time in seconds where the sample sentence finishes
        sample_rate (int, optional): Rate used to convert the times to frame indices

    Returns:
        [Tensor, int]: Audio tensor representation and sample rate
    """
    first_frame = int(float(start_time) * sample_rate)
    last_frame = int(float(end_time) * sample_rate)

    backend = torchaudio.get_audio_backend()
    uses_legacy_api = backend == "sox" or (
        backend == "soundfile" and torchaudio.USE_SOUNDFILE_LEGACY_INTERFACE
    )
    # Legacy backends call the start position ``offset``; the others use
    # ``frame_offset``.
    offset_keyword = "offset" if uses_legacy_api else "frame_offset"

    load_kwargs = {
        offset_keyword: first_frame,
        "num_frames": last_frame - first_frame,
    }
    return torchaudio.load(path, **load_kwargs)
def __init__(self, root_dir, sr, duration=None, transform=None):
    """Index every ``.wav`` file found recursively under ``root_dir``.

    For each file (in sorted path order) the sample rate is recorded in
    ``self.rates`` and a running total is appended to ``self.offsets``, which
    starts at ``[0]``. With ``duration=None`` every file counts as one item;
    otherwise a file counts as
    ``floor(n_frames / rate / duration)`` items.

    Args:
        root_dir: Directory searched with the glob ``**/*.wav``.
        sr: Stored as ``self.sr``.
        duration: Segment duration used to split files, or ``None`` to treat
            each file as a single item.
        transform: Stored as ``self.transform``.

    Raises:
        RuntimeError: If ``duration`` is set and the active torchaudio
            backend is not one of ``sox``, ``sox_io`` or ``soundfile``.
            Previously this case left ``n_frames`` unbound (or stale from the
            previous file).
    """
    self.sr = sr
    self.duration = duration
    self.transform = transform
    self.offsets = [0]
    self.rates = []
    self.paths = sorted(pathlib.Path(root_dir).glob('**/*.wav'))

    for p in self.paths:
        si, _ = torchaudio.info(str(p))
        self.rates.append(si.rate)

        if self.duration is None:
            # No segmentation: one item per file.
            self.offsets.append(self.offsets[-1] + 1)
            continue

        backend = torchaudio.get_audio_backend()
        if backend in ('sox', 'sox_io'):
            # NOTE(review): presumably sox reports ``length`` summed over all
            # channels, hence the division -- confirm against the backend.
            n_frames = si.length // si.channels
        elif backend == 'soundfile':
            n_frames = si.length
        else:
            raise RuntimeError(
                f"Unsupported torchaudio backend {backend!r}; "
                "expected 'sox', 'sox_io' or 'soundfile'")

        n_segments = math.floor(n_frames / si.rate / self.duration)
        self.offsets.append(self.offsets[-1] + n_segments)
def read_audio(path: str, target_sr: int = 16000):
    """Load an audio file, average its channels and resample to ``target_sr``.

    The function asserts that the active torchaudio backend is
    ``'soundfile'``. If the loaded tensor has more than one entry along
    dimension 0, it is averaged over that dimension; if the file's sample
    rate differs from ``target_sr``, the signal is resampled with
    ``torchaudio.transforms.Resample``.

    Args:
        path: Path of the audio file.
        target_sr: Sample rate of the returned signal.

    Returns:
        The loaded tensor with dimension 0 squeezed out.
    """
    assert torchaudio.get_audio_backend() == 'soundfile'

    waveform, source_sr = torchaudio.load(path)

    has_several_channels = waveform.size(0) > 1
    if has_several_channels:
        waveform = waveform.mean(dim=0, keepdim=True)

    if source_sr != target_sr:
        resampler = torchaudio.transforms.Resample(orig_freq=source_sr,
                                                   new_freq=target_sr)
        waveform = resampler(waveform)
        source_sr = target_sr

    assert source_sr == target_sr
    return waveform.squeeze(0)
def load_info(path: str) -> dict:
    """Load audio metadata

    This is a backend-independent wrapper around ``torchaudio.info``. The
    deprecated ``"sox"`` backend is rejected before the file is queried.

    Args:
        path: Path of filename
    Returns:
        Dict: Metadata with `samplerate`, `samples`, `channels` and
            `duration` in seconds
    Raises:
        RuntimeError: If the active torchaudio backend is ``"sox"``.
    """
    if torchaudio.get_audio_backend() == "sox":
        raise RuntimeError("Deprecated backend is not supported")

    metadata = torchaudio.info(str(path))
    info = {
        "samplerate": metadata.sample_rate,
        # length of the file in samples
        "samples": metadata.num_frames,
        "channels": metadata.num_channels,
    }
    info["duration"] = info["samples"] / info["samplerate"]
    return info
def _load_audio(self, path: str, start_time: float, end_time: float, sample_rate: int = 16000) -> [Tensor, int]:
    """Default load function used in TEDLIUM dataset, you can overwrite this function to customize
    functionality and load individual sentences from a full ted audio talk file.

    The start and end times are converted to frame indices using
    ``sample_rate``. With the ``sox_io`` backend only that span is read from
    disk; with any other backend the whole file is loaded and the span is
    sliced out of the waveform.

    Args:
        path (str): Path to audio file
        start_time (float): Time in seconds where the sample sentence starts
        end_time (float): Time in seconds where the sample sentence finishes
        sample_rate (int, optional): Rate used to convert the times to frame indices

    Returns:
        [Tensor, int]: Audio tensor representation and sample rate
    """
    start_time = int(float(start_time) * sample_rate)
    end_time = int(float(end_time) * sample_rate)

    if torchaudio.get_audio_backend() == "sox_io":
        return torchaudio.load(path, frame_offset=start_time, num_frames=end_time - start_time)

    # ``torchaudio.load`` returns a ``(waveform, sample_rate)`` pair, so the
    # frame slice must be applied to the waveform, not to the pair itself
    # (indexing the tuple with ``[:, a:b]`` raises ``TypeError``).
    waveform, file_sample_rate = torchaudio.load(path)
    return waveform[:, start_time:end_time], file_sample_rate
# I/O Parameters parser.add_argument( "--seq-dur", type=float, default=5.0, help="Duration of <=0.0 will result in the full audio", ) parser.add_argument("--batch-size", type=int, default=16) args, _ = parser.parse_known_args() torchaudio.set_audio_backend(args.audio_backend) train_dataset, valid_dataset, args = load_datasets(parser, args) print("Audio Backend: ", torchaudio.get_audio_backend()) # Iterate over training dataset and compute statistics total_training_duration = 0 for k in tqdm.tqdm(range(len(train_dataset))): x, y = train_dataset[k] total_training_duration += x.shape[1] / train_dataset.sample_rate if args.save: torchaudio.save("test/" + str(k) + "x.wav", x.T, train_dataset.sample_rate) torchaudio.save("test/" + str(k) + "y.wav", y.T, train_dataset.sample_rate) print("Total training duration (h): ", total_training_duration / 3600) print("Number of train samples: ", len(train_dataset)) print("Number of validation samples: ", len(valid_dataset))
def create_csv(orig_tsv_file, csv_file, data_folder, accented_letters=False, language="en"):
    """
    Creates the csv file given a list of wav files.

    Each data row of the tsv file (the header is skipped) becomes one csv row
    with the columns ``ID``, ``duration``, ``wav``, ``spk_id`` and ``wrd``.
    Rows whose audio file is missing, or whose cleaned transcript has fewer
    than three space-separated words, are skipped.

    Arguments
    ---------
    orig_tsv_file : str
        Path to the Common Voice tsv file (standard file).
    csv_file : str
        Path of the csv file to create.
    data_folder : str
        Path of the CommonVoice dataset.
    accented_letters : bool, optional
        Defines if accented letters will be kept as individual letters or
        transformed to the closest non-accented letters.
    language : str, optional
        Language code selecting the text cleaning rules. Specific rules exist
        for "en", "fr", "it", "rw", "ar" and "ga-IE"; any other value only
        gets the generic normalization.

    Returns
    -------
    None

    Raises
    ------
    FileNotFoundError
        If ``orig_tsv_file`` does not exist.
    """

    # Check if the given files exists
    if not os.path.isfile(orig_tsv_file):
        msg = "\t%s doesn't exist, verify your dataset!" % (orig_tsv_file)
        logger.info(msg)
        raise FileNotFoundError(msg)

    # We load and skip the header. The file is read as UTF-8 (the csv below is
    # written as UTF-8 too) and the handle is closed as soon as it is read.
    with open(orig_tsv_file, "r", encoding="utf-8") as tsv_f:
        loaded_csv = tsv_f.readlines()[1:]
    nb_samples = str(len(loaded_csv))

    msg = "Preparing CSV files for %s samples ..." % (nb_samples)
    logger.info(msg)

    # Adding some Prints
    msg = "Creating csv lists in %s ..." % (csv_file)
    logger.info(msg)

    csv_lines = [["ID", "duration", "wav", "spk_id", "wrd"]]

    # Helpers for the Irish ("ga-IE") casing rule, defined once rather than on
    # every row.
    def pfxuc(a):
        # True for a lowercase "t"/"n" prefix followed by an uppercase vowel.
        return len(a) >= 2 and a[0] in "tn" and a[1] in "AEIOUÁÉÍÓÚ"

    def galc(w):
        # Lowercase the word, keeping the prefix separated by a hyphen.
        return w.lower() if not pfxuc(w) else w[0] + "-" + w[1:].lower()

    # Start processing lines
    total_duration = 0.0
    for line in tzip(loaded_csv):
        line = line[0]
        fields = line.split("\t")

        # Path is at indice 1 in Common Voice tsv files. And .mp3 files
        # are located in datasets/lang/clips/
        mp3_path = data_folder + "/clips/" + fields[1]
        file_name = mp3_path.split(".")[-2].split("/")[-1]
        spk_id = fields[0]
        snt_id = file_name

        # Setting torchaudio backend to sox-io (needed to read mp3 files)
        if torchaudio.get_audio_backend() != "sox_io":
            logger.warning("This recipe needs the sox-io backend of torchaudio")
            logger.warning("The torchaudio backend is changed to sox_io")
            torchaudio.set_audio_backend("sox_io")

        # Reading the signal (to retrieve duration in seconds)
        if not os.path.isfile(mp3_path):
            # Report the path that could not be found (the previous message
            # printed the length of the file name instead).
            msg = "\tError loading: %s" % (mp3_path)
            logger.info(msg)
            continue
        info = torchaudio.info(mp3_path)

        duration = info.num_frames / info.sample_rate
        total_duration += duration

        # Getting transcript
        words = fields[2]

        # Unicode Normalization
        words = unicode_normalisation(words)

        # !! Language specific cleaning !!
        # Important: feel free to specify the text normalization
        # corresponding to your alphabet.

        if language in ["en", "fr", "it", "rw"]:
            words = re.sub("[^’'A-Za-z0-9À-ÖØ-öø-ÿЀ-ӿéæœâçèàûî]+", " ", words).upper()

        if language == "fr":
            # Replace J'y D'hui etc by J_ D_hui
            words = words.replace("'", " ")
            words = words.replace("’", " ")
        elif language == "ar":
            HAMZA = "\u0621"
            ALEF_MADDA = "\u0622"
            ALEF_HAMZA_ABOVE = "\u0623"
            letters = ("ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ" + HAMZA +
                       ALEF_MADDA + ALEF_HAMZA_ABOVE)
            words = re.sub("[^" + letters + "]+", " ", words).upper()
        elif language == "ga-IE":
            # Irish lower() is complicated, but upper() is nondeterministic, so use lowercase
            words = re.sub("[^-A-Za-z'ÁÉÍÓÚáéíóú]+", " ", words)
            words = " ".join(map(galc, words.split(" ")))

        # Remove accents if specified
        if not accented_letters:
            words = strip_accents(words)
            words = words.replace("'", " ")
            words = words.replace("’", " ")

        # Remove multiple spaces
        words = re.sub(" +", " ", words)

        # Remove spaces at the beginning and the end of the sentence
        words = words.strip()

        # Remove too short sentences (or empty):
        if len(words.split(" ")) < 3:
            continue

        # Composition of the csv_line
        csv_line = [snt_id, str(duration), mp3_path, spk_id, str(words)]

        # Adding this line to the csv_lines list
        csv_lines.append(csv_line)

    # Writing the csv lines. ``newline=""`` lets the csv module control line
    # endings itself, as its documentation requires for file objects.
    with open(csv_file, mode="w", encoding="utf-8", newline="") as csv_f:
        csv_writer = csv.writer(csv_f,
                                delimiter=",",
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerows(csv_lines)

    # Final prints
    msg = "%s successfully created!" % (csv_file)
    logger.info(msg)
    msg = "Number of samples: %s " % (str(len(loaded_csv)))
    logger.info(msg)
    msg = "Total duration: %s Hours" % (str(round(total_duration / 3600, 2)))
    logger.info(msg)
import logging
import os
import sys
from typing import Iterable, List, Tuple

# NOTE: the third-party import order below is kept exactly as it was; see the
# OpenMP remark further down before reordering.
import numpy as np
import torch
from torch import nn
import torchaudio
from pathos.threading import ThreadPool
from torchaudio.transforms import MFCC, Resample
import torchlibrosa

logger = logging.getLogger()

# Use sox_io backend if available
if (
    torchaudio.get_audio_backend() != "sox_io"
    and "sox_io" in torchaudio.list_audio_backends()
):
    torchaudio.set_audio_backend("sox_io")
    logger.debug("Set audio backend to sox_io")

# Required because as of 0.7.2 on OSX, torchaudio links its own OpenMP runtime in addition to pytorch
# This tells OpenMP not to crash when this happens.
# (``sys`` and ``os`` are imported above; they are needed by this check.)
if sys.platform == "darwin":
    os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"


class AudioTooShortError(ValueError):
    # NOTE(review): presumably raised when an input clip is shorter than some
    # required length -- confirm at the raising call sites.
    pass
def __init__(self, backend):
    """Remember the requested backend and the one active right now.

    Nothing is switched here: ``backend`` is stored as ``self.new_backend``
    and the current result of ``torchaudio.get_audio_backend()`` is stored as
    ``self.previous_backend``.

    Args:
        backend: Backend to record as ``self.new_backend``.
    """
    self.new_backend = backend
    active_backend = torchaudio.get_audio_backend()
    self.previous_backend = active_backend