def get_file_length(filepath):
    """
    Returns the length of the sequence in the file specified by `filepath`
    """
    signal_info, encoding_info = torchaudio.info(filepath)
    return signal_info.length
def __getitem__(self, index):
    """
    :return: a speech segment and the corresponding speaker index
    """
    # Check the size of the file
    current_session = self.sessions.iloc[index]

    # TODO is this required ?
    nfo = torchaudio.info(
        f"{self.data_path}/{current_session['file_id']}{self.data_file_extension}")

    original_start = int(current_session['start'])
    if self.overlap > 0:
        lowest_shift = self.overlap / 2
        highest_shift = self.overlap / 2
        if original_start < (current_session['file_start'] * self.sample_rate
                             + self.sample_number / 2):
            lowest_shift = int(original_start -
                               current_session['file_start'] * self.sample_rate)
        if original_start + self.sample_number > (
                current_session['file_start'] +
                current_session['file_duration']
        ) * self.sample_rate - self.sample_number / 2:
            highest_shift = int(
                (current_session['file_start'] +
                 current_session['file_duration']) * self.sample_rate -
                (original_start + self.sample_number))
        start_frame = original_start + int(
            random.uniform(-lowest_shift, highest_shift))
    else:
        start_frame = original_start

    conversion_rate = nfo.sample_rate // self.sample_rate

    if start_frame + conversion_rate * self.sample_number >= nfo.num_frames:
        start_frame = numpy.min(nfo.num_frames -
                                conversion_rate * self.sample_number - 1)

    speech, speech_fs = torchaudio.load(
        f"{self.data_path}/{current_session['file_id']}{self.data_file_extension}",
        frame_offset=conversion_rate * start_frame,
        num_frames=conversion_rate * self.sample_number)

    if nfo.sample_rate != self.sample_rate:
        speech = torchaudio.transforms.Resample(
            nfo.sample_rate, self.sample_rate).forward(speech)

    speech += 10e-6 * torch.randn(speech.shape)

    if len(self.transform) > 0:
        speech = data_augmentation(speech,
                                   self.sample_rate,
                                   self.transform,
                                   self.transform_number,
                                   noise_df=self.noise_df,
                                   rir_df=self.rir_df)

    speaker_idx = current_session["speaker_idx"]

    if self.output_format == "pytorch":
        return speech, torch.tensor(speaker_idx)
    else:
        return speech, speaker_idx
def __init__(self, root_dir=''):
    super(Aff2CompDataset, self).__init__()
    self.video_dir = root_dir
    self.extracted_dir = os.path.join(self.video_dir, 'extracted')

    self.clip_len = 8
    self.input_size = (112, 112)
    self.dilation = 6
    self.label_frame = self.clip_len * self.dilation

    # audio params
    self.window_size = 20e-3
    self.window_stride = 10e-3
    self.sample_rate = 44100
    num_fft = 2 ** math.ceil(math.log2(self.window_size * self.sample_rate))
    window_fn = torch.hann_window

    self.sample_len_secs = 10
    self.sample_len_frames = self.sample_len_secs * self.sample_rate
    self.audio_shift_sec = 5
    self.audio_shift_samples = self.audio_shift_sec * self.sample_rate

    # transforms
    self.audio_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=self.sample_rate,
        n_mels=64,
        n_fft=num_fft,
        win_length=int(self.window_size * self.sample_rate),
        hop_length=int(self.window_stride * self.sample_rate),
        window_fn=window_fn)

    self.audio_spec_transform = ComposeWithInvert(
        [AmpToDB(), Normalize(mean=[-14.8], std=[19.895])])
    self.clip_transform = ComposeWithInvert([
        NumpyToTensor(),
        Normalize(mean=[0.43216, 0.394666, 0.37645, 0.5],
                  std=[0.22803, 0.22145, 0.216989, 0.225])
    ])

    all_videos = find_all_video_files(self.video_dir)
    self.cached_metadata_path = os.path.join(self.video_dir, 'dataset.pkl')

    if not os.path.isfile(self.cached_metadata_path):
        print('creating cached_metadata... ')
        self.image_path = []  # paths relative to self.extracted_dir
        self.video_id = []
        self.frame_id = []
        self.label_au = []
        self.label_ex = []
        self.label_va = []
        self.train_ids = []
        self.val_ids = []
        self.test_ids = []
        self.features = []
        self.feature_names = []
        self.time_stamps = []
        self.mask_available = False
        self.video_db_nr = []
        video_db_nr = 0
        for video in tqdm(all_videos):
            meta = Video(video).meta
            meta['filename'] = get_filename(video)
            meta['path'] = get_path(video)
            meta['extension'] = get_extension(video)
            num_frames_video = meta['num_frames']
            audio_file = os.path.splitext(video)[0] + '.wav'
            si, ei = torchaudio.info(audio_file)
            assert si.rate == 44100

            video_ts_file = os.path.join(meta['path'],
                                         meta['filename'] + '_video_ts.txt')
            if os.path.isfile(video_ts_file):
                pass
            else:
                mkvfile = os.path.join(meta['path'], 'temp.mkv')
                videofile = os.path.join(meta['path'],
                                         meta['filename'] + meta['extension'])
                command = 'mkvmerge -o ' + mkvfile + ' ' + videofile
                subprocess.call(command, shell=True)
                command = 'mkvextract ' + mkvfile + ' timestamps_v2 0:' + video_ts_file
                subprocess.call(command, shell=True)
                os.remove(mkvfile)

            with open(video_ts_file, 'r') as f:
                time_stamps = np.genfromtxt(f)[:num_frames_video]
                # os.remove(video_ts_file)

            self.mask_available = True
            extracted_dir = os.path.join(self.extracted_dir, meta['filename'])

            splits = []
            if 'AU' in meta:
                au_split = meta['AU']
                splits.append(au_split)
            if 'EX' in meta:
                ex_split = meta['EX']
                splits.append(ex_split)
            if 'VA' in meta:
                va_split = meta['VA']
                splits.append(va_split)
            splits = list(set(splits))  # UPDATED 03.06.2020 (was missing)

            for split in splits:
                self.time_stamps.append(time_stamps)
                for image_filename in sorted(os.listdir(extracted_dir)):
                    if os.path.isdir(os.path.join(extracted_dir, image_filename)):
                        continue
                    # path relative to self.extracted_dir
                    self.image_path.append(
                        os.path.relpath(
                            os.path.join(extracted_dir, image_filename),
                            self.extracted_dir))
                    self.video_id.append(meta['filename'])
                    self.video_db_nr.append(
                        video_db_nr
                    )  # UPDATED 03.06.2020 (avoids using frames from neighbour videos)
                    frame_id = int(os.path.splitext(image_filename)[0])
                    self.frame_id.append(frame_id)
                    # add your own label loading here if you want to use this for training
                    self.label_au.append(None)
                    self.label_ex.append(None)
                    self.label_va.append(None)
                    self.train_ids.append(1 if split == 'train' else 0)
                    self.val_ids.append(1 if split == 'val' else 0)
                    self.test_ids.append(1 if split == 'test' else 0)
            video_db_nr += 1

        self.frame_id = np.stack(self.frame_id)
        self.label_au = np.stack(self.label_au)
        self.label_ex = np.stack(self.label_ex)
        self.label_va = np.stack(self.label_va)
        self.train_ids = np.stack(self.train_ids)
        self.val_ids = np.stack(self.val_ids)
        self.test_ids = np.stack(self.test_ids)
        self.time_stamps = np.hstack(self.time_stamps)
        with open(self.cached_metadata_path, 'wb') as f:
            pickle.dump(
                {
                    'frame_id': self.frame_id,
                    'label_au': self.label_au,
                    'label_ex': self.label_ex,
                    'label_va': self.label_va,
                    'video_id': self.video_id,
                    'image_path': self.image_path,
                    'train_ids': self.train_ids,
                    'val_ids': self.val_ids,
                    'test_ids': self.test_ids,
                    'time_stamps': self.time_stamps,
                    'mask_available': self.mask_available,
                    'video_db_nr': self.video_db_nr
                }, f)
    else:
        with open(self.cached_metadata_path, 'rb') as f:
            meta = pickle.load(f)
        self.frame_id = meta['frame_id']
        self.label_au = meta['label_au']
        self.label_ex = meta['label_ex']
        self.label_va = meta['label_va']
        self.video_id = meta['video_id']
        self.image_path = meta['image_path']
        self.train_ids = meta['train_ids']
        self.val_ids = meta['val_ids']
        self.time_stamps = meta['time_stamps']
        self.mask_available = meta['mask_available']
        self.test_ids = meta['test_ids']
        self.video_db_nr = meta['video_db_nr']

    self.validation_video_ids()
    self.test_video_ids()

    self.use_mask = self.mask_available
def py_info_func(filepath: str) -> torch.classes.torchaudio.SignalInfo:
    return torchaudio.info(filepath)
def inspect_file(path):
    print("-" * 10)
    print("Source:", path)
    print("-" * 10)
    print(f" - File size: {os.path.getsize(path)} bytes")
    print_metadata(torchaudio.info(path))
def py_info_func(
        filepath: str) -> torchaudio.backend.sox_io_backend.AudioMetaData:
    return torchaudio.info(filepath)
def d(ID):
    info = torchaudio.info(
        f'/work3/s164419/01005WakeWordData/lectures/{ID}.wav')
    return info.num_frames / info.sample_rate
def prepare_librispeech(
        corpus_dir: Pathlike,
        dataset_parts: Optional[Tuple[str]] = dataset_parts_mini,
        output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param dataset_parts: dataset part name, e.g. 'train-clean-100', 'train-clean-5', 'dev-clean'
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    manifests = defaultdict(dict)
    for part in dataset_parts:
        # Generate a mapping: utt_id -> (audio_path, audio_info, text)
        metadata = {}
        part_path = corpus_dir / part
        for trans_path in part_path.rglob('*.txt'):
            with open(trans_path) as f:
                for line in f:
                    idx, text = line.split(maxsplit=1)
                    audio_path = part_path / Path(idx.replace(
                        '-', '/')).parent / f'{idx}.flac'
                    if audio_path.is_file():
                        # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ... )
                        # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...)
                        info = torchaudio.info(str(audio_path))
                        metadata[idx] = LibriSpeechMetaData(
                            audio_path=audio_path, audio_info=info[0], text=text)
                    else:
                        logging.warning(f'No such file: {audio_path}')

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(id=idx,
                      sources=[
                          AudioSource(type='file',
                                      channels=[0],
                                      source=str(metadata[idx].audio_path))
                      ],
                      sampling_rate=int(metadata[idx].audio_info.rate),
                      num_samples=metadata[idx].audio_info.length,
                      duration=(metadata[idx].audio_info.length /
                                metadata[idx].audio_info.rate))
            for idx in metadata)

        # Supervision
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(id=idx,
                               recording_id=idx,
                               start=0.0,
                               duration=audio.recordings[idx].duration,
                               channel=0,
                               language='English',
                               speaker=re.sub(r'-.*', r'', idx),
                               text=metadata[idx].text.strip())
            for idx in audio.recordings)

        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{part}.json')
            audio.to_json(output_dir / f'recordings_{part}.json')

        manifests[part] = {'recordings': audio, 'supervisions': supervision}

    return manifests
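A minimal usage sketch for the recipe above, not taken from the original source: the corpus path is hypothetical and 'dev-clean' is one of the parts named in the docstring; the returned dict uses the 'recordings'/'supervisions' keys that the function actually writes.

# Hedged usage sketch; the corpus path is a placeholder.
manifests = prepare_librispeech(
    corpus_dir='/data/LibriSpeech',
    dataset_parts=('dev-clean',),
    output_dir='librispeech_manifests',
)
dev = manifests['dev-clean']
recordings, supervisions = dev['recordings'], dev['supervisions']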
def get_file_info(p):
    if isinstance(p, Path):
        p = p.as_posix()
    signal_info, _ = torchaudio.info(p)
    return signal_info
def test_get_info(self):
    input_path = os.path.join(self.test_dirpath, 'assets', 'sinewave.wav')
    info_expected = (1, 64000, 16000, 32)
    info_load = torchaudio.info(input_path)
    self.assertEqual(info_load, info_expected)
if args.segments is None:
    audio_list = load_wav_scp(args.wav_scp)
else:
    audio_list = load_wav_segments(args.wav_scp, args.segments)

count = 0
with open(args.out_ark, 'wb') as ark_fout, \
        open(args.out_scp, 'w', encoding='utf8') as scp_fout:
    for item in audio_list:
        if len(item) == 2:
            key, wav_path = item
            waveform, sample_rate = torchaudio.load_wav(wav_path)
        else:
            assert len(item) == 4
            key, wav_path, start, end = item
            sample_rate = torchaudio.info(wav_path).sample_rate
            frame_offset = int(start * sample_rate)
            num_frames = int((end - start) * sample_rate)
            waveform, sample_rate = torchaudio.load_wav(
                wav_path, frame_offset, num_frames)

        mat = kaldi.fbank(waveform,
                          num_mel_bins=args.num_mel_bins,
                          frame_length=args.frame_length,
                          frame_shift=args.frame_shift,
                          dither=args.dither,
                          energy_floor=0.0,
                          sample_frequency=sample_rate)
        mat = mat.detach().numpy()
        kaldi_io.write_ark_scp(key, mat, ark_fout, scp_fout)
        count += 1
def prepare_ami(
        data_dir: Pathlike,
        output_dir: Optional[Pathlike] = None,
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param data_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is ('train', 'dev', 'eval'), and the value is Dicts with keys 'audio' and 'supervisions'.
    """
    data_dir = Path(data_dir)
    assert data_dir.is_dir(), f'No such directory: {data_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    anotation_lists = parse_ami_annotations(data_dir / 'annotations.gzip')
    # Create a mapping from a tuple of (session_id, channel) to the list of annotations.
    # This way we can map the supervisions to the right channels in a multi-channel recording.
    annotation_by_id_and_channel = {
        (filename.split('.')[0], int(filename[-5])): annotations
        for filename, annotations in anotation_lists.items()
    }
    wav_dir = data_dir / 'wav_db'
    audio_paths = wav_dir.rglob('*.wav')
    # Group together multiple channels from the same session.
    # We will use that to create a Recording with multiple sources (channels).
    from cytoolz import groupby
    channel_wavs = groupby(lambda p: p.parts[-3], audio_paths)

    manifests = defaultdict(dict)

    for part in dataset_parts:
        # Audio
        recordings = []
        for session_name, channel_paths in channel_wavs.items():
            if session_name not in dataset_parts[part]:
                continue
            audio_info = torchaudio.info(str(channel_paths[0]))[0]
            recordings.append(Recording(
                id=session_name,
                sources=[
                    AudioSource(
                        type='file',
                        channels=[idx],
                        source=str(audio_path)
                    )
                    for idx, audio_path in enumerate(sorted(channel_paths))
                ],
                sampling_rate=int(audio_info.rate),
                num_samples=audio_info.length,
                duration=audio_info.length / audio_info.rate,
            ))
        audio = RecordingSet.from_recordings(recordings)

        # Supervisions
        segments_by_pause = []
        for recording in audio:
            for source in recording.sources:
                # In AMI "source.channels" will always be a one-element list
                channel, = source.channels
                anotation = annotation_by_id_and_channel.get(
                    (recording.id, channel))
                if anotation is None:
                    logging.warning(
                        f'No annotation found for recording "{recording.id}" channel {channel} '
                        f'(file {source.source})')
                    continue
                for seg_idx, seg_info in enumerate(anotation):
                    for subseg_idx, subseg_info in enumerate(seg_info):
                        duration = subseg_info.end_time - subseg_info.begin_time
                        if duration > 0:
                            segments_by_pause.append(SupervisionSegment(
                                id=f'{recording.id}-{seg_idx}-{subseg_idx}',
                                recording_id=recording.id,
                                start=subseg_info.begin_time,
                                duration=duration,
                                channel=channel,
                                language='English',
                                speaker=subseg_info.speaker,
                                gender=subseg_info.gender,
                                text=subseg_info.text
                            ))
        supervision = SupervisionSet.from_segments(segments_by_pause)

        if output_dir is not None:
            audio.to_json(output_dir / f'recordings_{part}.json')
            supervision.to_json(output_dir / f'supervisions_{part}.json')

        manifests[part] = {
            'recordings': audio,
            'supervisions': supervision
        }

    return manifests
def duration(self):
    if self._sample_rate is not None:
        return len(self.data_signal) / self.sample_rate
    else:
        si, ei = torchaudio.info(str(self.path))
        return si.length / si.rate
def sample_rate(self):
    if not hasattr(self, '_sample_rate') or self._sample_rate is None:
        # Gets metadata from an audio file without loading the signal.
        si, ei = torchaudio.info(str(self.path))
        self._sample_rate = si.rate
    return self._sample_rate
def prepare_ljspeech(
        corpus_dir: Pathlike,
        output_dir: Optional[Pathlike] = None
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: The RecordingSet and SupervisionSet with the keys 'audio' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    # Generate a mapping: utt_id -> (audio_path, audio_info, text)
    metadata_csv_path = corpus_dir / 'metadata.csv'
    assert metadata_csv_path.is_file(), f'No such file: {metadata_csv_path}'
    metadata = {}
    with open(metadata_csv_path) as f:
        for line in f:
            idx, text, _ = line.split('|')
            audio_path = corpus_dir / 'wavs' / f'{idx}.wav'
            if audio_path.is_file():
                # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ... )
                # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...)
                info = torchaudio.info(str(audio_path))
                metadata[idx] = LJSpeechMetaData(audio_path=audio_path,
                                                 audio_info=info[0],
                                                 text=text)
            else:
                logging.warning(f'No such file: {audio_path}')

    # Audio
    audio = RecordingSet.from_recordings(
        Recording(
            id=idx,
            sources=[
                AudioSource(
                    type='file',
                    channels=[0],
                    source=str(metadata[idx].audio_path)
                )
            ],
            sampling_rate=int(metadata[idx].audio_info.rate),
            num_samples=metadata[idx].audio_info.length,
            duration=(metadata[idx].audio_info.length /
                      metadata[idx].audio_info.rate)
        )
        for idx in metadata
    )

    # Supervision
    supervision = SupervisionSet.from_segments(
        SupervisionSegment(
            id=idx,
            recording_id=idx,
            start=0.0,
            duration=audio.recordings[idx].duration,
            channel=0,
            language='English',
            gender='female',
            text=metadata[idx].text
        )
        for idx in audio.recordings
    )

    if output_dir is not None:
        supervision.to_json(output_dir / 'supervisions.json')
        audio.to_json(output_dir / 'audio.json')

    return {'audio': audio, 'supervisions': supervision}
def duration(self):
    if (self.sig is not None):
        return self.nsamples / self.sr
    else:
        si, ei = torchaudio.info(str(self.path))
        return si.length / si.rate
def prepare_mobvoihotwords(
        corpus_dir: Pathlike,
        output_dir: Optional[Pathlike] = None
) -> Dict[str, Dict[str, Union[RecordingSet, SupervisionSet]]]:
    """
    Returns the manifests which consist of the Recordings and Supervisions

    :param corpus_dir: Pathlike, the path of the data dir.
    :param output_dir: Pathlike, the path where to write the manifests.
    :return: a Dict whose key is the dataset part, and the value is Dicts with the keys 'audio' and 'supervisions'.
    """
    corpus_dir = Path(corpus_dir)
    assert corpus_dir.is_dir(), f'No such directory: {corpus_dir}'
    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

    manifests = defaultdict(dict)
    dataset_parts = ['train', 'dev', 'test']
    for part in dataset_parts:
        # Generate a mapping: utt_id -> (audio_path, audio_info, speaker, text)
        metadata = {}
        for prefix in ['p_', 'n_']:
            prefixed_part = prefix + part
            json_path = corpus_dir / 'mobvoi_hotword_dataset_resources' / f'{prefixed_part}.json'
            with open(json_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)
                for entry in json_data:
                    idx = entry['utt_id']
                    speaker = idx if entry['speaker_id'] is None else entry['speaker_id']
                    audio_path = corpus_dir / 'mobvoi_hotword_dataset' / f'{idx}.wav'
                    text = 'FREETEXT'
                    if entry['keyword_id'] == 0:
                        text = 'HiXiaowen'
                    elif entry['keyword_id'] == 1:
                        text = 'NihaoWenwen'
                    else:
                        assert entry['keyword_id'] == -1
                    if audio_path.is_file():
                        # info[0]: info of the raw audio (e.g. channel number, sample rate, duration ... )
                        # info[1]: info about the encoding (e.g. FLAC/ALAW/ULAW ...)
                        info = torchaudio.info(str(audio_path))
                        metadata[idx] = MobvoiHotwordsMetaData(
                            audio_path=audio_path,
                            audio_info=info[0],
                            speaker=speaker,
                            text=text
                        )
                    else:
                        logging.warning(f'No such file: {audio_path}')

        # Audio
        audio = RecordingSet.from_recordings(
            Recording(
                id=idx,
                sources=[
                    AudioSource(
                        type='file',
                        channels=[0],
                        source=str(metadata[idx].audio_path)
                    )
                ],
                sampling_rate=int(metadata[idx].audio_info.rate),
                num_samples=metadata[idx].audio_info.length,
                duration=(metadata[idx].audio_info.length /
                          metadata[idx].audio_info.rate)
            )
            for idx in metadata
        )

        # Supervision
        supervision = SupervisionSet.from_segments(
            SupervisionSegment(
                id=idx,
                recording_id=idx,
                start=0.0,
                duration=audio.recordings[idx].duration,
                channel=0,
                language='Chinese',
                speaker=metadata[idx].speaker,
                text=metadata[idx].text.strip()
            )
            for idx in audio.recordings
        )

        if output_dir is not None:
            supervision.to_json(output_dir / f'supervisions_{part}.json')
            audio.to_json(output_dir / f'recordings_{part}.json')

        manifests[part] = {
            'recordings': audio,
            'supervisions': supervision
        }

    return manifests
######################################################################
# Audio I/O
# =========
#
# torchaudio integrates ``libsox`` and provides a rich set of audio I/O.
#

######################################################################
# Querying audio metadata
# -----------------------
#
# The ``torchaudio.info`` function fetches the metadata of an audio file.
# You can provide a path-like object or a file-like object.
#

metadata = torchaudio.info(SAMPLE_WAV_PATH)
print_metadata(metadata, src=SAMPLE_WAV_PATH)

######################################################################
# Where
#
# - ``sample_rate`` is the sampling rate of the audio
# - ``num_channels`` is the number of channels
# - ``num_frames`` is the number of frames per channel
# - ``bits_per_sample`` is the bit depth
# - ``encoding`` is the sample coding format
#
# ``encoding`` can take one of the following values:
#
# - ``"PCM_S"``: Signed integer linear PCM
# - ``"PCM_U"``: Unsigned integer linear PCM
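######################################################################
# As an illustrative sketch (not part of the original tutorial), the
# fields listed above can also be read directly from the returned
# metadata object; ``SAMPLE_WAV_PATH`` is the same sample file queried
# above.
#

metadata = torchaudio.info(SAMPLE_WAV_PATH)
print(metadata.sample_rate)
print(metadata.num_channels)
print(metadata.num_frames)
print(metadata.bits_per_sample)
print(metadata.encoding)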
def extractLength(couple):
    speaker, locPath = couple
    info = torchaudio.info(str(locPath))[0]
    return info.length
def create_csv(
    orig_tsv_file, csv_file, data_folder, accented_letters=False, language="en",
):
    """
    Creates the csv file given the original Common Voice tsv file.

    Arguments
    ---------
    orig_tsv_file : str
        Path to the Common Voice tsv file (standard file).
    csv_file : str
        Path of the output csv file.
    data_folder : str
        Path of the CommonVoice dataset.
    accented_letters : bool, optional
        Defines if accented letters will be kept as individual letters or
        transformed to the closest non-accented letters.
    language : str, optional
        Language code of the dataset (e.g. "en", "fr", "it", "rw", "ar", "ga-IE").

    Returns
    -------
    None
    """

    # Check if the given file exists
    if not os.path.isfile(orig_tsv_file):
        msg = "\t%s doesn't exist, verify your dataset!" % (orig_tsv_file)
        logger.info(msg)
        raise FileNotFoundError(msg)

    # We load and skip the header
    loaded_csv = open(orig_tsv_file, "r").readlines()[1:]
    nb_samples = str(len(loaded_csv))

    msg = "Preparing CSV files for %s samples ..." % (str(nb_samples))
    logger.info(msg)

    # Adding some prints
    msg = "Creating csv lists in %s ..." % (csv_file)
    logger.info(msg)

    csv_lines = [["ID", "duration", "wav", "spk_id", "wrd"]]

    # Start processing lines
    total_duration = 0.0
    for line in tzip(loaded_csv):
        line = line[0]

        # Path is at index 1 in Common Voice tsv files. And .mp3 files
        # are located in datasets/lang/clips/
        mp3_path = data_folder + "/clips/" + line.split("\t")[1]
        file_name = mp3_path.split(".")[-2].split("/")[-1]
        spk_id = line.split("\t")[0]
        snt_id = file_name

        # Reading the signal (to retrieve duration in seconds)
        if os.path.isfile(mp3_path):
            info = torchaudio.info(mp3_path)
        else:
            msg = "\tError loading: %s" % (str(len(file_name)))
            logger.info(msg)
            continue

        duration = info.num_frames / info.sample_rate
        total_duration += duration

        # Getting transcript
        words = line.split("\t")[2]

        # !! Language specific cleaning !!
        # Important: feel free to specify the text normalization
        # corresponding to your alphabet.
        if language in ["en", "fr", "it", "rw"]:
            words = re.sub("[^'A-Za-z0-9À-ÖØ-öø-ÿЀ-ӿ]+", " ", words).upper()
        elif language == "ar":
            HAMZA = "\u0621"
            ALEF_MADDA = "\u0622"
            ALEF_HAMZA_ABOVE = "\u0623"
            letters = ("ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإئ" + HAMZA +
                       ALEF_MADDA + ALEF_HAMZA_ABOVE)
            words = re.sub("[^" + letters + "]+", " ", words).upper()
        elif language == "ga-IE":
            # Irish lower() is complicated, but upper() is nondeterministic, so use lowercase
            def pfxuc(a):
                return len(a) >= 2 and a[0] in "tn" and a[1] in "AEIOUÁÉÍÓÚ"

            def galc(w):
                return w.lower() if not pfxuc(w) else w[0] + "-" + w[1:].lower()

            words = re.sub("[^-A-Za-z'ÁÉÍÓÚáéíóú]+", " ", words)
            words = " ".join(map(galc, words.split(" ")))

        # Remove accents if specified
        if not accented_letters:
            nfkd_form = unicodedata.normalize("NFKD", words)
            words = "".join(
                [c for c in nfkd_form if not unicodedata.combining(c)])
            words = words.replace("'", " ")

        # Remove multiple spaces
        words = re.sub(" +", " ", words)

        # Remove spaces at the beginning and the end of the sentence
        words = words.lstrip().rstrip()

        # Getting chars
        chars = words.replace(" ", "_")
        chars = " ".join([char for char in chars][:])

        # Remove too short sentences (or empty):
        if len(words) < 3:
            continue

        # Composition of the csv_line
        csv_line = [snt_id, str(duration), mp3_path, spk_id, str(words)]

        # Adding this line to the csv_lines list
        csv_lines.append(csv_line)

    # Writing the csv lines
    with open(csv_file, mode="w", encoding="utf-8") as csv_f:
        csv_writer = csv.writer(csv_f,
                                delimiter=",",
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        for line in csv_lines:
            csv_writer.writerow(line)

    # Final prints
    msg = "%s successfully created!" % (csv_file)
    logger.info(msg)
    msg = "Number of samples: %s " % (str(len(loaded_csv)))
    logger.info(msg)
    msg = "Total duration: %s Hours" % (str(round(total_duration / 3600, 2)))
    logger.info(msg)
def extractLength(couple):
    speaker, locPath = couple
    info = torchaudio.info(str(locPath))
    return info.num_frames
def calc_waveform_length(path: str, sample_rate: int) -> int:
    info, _ = ta.info(path)
    return math.ceil(info.length * sample_rate / info.rate / info.channels)
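The helper above relies on the legacy interface, where `torchaudio.info` returns a `(signal_info, encoding_info)` pair and `info.length` counts samples across all channels. As a hedged sketch only, assuming a newer torchaudio (>= 0.8) where `torchaudio.info` returns an `AudioMetaData` with per-channel `num_frames`, the equivalent computation could look like this:

import math
import torchaudio


def calc_waveform_length_new_api(path: str, sample_rate: int) -> int:
    # Sketch under the assumption above: num_frames is already per channel,
    # so no division by the channel count is needed.
    md = torchaudio.info(path)
    return math.ceil(md.num_frames * sample_rate / md.sample_rate)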
def create_csv(wav_list, csv_file):
    """
    Creates the csv file given a list of wav files.

    Arguments
    ---------
    wav_list : list of str
        The list of wav files.
    csv_file : str
        The path of the output csv file.
    """

    # Adding some prints
    msg = f"Creating csv lists in {csv_file} ..."
    logger.info(msg)

    csv_lines = []

    # Start processing lines
    total_duration = 0.0

    # Starting index
    idx = 0

    for wav_file in tzip(wav_list):
        wav_file = wav_file[0]

        path_parts = wav_file.split(os.path.sep)
        file_name, wav_format = os.path.splitext(path_parts[-1])

        # Peeking at the signal (to retrieve duration in seconds)
        if os.path.isfile(wav_file):
            info = torchaudio.info(wav_file)
        else:
            msg = "\tError loading: %s" % (str(len(file_name)))
            logger.info(msg)
            continue

        audio_duration = info.num_frames / info.sample_rate
        total_duration += audio_duration

        # Actual name of the language
        language = path_parts[-4]

        # Create a row with whole utterances
        csv_line = [
            idx,  # ID
            wav_file,  # File name
            wav_format,  # File format
            str(info.num_frames / info.sample_rate),  # Duration (sec)
            language,  # Language
        ]

        # Adding this line to the csv_lines list
        csv_lines.append(csv_line)

        # Increment index
        idx += 1

    # CSV column titles
    csv_header = ["ID", "wav", "wav_format", "duration", "language"]

    # Add titles to the list at index 0
    csv_lines.insert(0, csv_header)

    # Writing the csv lines
    with open(csv_file, mode="w", encoding="utf-8") as csv_f:
        csv_writer = csv.writer(csv_f,
                                delimiter=",",
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        for line in csv_lines:
            csv_writer.writerow(line)

    # Final prints
    msg = f"{csv_file} successfully created!"
    logger.info(msg)
    msg = f"Number of samples: {len(wav_list)}."
    logger.info(msg)
    msg = f"Total duration: {round(total_duration / 3600, 2)} hours."
    logger.info(msg)
def test_save(self):
    # load signal
    x, sr = load(self.test_filepath)

    # check save
    new_filepath = os.path.join(self.test_dirpath, "test.wav")
    save(new_filepath, x, sr)
    self.assertTrue(os.path.isfile(new_filepath))
    os.unlink(new_filepath)

    # check automatic normalization
    x /= 1 << 31
    save(new_filepath, x, sr)
    self.assertTrue(os.path.isfile(new_filepath))
    os.unlink(new_filepath)

    # test save 1d tensor
    x = x[:, 0]  # get mono signal
    x.squeeze_()  # remove channel dim
    save(new_filepath, x, sr)
    self.assertTrue(os.path.isfile(new_filepath))
    os.unlink(new_filepath)

    # don't allow invalid sizes as inputs
    with self.assertRaises(ValueError):
        x.unsqueeze_(0)  # N x L not L x N
        save(new_filepath, x, sr)

    with self.assertRaises(ValueError):
        x.squeeze_()
        x.unsqueeze_(1)
        x.unsqueeze_(0)  # 1 x L x 1
        save(new_filepath, x, sr)

    # automatically convert sr from floating point to int
    x.squeeze_(0)
    save(new_filepath, x, float(sr))
    self.assertTrue(os.path.isfile(new_filepath))
    os.unlink(new_filepath)

    # don't save to folders that don't exist
    with self.assertRaises(OSError):
        new_filepath = os.path.join(self.test_dirpath, "no-path", "test.wav")
        save(new_filepath, x, sr)

    # save created file
    sinewave_filepath = os.path.join(self.test_dirpath, "assets",
                                     "sinewave.wav")
    sr = 16000
    freq = 440
    volume = 0.3

    y = (torch.cos(2 * math.pi * torch.arange(0, 4 * sr).float() * freq / sr))
    y.unsqueeze_(1)
    # y is between -1 and 1, so must scale
    y = (y * volume * 2**31).long()
    save(sinewave_filepath, y, sr)
    self.assertTrue(os.path.isfile(sinewave_filepath))

    # test precision
    new_filepath = os.path.join(self.test_dirpath, "test.wav")
    si, ei = torchaudio.info(sinewave_filepath)
    save(new_filepath, y, sr, precision=16)
    si16, ei16 = torchaudio.info(new_filepath)
    self.assertEqual(si.precision, 32)
    self.assertEqual(si16.precision, 16)
    os.unlink(new_filepath)
def __getitem__(self, index) -> AudioAndLabels:
    track, audio_paths, tsv_path = self.file_list[index]

    audio = None
    if index < self.max_files_in_memory:
        audio = self.audios[index]

        # The first time, the audio needs to be loaded into memory
        if audio is None:
            audio = load_audio(audio_paths, normalize=False)
            self.audios[index] = audio

    labels: Labels = self.labels[index]
    # The first time, the labels need to be loaded into memory
    if labels is None:
        labels = self.load_labels(audio_paths, tsv_path)
        self.labels[index] = labels

    audio_length = torchaudio.info(audio_paths[0]).num_frames

    start_frame = None
    end_frame = None
    if self.sequence_length is not None:
        possible_start_interval = audio_length - self.sequence_length
        if self.reproducable_load_sequences:
            step_begin = (
                int(hashlib.sha256("".join(audio_paths).encode("utf-8")).hexdigest(), 16)
                % possible_start_interval
            )
        else:
            step_begin = self.random.randint(possible_start_interval)
        step_begin //= HOP_LENGTH

        n_steps = self.sequence_length // HOP_LENGTH
        step_end = step_begin + n_steps

        begin = step_begin * HOP_LENGTH
        end = begin + self.sequence_length
        num_frames = end - begin

        if audio is None:
            audio = load_audio(audio_paths, frame_offset=begin, num_frames=num_frames, normalize=False).to(
                self.device
            )
        else:
            audio = audio[begin:end].to(self.device)

        label = labels.label[step_begin:step_end, :].to(self.device)
        velocity = labels.velocity[step_begin:step_end, :].to(self.device)
        start_frame = begin
        end_frame = end
    else:
        if audio is None:
            audio = load_audio(audio_paths, normalize=False).to(self.device)
        else:
            audio = audio.to(self.device)
        label = labels.label.to(self.device)
        velocity = labels.velocity.to(self.device).float()
        start_frame = 0
        end_frame = audio_length

    onset = (label == 4).float()
    frame = (label > 1).float()
    offset = ((label == 1) + (label == 2)).float()
    velocity = velocity.float().div_(128.0)

    return AudioAndLabels(
        track=track,
        start_time=start_frame / SAMPLE_RATE,
        end_time=end_frame / SAMPLE_RATE,
        audio=audio,
        annotation=MusicAnnotation(onset=onset, offset=offset, frame=frame, velocity=velocity),
    )
def audio_pipeline(
    mix_wav,
):  # mix_wav is a dummy argument: one epoch will be the same as without dynamic mixing
    """
    This audio pipeline defines the compute graph for dynamic mixing
    """

    speakers = np.random.choice(spk_list,
                                hparams["num_spks"],
                                replace=False,
                                p=spk_weights)

    if hparams["use_wham_noise"]:
        noise_file = np.random.choice(noise_files, 1, replace=False)
        noise, fs_read = torchaudio.load(noise_file[0])
        noise = noise.squeeze()

    # select two speakers randomly
    sources = []
    spk_files = [
        np.random.choice(spk_hashtable[spk], 1, False)[0] for spk in speakers
    ]

    minlen = min(
        *[torchaudio.info(x).num_frames for x in spk_files],
        hparams["training_signal_len"],
    )

    meter = pyloudnorm.Meter(hparams["sample_rate"])

    MAX_AMP = 0.9
    MIN_LOUDNESS = -33
    MAX_LOUDNESS = -25

    def normalize(signal, is_noise=False):
        """
        This function normalizes the audio signals for loudness
        """
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            c_loudness = meter.integrated_loudness(signal)
            if is_noise:
                target_loudness = random.uniform(MIN_LOUDNESS - 5,
                                                 MAX_LOUDNESS - 5)
            else:
                target_loudness = random.uniform(MIN_LOUDNESS, MAX_LOUDNESS)
            signal = pyloudnorm.normalize.loudness(signal, c_loudness,
                                                   target_loudness)

            # check for clipping
            if np.max(np.abs(signal)) >= 1:
                signal = signal * MAX_AMP / np.max(np.abs(signal))
        return torch.from_numpy(signal)

    for i, spk_file in enumerate(spk_files):
        # select random offset
        length = torchaudio.info(spk_file).num_frames
        start = 0
        stop = length
        if length > minlen:  # take a random window
            start = np.random.randint(0, length - minlen)
            stop = start + minlen

        tmp, fs_read = torchaudio.load(
            spk_file,
            frame_offset=start,
            num_frames=stop - start,
        )

        tmp = tmp[0].numpy()
        tmp = normalize(tmp)
        sources.append(tmp)

    sources = torch.stack(sources)
    mixture = torch.sum(sources, 0)

    if hparams["use_wham_noise"]:
        len_noise = len(noise)
        len_mix = len(mixture)
        min_len = min(len_noise, len_mix)
        noise = normalize(noise.numpy(), is_noise=True)
        mixture = mixture[:min_len] + noise[:min_len]

    # check for clipping
    max_amp_insig = mixture.abs().max().item()
    if max_amp_insig > MAX_AMP:
        weight = MAX_AMP / max_amp_insig
    else:
        weight = 1

    sources = weight * sources
    mixture = weight * mixture

    yield mixture

    for i in range(hparams["num_spks"]):
        yield sources[i]

    # If the number of speakers is 2, yield None for the 3rd speaker
    if hparams["num_spks"] == 2:
        yield None

    if hparams["use_wham_noise"]:
        noise = noise * weight
        yield noise
    else:
        yield None
def __getitem__(self, index):
    """
    :param index: index of the segment to return
    :return: the speech segment, the corresponding left and right ids from the idmap, and the start and stop positions
    """
    # Read start and stop and convert to time in seconds
    if self.idmap.start[index] is None:
        start = 0
    else:
        start = int(self.idmap.start[index] * 0.01 * self.sample_rate)

    if self.idmap.stop[index] is None:
        # speech, speech_fs = get_sample(f"{self.data_path}/{self.idmap.rightids[index]}.{self.file_extension}", resample=self.sample_rate)
        nfo = torchaudio.info(
            f"{self.data_path}/{self.idmap.rightids[index]}.{self.file_extension}")
        speech, speech_fs = torchaudio.load(
            f"{self.data_path}/{self.idmap.rightids[index]}.{self.file_extension}")
        if nfo.sample_rate != self.sample_rate:
            speech = torchaudio.transforms.Resample(
                nfo.sample_rate, self.sample_rate).forward(speech)
        duration = int(speech.shape[1] - start)
        start = 0
        stop = speech.shape[1]
    else:
        # TODO Check if that code is still relevant with torchaudio.load() in case of sample_rate mismatch
        nfo = torchaudio.info(
            f"{self.data_path}/{self.idmap.rightids[index]}.{self.file_extension}")
        assert nfo.sample_rate == self.sample_rate
        conversion_rate = nfo.sample_rate // self.sample_rate
        start = start
        stop = (int(self.idmap.stop[index] * 0.01 * self.sample_rate) - start)
        # add this in case the segment is too short
        if stop - start <= self.min_duration * self.sample_rate:
            middle = start + (stop - start) // 2
            start = max(
                0, int(middle - (self.min_duration * self.sample_rate / 2)))
            duration = int(self.min_duration * self.sample_rate)

        speech, speech_fs = torchaudio.load(
            f"{self.data_path}/{self.idmap.rightids[index]}.{self.file_extension}",
            frame_offset=start * conversion_rate,
            num_frames=duration * conversion_rate)
        speech = torchaudio.transforms.Resample(
            nfo.sample_rate, self.sample_rate).forward(speech)

    speech += 10e-6 * torch.randn(speech.shape)

    if self.sliding_window:
        speech = speech.squeeze().unfold(0, self.window_len, self.window_shift)
        middle_points = numpy.arange(
            start + self.window_len / 2,
            start + duration - self.window_len / 2,
            self.window_shift)
        starts = middle_points - self.window_shift / 2
        stops = middle_points + self.window_shift / 2
        starts[0] = start
        start = starts
        stops[-1] = start + duration
    else:
        stop = start + duration

    if len(self.transformation.keys()) > 0:
        speech = data_augmentation(speech,
                                   speech_fs,
                                   self.transformation,
                                   self.transform_number,
                                   noise_df=self.noise_df,
                                   rir_df=self.rir_df)

    if self.backward:
        speech = torch.flip(speech, [0, 1]).squeeze()
    else:
        speech = speech.squeeze()

    return speech, self.idmap.leftids[index], self.idmap.rightids[index], start, stop
def get_dataset_fast_api_version(file_location):
    files = []
    siginfo, _ = torchaudio.info(file_location)
    length = siginfo.length // siginfo.channels
    files.append((file_location, length))
    return Audioset(files, with_path=True, sample_rate=sample_rate)
import torchaudio
import matplotlib.pyplot as plt
from pathlib import Path

# p = Path('Supercharger_Blockiergebuehr_Tesla_Fordert_Geld_V.mp3')
# print(p.exists())

filename = "../noise_cancellation/Supercharger_Blockiergebühr_Tesla_Fordert_Geld_V_25sec.mp3"
print(torchaudio.info(filename))

waveform, sample_rate = torchaudio.load(filename)

print("Shape of waveform: {}".format(waveform.size()))
print("Sample rate of waveform: {}".format(sample_rate))

plt.figure()
plt.plot(waveform.t().numpy())
plt.show()

specgram = torchaudio.transforms.Spectrogram()(waveform)

print("Shape of spectrogram: {}".format(specgram.size()))

plt.figure()
plt.imshow(specgram.log2()[0, :, :].numpy(), cmap='gray')
plt.show()

specgram = torchaudio.transforms.MelSpectrogram()(waveform)

print("Shape of spectrogram: {}".format(specgram.size()))
def __init__(self,
             root_dir,
             subset="train",
             length=16384,
             preload=False,
             half=True,
             use_soundfile=False):
    """
    Args:
        root_dir (str): Path to the root directory of the SignalTrain dataset.
        subset (str, optional): Pull data either from "train", "val", or "test" subsets. (Default: "train")
        length (int, optional): Number of samples in the returned examples. (Default: 16384)
        preload (bool, optional): Read in all data into RAM during init. (Default: False)
        half (bool, optional): Store the float32 audio as float16. (Default: True)
        use_soundfile (bool, optional): Use the soundfile library to load instead of torchaudio. (Default: False)
    """
    self.root_dir = root_dir
    self.subset = subset
    self.length = length
    self.preload = preload
    self.half = half
    self.use_soundfile = use_soundfile

    # get all the target files in the directory first
    self.target_files = glob.glob(
        os.path.join(self.root_dir, self.subset.capitalize(), "target_*.wav"))
    self.input_files = glob.glob(
        os.path.join(self.root_dir, self.subset.capitalize(), "input_*.wav"))

    self.examples = []
    self.hours = 0  # total number of hours of data in the subset

    # ensure that the sets are ordered correctly
    self.target_files.sort()
    self.input_files.sort()

    # get the parameters
    self.params = [(float(f.split("__")[1].replace(".wav", "")),
                    float(f.split("__")[2].replace(".wav", "")))
                   for f in self.target_files]

    # loop over files to count total length
    for idx, (tfile, ifile, params) in enumerate(
            zip(self.target_files, self.input_files, self.params)):

        ifile_id = int(os.path.basename(ifile).split("_")[1])
        tfile_id = int(os.path.basename(tfile).split("_")[1])
        if ifile_id != tfile_id:
            raise RuntimeError(
                f"Found non-matching file ids: {ifile_id} != {tfile_id}! Check dataset."
            )

        md = torchaudio.info(tfile)
        self.hours += (md.num_frames / md.sample_rate) / 3600
        num_frames = md.num_frames

        if self.preload:
            sys.stdout.write(
                f"* Pre-loading... {idx+1:3d}/{len(self.target_files):3d} ...\r")
            sys.stdout.flush()
            input, sr = self.load(ifile)
            target, sr = self.load(tfile)

            num_frames = int(np.min([input.shape[-1], target.shape[-1]]))
            if input.shape[-1] != target.shape[-1]:
                print(os.path.basename(ifile), input.shape[-1],
                      os.path.basename(tfile), target.shape[-1])
                raise RuntimeError("Found potentially corrupt file!")
            if self.half:
                input = input.half()
                target = target.half()
        else:
            input = None
            target = None

        # create one entry for each patch
        for n in range((num_frames // self.length) - 1):
            offset = int(n * self.length)
            end = offset + self.length
            self.examples.append({
                "idx": idx,
                "target_file": tfile,
                "input_file": ifile,
                "input_audio": input[:, offset:end] if input is not None else None,
                "target_audio": target[:, offset:end] if input is not None else None,
                "params": params,
                "offset": offset,
                "frames": num_frames
            })

    # we then want to get the input files
    print(
        f"Located {len(self.examples)} examples totaling {self.hours:0.1f} hr in the {self.subset} subset."
    )
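A hedged usage sketch for the constructor above; `SignalTrainDataset` is a placeholder name for the class this `__init__` belongs to, and the root path is hypothetical. It assumes the Train/Val/Test folder layout implied by the docstring.

# Sketch only: class name and path below are placeholders.
dataset = SignalTrainDataset(root_dir="/data/SignalTrain",
                             subset="val",
                             length=16384,
                             preload=False,
                             half=True)
print(len(dataset.examples), f"{dataset.hours:0.1f} hours")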