def create_database(database_path, transcription_realigned_path, chime6):
    """Collect all sub-datasets into one database dictionary.

    Args:
        database_path: Forwarded unchanged to ``get_dataset``.
        transcription_realigned_path: Directory that is searched recursively
            for ``*.json`` files. The files are indexed by their file name.
        chime6: If truthy, the dataset names are taken from
            ``set_length_chime6``, otherwise from ``set_length_chime5``.
            The flag is also forwarded to ``get_dataset``.

    Returns:
        A dict with two entries: ``keys.DATASETS`` holds every item returned
        by ``get_dataset`` (merged over all datasets) and ``'alias'`` maps
        each dataset name to the list of keys that ``get_dataset`` returned
        for it.
    """
    # Side effect: configures the root logger (no-op if already configured).
    logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)

    realigned_json_by_name = Dispatcher({
        json_file.name: json_file
        for json_file in Path(transcription_realigned_path).glob('**/*.json')
    })
    # Starts empty and is handed to every get_dataset call.
    kaldi_transcriptions = {}

    subset_lengths = set_length_chime6 if chime6 else set_length_chime5

    sessions = {}
    subset_to_session_ids = {}
    for subset in subset_lengths:
        subset_sessions = get_dataset(
            database_path,
            subset,
            realigned_json_by_name,
            kaldi_transcriptions,
            chime6,
        )
        sessions.update(subset_sessions)
        subset_to_session_ids[subset] = list(subset_sessions)

    return {keys.DATASETS: sessions, 'alias': subset_to_session_ids}
def write_keyed_text_file(text_file: Path, data_dict):
    """Write a mapping as a sorted, keyed text file.

    Often used to write e.g. Kaldi `text`, `wav.scp` or `spk2utt`.
    Sorting is enforced here to avoid subsequent calls to fix_data_dir.sh.

    For some file names a sanity check is performed to match the Kaldi
    file standards:
     - ``utt2dur``: every value must be convertible to a float and lie
       strictly between 0 and 1000 (seconds).
     - ``spk2gender``: values ``male``/``m`` and ``female``/``f`` are
       normalized to ``m`` and ``f``; any other value fails the lookup.

    Args:
        text_file: Path of the file to write. Each line has the format
            ``<key> <value>``. The lines are joined with a newline, the file
            does not end with a trailing newline.
        data_dict: Mapping from key (e.g. utterance id) to the value.
            A list value is joined with single spaces.

    Raises:
        ValueError: If a ``utt2dur`` value does not represent a number.
        AssertionError: If a ``utt2dur`` duration is outside of (0, 1000).
    """
    text_file = Path(text_file)
    file_name = text_file.name

    # Build the gender lookup once (only when needed) instead of once per
    # entry.
    gender_map = None
    if file_name == 'spk2gender':
        gender_map = Dispatcher(
            male='m',
            female='f',
            m='m',
            f='f',
        )

    data = []
    for k, text in sorted(data_dict.items()):
        if isinstance(text, list):
            text = ' '.join(text)

        if file_name == 'utt2dur':
            try:
                text_number = float(text)
            except (TypeError, ValueError, OverflowError) as err:
                # Chain explicitly so the original conversion error stays
                # visible as the cause.
                raise ValueError(
                    f'The text "{text}" for {k} that should be written to '
                    f'{text_file} does not represent a number.') from err
            # Kept as an assert so callers see the same exception type.
            assert 0. < text_number < 1000., \
                f'Strange duration: {k}: {text_number} s'
        elif gender_map is not None:
            text = gender_map[text]

        data.append(f'{k} {text}')

    text_file.write_text('\n'.join(data))
from collections import defaultdict

from pb_chime5.mapping import Dispatcher

# NOTE(review): presumably maps a development session id ('S02', 'S09') to
# the array ids ('U..') used as reference arrays for it -- confirm with the
# users of this mapping.
dev_sess_ref_array_mapping = Dispatcher({
    'S02': ['U02', 'U03', 'U05'],
    'S09': ['U01', 'U04', 'U06']
})

# TODO: check if still relevant
# error_id_mapping = Dispatcher({
#     'nan_in_gcc_phat': [
#         'P27_S09_0217746-0218448',
#         'P25_S09_0218350-0218590',
#         'P27_S09_0218528-0218656',
#         'P25_S09_0218645-0218785',
#         'P28_S09_0228518-0228606',
#         'P25_S09_0228530-0228730',
#         'P26_S09_0228590-0228810',
#         'P28_S09_0228724-0228822',
#         'P25_S09_0228730-0229010',
#         'P27_S09_0228832-0229004',
#         'P26_S09_0236312-0236452',
#         'P26_S09_0236558-0236688',
#         'P26_S09_0274100-0274266',
#         'P25_S09_0274135-0274260',
#         'P26_S09_0286486-0286608',
#         'P25_S09_0286555-0286720',
#         'P25_S09_0439024-0439440',
#         'P28_S09_0439248-0439372',
#         'P25_S09_0468290-0468476',
def dump_audio(
        obj,
        path,
        *,
        sample_rate=16000,
        dtype=np.int16,
        start=None,
        normalize=True,
        format=None,
):
    """Write an audio signal to a file with `soundfile`.

    If normalize is False and the dtype is float, the values of obj should be
    in the range [-1, 1).

    Params:
        obj: Shape (channels, samples) or (samples,)
        path: Destination file. It is converted with `normalize_path`.
        sample_rate: Sample rate that is used when a new file is created.
        dtype: Storage type. `np.int16` uses the default subtype of
            soundfile, `np.int32`, `np.float32` and `np.float64` select the
            matching subtype and `None` derives the subtype from `obj.dtype`.
        start: None or sample offset. When it is given and the file already
            exists, the file is opened for updating ('r+') and obj is
            written beginning at this offset. Otherwise a new file is
            written (and the offset is applied to the new file).
        normalize: Scale obj so that its absolute maximum is slightly below
            one. Only allowed together with `dtype == np.int16`. An all-zero
            signal is written unscaled.
        format: Forwarded to `soundfile.SoundFile`.

    Raises:
        TypeError: If normalize is requested for an obj that is neither
            float nor int, or when dtype is not supported.
        AssertionError: If normalize is requested and dtype is not np.int16.

    >>> from pb_chime5.utils.process_caller import run_process
    >>> from pb_chime5.io import load_audio
    >>> a = np.array([1, 2, -4, 4], dtype=np.int16)
    >>> import io, os
    >>> # file = io.BytesIO()
    >>> file = Path('tmp_audio_data.wav')
    >>> dump_audio(a, file, normalize=False)
    >>> load_audio(file) * 2**15
    array([ 1.,  2., -4.,  4.])
    >>> print(run_process(f'file {file}').stdout)
    tmp_audio_data.wav: RIFF (little-endian) data, WAVE audio, Microsoft PCM, 16 bit, mono 16000 Hz
    <BLANKLINE>
    >>> dump_audio(a, file, normalize=True)
    >>> load_audio(file)
    array([ 0.24996948,  0.49996948, -0.99996948,  0.99996948])
    >>> print(run_process(f'file {file}').stdout)
    tmp_audio_data.wav: RIFF (little-endian) data, WAVE audio, Microsoft PCM, 16 bit, mono 16000 Hz
    <BLANKLINE>

    >>> data = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) / 32
    >>> data
    array([0.     , 0.03125, 0.0625 , 0.09375, 0.125  , 0.15625, 0.1875 ,
           0.21875, 0.25   , 0.28125])
    >>> dump_audio(data, file, normalize=False)
    >>> load_audio(file)
    array([0.     , 0.03125, 0.0625 , 0.09375, 0.125  , 0.15625, 0.1875 ,
           0.21875, 0.25   , 0.28125])
    >>> print(run_process(f'file {file}').stdout)
    tmp_audio_data.wav: RIFF (little-endian) data, WAVE audio, Microsoft PCM, 16 bit, mono 16000 Hz
    <BLANKLINE>
    >>> dump_audio(np.array([16, 24]) / 32, file, normalize=False, start=1)
    >>> load_audio(file)
    array([0.     , 0.5    , 0.75   , 0.09375, 0.125  , 0.15625, 0.1875 ,
           0.21875, 0.25   , 0.28125])
    >>> print(run_process(f'file {file}').stdout)
    tmp_audio_data.wav: RIFF (little-endian) data, WAVE audio, Microsoft PCM, 16 bit, mono 16000 Hz
    <BLANKLINE>
    >>> dump_audio(np.array([16, 24, 24, 24]) / 32, file, normalize=False, start=9)
    >>> load_audio(file)
    array([0.     , 0.5    , 0.75   , 0.09375, 0.125  , 0.15625, 0.1875 ,
           0.21875, 0.25   , 0.5    , 0.75   , 0.75   , 0.75   ])
    >>> load_audio(file).shape
    (13,)
    >>> dump_audio(np.array([16, 24, 24, 24]) / 32, file, normalize=False, start=20)
    >>> load_audio(file)
    array([0.     , 0.5    , 0.75   , 0.09375, 0.125  , 0.15625, 0.1875 ,
           0.21875, 0.25   , 0.5    , 0.75   , 0.75   , 0.75   , 0.     ,
           0.     , 0.     , 0.     , 0.     , 0.     , 0.     , 0.5    ,
           0.75   , 0.75   , 0.75   ])
    >>> load_audio(file).shape
    (24,)
    >>> print(run_process(f'file {file}').stdout)
    tmp_audio_data.wav: RIFF (little-endian) data, WAVE audio, Microsoft PCM, 16 bit, mono 16000 Hz
    <BLANKLINE>
    >>> os.remove('tmp_audio_data.wav')
    >>> dump_audio(np.array([16, 24, 24, 24]) / 32, file, normalize=False, start=20)
    >>> load_audio(file)
    array([0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
           0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.5 , 0.75,
           0.75, 0.75])
    >>> load_audio(file).shape
    (24,)
    >>> print(run_process(f'file {file}').stdout)
    tmp_audio_data.wav: RIFF (little-endian) data, WAVE audio, Microsoft PCM, 16 bit, mono 16000 Hz
    <BLANKLINE>

    >>> data = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) / 32
    >>> data
    array([0.     , 0.03125, 0.0625 , 0.09375, 0.125  , 0.15625, 0.1875 ,
           0.21875, 0.25   , 0.28125])
    >>> dump_audio(data, file, normalize=False, dtype=None)
    >>> load_audio(file)
    array([0.     , 0.03125, 0.0625 , 0.09375, 0.125  , 0.15625, 0.1875 ,
           0.21875, 0.25   , 0.28125])
    >>> print(run_process(f'soxi {file}').stdout)
    <BLANKLINE>
    Input File     : 'tmp_audio_data.wav'
    Channels       : 1
    Sample Rate    : 16000
    Precision      : 53-bit
    Duration       : 00:00:00.00 = 10 samples ~ 0.046875 CDDA sectors
    File Size      : 160
    Bit Rate       : 2.05M
    Sample Encoding: 64-bit Floating Point PCM
    <BLANKLINE>
    <BLANKLINE>
    >>> dump_audio(data.astype(np.float32), file, normalize=False, dtype=None)
    >>> load_audio(file, dtype=None)
    array([0.     , 0.03125, 0.0625 , 0.09375, 0.125  , 0.15625, 0.1875 ,
           0.21875, 0.25   , 0.28125], dtype=float32)
    >>> print(run_process(f'soxi {file}').stdout)
    <BLANKLINE>
    Input File     : 'tmp_audio_data.wav'
    Channels       : 1
    Sample Rate    : 16000
    Precision      : 24-bit
    Duration       : 00:00:00.00 = 10 samples ~ 0.046875 CDDA sectors
    File Size      : 120
    Bit Rate       : 1.54M
    Sample Encoding: 32-bit Floating Point PCM
    <BLANKLINE>
    <BLANKLINE>

    """
    path = normalize_path(path, as_str=True)
    obj = np.asarray(obj)

    if normalize:
        if obj.dtype.kind not in ['f', 'i']:
            raise TypeError(
                'Only float and int is currently supported with normalize. '
                f'Got dtype {obj.dtype}')
        # Normalization can change the type (e.g. int to float).
        # When saving as float, normalize is a bad idea.
        # The normalization is adjusted for int16
        assert dtype == np.int16, (
            'Currently normalize is only allowed for dtype == np.int16 '
            f'and not for dtype == {dtype}')
        # Correction, because the allowed values are in the range [-1, 1).
        # => "1" is not a valid value
        correction = (2**15 - 1) / (2**15)
        peak = np.amax(np.abs(obj))
        # An all-zero signal has no peak to scale to. Dividing by zero would
        # replace every sample with NaN, hence keep the signal unscaled
        # (the multiplication still converts int input to float).
        scale = correction / peak if peak > 0 else 1.0
        obj = obj * scale

    # ToDo: better exception when path is file descriptor
    if start is None or not Path(path).exists():
        if obj.ndim == 1:
            channels = 1
        else:
            channels = obj.shape[0]

        sf_args = dict(
            mode='w',
            channels=channels,
            samplerate=sample_rate,
        )
    else:
        # Update an existing file in place.
        sf_args = dict(mode='r+')
    sf_args['format'] = format

    # Both the scalar types and the dtype instances are listed, because they
    # are different dictionary keys.
    dtype_map = Dispatcher({
        np.int16: 'PCM_16',
        np.dtype('int16'): 'PCM_16',
        np.int32: 'PCM_32',
        np.dtype('int32'): 'PCM_32',
        np.float32: 'FLOAT',
        np.dtype('float32'): 'FLOAT',
        np.float64: 'DOUBLE',
        np.dtype('float64'): 'DOUBLE',
    })

    if dtype in [np.int16]:
        # No explicit subtype: rely on the default subtype of soundfile.
        pass
    elif dtype in [np.float32, np.float64, np.int32]:
        sf_args['subtype'] = dtype_map[dtype]
    elif dtype is None:
        sf_args['subtype'] = dtype_map[obj.dtype]
    else:
        raise TypeError(dtype)

    # soundfile.write()
    with soundfile.SoundFile(path, **sf_args) as f:
        if start is not None:
            f.seek(start)
        # soundfile expects the shape (samples, channels).
        f.write(obj.T)

    return
def get_activity(
        iterator,
        *,
        perspective,
        garbage_class,
        dtype=bool,
        non_sil_alignment_fn=None,
        debug=False,
        use_ArrayIntervall=False,
):
    """Build per session, per microphone perspective and per speaker
    activity arrays from the utterance annotations in `iterator`.

    iterator:
        Collection of examples that supports
        `iterator.groupby(lambda ex: ex['session_id'])`. Each group must
        support iteration and `len`.
    perspective:
        Example:
            'global_worn' -- global perspective for worn ('P')
            'worn' -- return perspective for each speaker ('P01', ...)
            'array' -- return perspective for each array ('U01', ...)
        Any other value is used directly as perspective (a single value is
        wrapped in a list).
    garbage_class: True, False, None or a positive int
        True: garbage_class is always one
        False: garbage_class is always zero
        None: the number of classes is 4 and not 5
        int > 0: that many always-one classes 'Noise0', 'Noise1', ...
        Everything else raises a ValueError.
    dtype:
        dtype of the activity arrays. Must be bool when use_ArrayIntervall
        is used.
    non_sil_alignment_fn: None or a function with the signature:
        value = non_sil_alignment_fn(ex, perspective_mic_array)
        where
            ex is one example in iterator
            perspective_mic_array is in ['U01', ..., 'P01', ..., 'P']
            value is a 1d array indicating if at a sample the source is
                active or not
        When the function returns the plain integer 1, the annotation is
        counted as missing and the complete utterance is marked as active.
    debug:
        Accumulate the values (`+=`) instead of overwriting them, so that
        overlapping annotations of one speaker become visible.
    use_ArrayIntervall:
        ArrayIntervall is a special datatype to reduce memory usage

    returns:
        dict[session_id][mic_perspective][speaker_id] = array(dtype=bool)
        session_id e.g.: 'S02', ...
        mic_perspective e.g.: 'P', 'P05', 'U01', ...
        speaker_id e.g.: 'P05', ...

    raises:
        ValueError: unsupported garbage_class.
        RuntimeError: non_sil_alignment_fn reported more missing annotations
            than half of the number of examples of a session.

    >>> from pb_chime5.database.chime5 import Chime5
    >>> import textwrap
    >>> db = Chime5()
    >>> def display_activity(activity):
    ...     print(tuple(activity.keys()))
    ...     print(' '*2, tuple(activity['S02'].keys()))
    ...     print(' '*4, tuple(activity['S02']['P'].keys()))
    ...     print(' '*6, activity['S02']['P']['P05'])
    ...     print(' '*6, activity['S02']['P']['Noise'])
    >>> def display_activity(activity, indent=0):
    ...     indent_print = lambda x: print(textwrap.indent(str(x), ' '*indent))
    ...     if isinstance(activity, dict):
    ...         for i, (k, v) in enumerate(activity.items()):
    ...             if i == 0 or k in ['Noise']:
    ...                 indent_print(f'{k}:')
    ...                 display_activity(v, indent=indent+2)
    ...             else:
    ...                 indent_print(f'{k}: ...')
    ...     else:
    ...         indent_print(activity)
    >>> activity = get_activity(db.get_datasets('S02'), perspective='global_worn', garbage_class=True)
    >>> display_activity(activity)
    S02:
      P:
        P05:
          [False False False ... False False False]
        P06: ...
        P07: ...
        P08: ...
        Noise:
          [ True  True  True ...  True  True  True]
    >>> activity = get_activity(db.get_datasets('S02'), perspective='worn', garbage_class=False)
    >>> display_activity(activity)
    S02:
      P05:
        P05:
          [False False False ... False False False]
        P06: ...
        P07: ...
        P08: ...
        Noise:
          [False False False ... False False False]
      P06: ...
      P07: ...
      P08: ...
    >>> activity = get_activity(db.get_datasets('S02'), perspective='array', garbage_class=None)
    >>> display_activity(activity)
    S02:
      U01:
        P05:
          [False False False ... False False False]
        P06: ...
        P07: ...
        P08: ...
      U02: ...
      U03: ...
      U04: ...
      U05: ...
      U06: ...

    """
    import functools

    dict_it_S = iterator.groupby(lambda ex: ex['session_id'])

    # Dispatcher is a dict with better KeyErrors
    all_acitivity = Dispatcher()
    for session_id, it_S in dict_it_S.items():

        if perspective == 'worn':
            perspective_tmp = mapping.session_to_speakers[session_id]
        elif perspective == 'global_worn':
            perspective_tmp = ['P']  # Always from target speaker
        elif perspective == 'array':
            # The mapping considers missing arrays
            perspective_tmp = mapping.session_to_arrays[session_id]
        else:
            perspective_tmp = perspective

            if not isinstance(perspective_tmp, (tuple, list)):
                perspective_tmp = [perspective_tmp, ]

        speaker_ids = mapping.session_to_speakers[session_id]

        if use_ArrayIntervall:
            # Compare as numpy dtypes, so that bool, np.bool_ and 'bool' are
            # all accepted (`np.bool` does not exist in every numpy version).
            assert np.dtype(dtype) == np.dtype(bool), dtype
            zeros = ArrayIntervall

            def ones(shape):
                arr = zeros(shape=shape)
                arr[:] = 1
                return arr
        else:
            zeros = functools.partial(np.zeros, dtype=dtype)
            ones = functools.partial(np.ones, dtype=dtype)

        # Number of samples of each perspective in this session.
        num_samples_of = {
            p: mapping.session_array_to_num_samples[f'{session_id}_{p}']
            for p in perspective_tmp
        }

        all_acitivity[session_id] = Dispatcher({
            p: Dispatcher({
                s: zeros(shape=[num_samples_of[p]])
                # s: ArrayIntervall(shape=[num_samples])
                for s in speaker_ids
            })
            for p in perspective_tmp
        })

        if garbage_class is True:
            for p in perspective_tmp:
                all_acitivity[session_id][p]['Noise'] = ones(
                    shape=[num_samples_of[p]],
                )
        elif garbage_class is False:
            for p in perspective_tmp:
                all_acitivity[session_id][p]['Noise'] = zeros(
                    shape=[num_samples_of[p]])
        elif garbage_class is None:
            pass
        elif isinstance(garbage_class, int) and garbage_class > 0:
            for noise_idx in range(garbage_class):
                for p in perspective_tmp:
                    all_acitivity[session_id][p][f'Noise{noise_idx}'] = ones(
                        shape=[num_samples_of[p]])
        else:
            raise ValueError(garbage_class)

        missing_count = 0
        for ex in it_S:
            for pers in perspective_tmp:
                if ex['transcription'] == '[redacted]':
                    continue

                target_speaker = ex['speaker_id']
                # example_id = ex['example_id']

                if pers == 'P':
                    # Global worn perspective: use the microphone of the
                    # speaker itself.
                    perspective_mic_array = target_speaker
                else:
                    perspective_mic_array = pers

                if perspective_mic_array.startswith('P'):
                    start = ex['start']['worn'][perspective_mic_array]
                    end = ex['end']['worn'][perspective_mic_array]
                else:
                    if perspective_mic_array not in ex['audio_path'][
                            'observation']:
                        # This array has no recording for this example.
                        continue
                    start = ex['start']['observation'][perspective_mic_array]
                    end = ex['end']['observation'][perspective_mic_array]

                if non_sil_alignment_fn is None:
                    value = 1
                else:
                    value = non_sil_alignment_fn(ex, perspective_mic_array)
                    # A plain int 1 signals a missing finetuned annotation.
                    # Check the type first, because `value == 1` is
                    # elementwise for the usual array return value.
                    if type(value) is int and value == 1:
                        missing_count += 1

                if debug:
                    all_acitivity[session_id][pers][target_speaker][
                        start:end] += value
                else:
                    all_acitivity[session_id][pers][target_speaker][
                        start:end] = value

        # NOTE(review): missing_count is incremented once per
        # (example, perspective) pair, while the threshold only uses the
        # number of examples -- confirm that this is intended.
        if missing_count > len(it_S) // 2:
            raise RuntimeError(
                f'Something went wrong.\n'
                f'Expected {len(it_S) * len(perspective_tmp)} times a '
                f'finetuned annotation for session {session_id}, but '
                f'{missing_count} times they are missing.\n'
                f'Expect that at least {len(it_S) // 2} finetuned annotations '
                f'are available, when non_sil_alignment_fn is given.\n'
                f'Otherwise assume something went wrong.')

    return all_acitivity
def get_phone_alignment(
        ali_path,
        use_kaldi_id=False,
        unique_per_utt=True,
        channel_preference=None,
):
    """Load the frame-wise phone alignments of one or more Kaldi alignment
    directories.

    ali_path:
        Path of a Kaldi alignment directory. It has to contain `phones.txt`
        and `final.mdl`. A tuple or list of directories is also supported:
        The alignments of all directories are merged and an assert ensures
        that no id occurs in more than one directory.
    use_kaldi_id:
        Use a unique id per utterance or the kaldi id (i.e. array dependent)
    unique_per_utt:
        Return one per utterance. When multiple kaldi ids are available
        use channel_preference.
    channel_preference:
        None or list of channels.
        Example channel_preference = ['R', 'L']
            - assert any alignment has a left channel and any alignment has
              a right channel. (Note any not all)
            - If an example has a left and right channel, select the right.

    returns:
        Mapping from id to an array with one phone name per frame
        (a Dispatcher for a single directory, a dict for multiple
        directories).

    >>> # np.set_string_function(lambda a: f'array(shape={a.shape}, dtype={a.dtype})')
    >>> np.set_printoptions(threshold=50, edgeitems=30)
    >>> from IPython.lib.pretty import pprint
    >>> p = Path('/net/vol/jenkins/kaldi/2018-03-21_08-33-34_eba50e4420cfc536b68ca7144fac3cd29033adbb/egs/chime5/s5/exp/tri3_all_dev_worn_ali')
    >>> # p = ('~/net/vol/jenkins/kaldi/2018-03-21_08-33-34_eba50e4420cfc536b68ca7144fac3cd29033adbb/egs/chime5/s5/exp/tri3_all_dev_worn_ali', '~/net/vol/jenkins/kaldi/2018-03-21_08-33-34_eba50e4420cfc536b68ca7144fac3cd29033adbb/egs/chime5/s5/exp/tri3_all_dev_worn_ali')
    >>> alignment = get_phone_alignment(p)
    >>> pprint(alignment['P06_S02_0060700-0061058'])  # doctest: +ELLIPSIS
    array(['sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil',
           'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil',
           'd_B', 'd_B', 'd_B', 'd_B', 'd_B', 'd_B', 'ih_I', 'ih_I', 'ih_I',
           'z_E', 'z_E', 'z_E', ..., 'ay_I', 'ay_I', 'ay_I', 't_E', 't_E',
           't_E', 't_E', 't_E', 't_E', 't_E', 't_E', 't_E', 't_E', 't_E',
           'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil',
           'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil'], dtype='<U4')
    >>> pprint(alignment['P25_S09_0121800-0122035'])  # doctest: +ELLIPSIS
    array(['sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil',
           'sil', 'ay_B', 'ay_B', 'ay_B', 'ay_B', 'ay_B', 'm_E', 'm_E',
           'm_E', 'g_B', 'g_B', 'g_B', 'aa_I', 'aa_I', 'aa_I', 'aa_I', 'n_I',
           'n_I', 'n_I', 'ah_E', 'ah_E', ..., 'n_E', 'n_E', 'n_E', 'n_E',
           'n_E', 'n_E', 'n_E', 'n_E', 'n_E', 'n_E', 'n_E', 'n_E', 'sil',
           'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil',
           'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil'],
          dtype='<U4')
    >>> non_sil_alignment = {k: v != 'sil' for k, v in alignment.items()}
    >>> pprint(dict(list(non_sil_alignment.items())[:3]))  # doctest: +ELLIPSIS
    {'P05_S02_0004060-0004382': array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
             True,  True,  True,  True,  True,  True,  True,  True,  True,
             True,  True,  True,  True,  True,  True,  True,  True,  True,
             True, False, False, ...,  True,  True,  True,  True,  True,
             True,  True,  True,  True,  True,  True, False, False, False,
            False, False, False, False, False, False, False, False, False,
            False, False, False, False, False, False, False]),
     'P05_S02_0007011-0007297': array([False, False, False, False, False,  True,  True,  True,  True,
             True,  True,  True,  True,  True,  True,  True,  True,  True,
             True,  True,  True,  True,  True,  True,  True,  True,  True,
             True,  True,  True, ..., False, False, False, False, False,
            False, False, False, False, False, False, False, False, False,
            False, False, False, False, False, False, False, False, False,
            False, False, False, False, False, False, False]),
     'P05_S02_0007437-0007908': array([False, False, False, False, False, False, False, False, False,
            False, False, False, False, False, False, False, False, False,
            False, False, False, False, False, False, False, False, False,
             True,  True,  True, ...,  True,  True,  True,  True,  True,
             True,  True,  True,  True,  True,  True,  True,  True,  True,
             True,  True, False, False, False, False, False, False, False,
            False, False, False, False, False, False, False])}

    # >>> p = '/net/vol/jenkins/kaldi/2018-03-21_08-33-34_eba50e4420cfc536b68ca7144fac3cd29033adbb/egs/chime5/s5/exp/tri3_cleaned_ali_train_worn_u100k_cleaned_sp'
    # >>> alignment = get_phone_alignment(p)
    # >>> pprint(dict(list(non_sil_alignment.items())[:3]))  # doctest: +ELLIPSIS
    # >>> print(len(alignment))

    >>> ali_path = (
    ...     '/net/vol/jensheit/kaldi/egs/chime5/inear_bss_cacgmm_v3/finetune_0/kaldi/exp/tri3_worn_bss_stereo_train_worn_bss_stereo_ali/',
    ...     '/net/vol/jensheit/kaldi/egs/chime5/inear_bss_cacgmm_v3/finetune_0/kaldi/exp/tri3_worn_bss_stereo_dev_worn_bss_stereo_ali/',
    ... )  # slow because of train
    >>> alignment = get_phone_alignment(ali_path, channel_preference=['R', 'L'])
    >>> pprint(alignment['P06_S02_0060700-0061058'])  # doctest: +ELLIPSIS
    array(['sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil',
           'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil',
           'sil', 'sil', 'sil', 'd_B', 'd_B', 'd_B', 'd_B', 'ih_I', 'ih_I',
           'ih_I', 'z_E', 'z_E', ..., 't_E', 't_E', 't_E', 't_E', 't_E',
           't_E', 't_E', 't_E', 't_E', 't_E', 't_E', 'sil', 'sil', 'sil',
           'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil',
           'sil', 'sil', 'sil', 'sil', 'sil', 'sil', 'sil'], dtype='<U4')

    """
    import pb_chime5.kaldi

    if isinstance(ali_path, (tuple, list)):
        # Load every directory with the same options and merge the results.
        alignments_list = [
            get_phone_alignment(
                ap,
                use_kaldi_id=use_kaldi_id,
                unique_per_utt=unique_per_utt,
                channel_preference=channel_preference,
            )
            for ap in ali_path
        ]

        total_len = sum(len(a) for a in alignments_list)
        alignments = {
            k: v
            for a in alignments_list
            for k, v in a.items()
        }
        # An id that occurs in more than one directory would silently be
        # overwritten by the merge.
        assert len(alignments) == total_len
        return alignments

    ali_path = Path(ali_path).expanduser().resolve()

    # Each line of phones.txt has the form "<phone> <id>".
    phone_lines = (ali_path / 'phones.txt').read_text().splitlines()
    id2phone = {}
    for line in phone_lines:
        phone, phone_id = line.split()
        id2phone[int(phone_id)] = phone
    # Every id may only be used once.
    assert len(id2phone) == len(phone_lines)

    _alignments = pb_chime5.kaldi.alignment.import_alignment_data(
        ali_path,
        import_fn=pb_chime5.kaldi.alignment.import_phone_alignment_from_file,
        per_frame=True,
        model_name=ali_path / 'final.mdl'
    )

    alignments = _helper(
        _alignments,
        channel_preference=channel_preference,
        # id2phone=id2phone,
        unique_per_utt=unique_per_utt,
        use_kaldi_id=use_kaldi_id,
    )

    # Replace the phone ids with the phone names.
    return Dispatcher(cy_alignment_id2phone(alignments, id2phone))
def load_audio(
        path,
        *,
        frames=-1,
        start=0,
        stop=None,
        dtype=np.float64,
        fill_value=None,
        expected_sample_rate=None,
        unit='samples',
        return_sample_rate=False,
):
    """Load an audio file with `soundfile` and return the signal.

    WIP will deprecate audioread in the future

    Difference to soundfile.read:
     - Default: Return only signal
     - With the argument "unit" the unit of frames, start and stop can be
       changed (a negative stop is currently unsupported for
       unit='seconds').
     - With given expected_sample_rate a check of the sample rate is
       included (recommended)
     - A multichannel signal is returned with the shape
       (channels, samples).
     - dtype=None selects the dtype that matches the subtype of the file
       (PCM_16, PCM_32, FLOAT and DOUBLE are supported).

    Additional arguments:
        expected_sample_rate: None or the sample rate the file must have.
            A ValueError is raised on a mismatch.
        unit: 'samples' or 'seconds', everything else raises a ValueError.
        return_sample_rate: If True, return (signal, sample_rate) instead
            of signal.

    Raises:
        ValueError: Unknown unit or unexpected sample rate.
        NotImplementedError: Negative stop with unit='seconds'.
        RuntimeError: The file could not be read. For a '.wav' file the
            message is the output of the `file` command, otherwise it names
            the wrong suffix.

    soundfile.read doc text and some examples:

    Provide audio data from a sound file as NumPy array.

    By default, the whole file is read from the beginning, but the
    position to start reading can be specified with `start` and the
    number of frames to read can be specified with `frames`.
    Alternatively, a range can be specified with `start` and `stop`.

    If there is less data left in the file than requested, the rest of
    the frames are filled with `fill_value`.
    If no `fill_value` is specified, a smaller array is returned.

    Parameters
    ----------
    file : str or int or file-like object
        The file to read from.  See :class:`SoundFile` for details.
    frames : int, optional
        The number of frames to read. If `frames` is negative, the whole
        rest of the file is read.  Not allowed if `stop` is given.
    start : int, optional
        Where to start reading.  A negative value counts from the end.
    stop : int, optional
        The index after the last frame to be read.  A negative value
        counts from the end.  Not allowed if `frames` is given.
    dtype : {'float64', 'float32', 'int32', 'int16'}, optional
        Data type of the returned array, by default ``'float64'``.
        Floating point audio data is typically in the range from
        ``-1.0`` to ``1.0``.  Integer data is in the range from
        ``-2**15`` to ``2**15-1`` for ``'int16'`` and from ``-2**31`` to
        ``2**31-1`` for ``'int32'``.

        .. note:: Reading int values from a float file will *not*
            scale the data to [-1.0, 1.0). If the file contains
            ``np.array([42.6], dtype='float32')``, you will read
            ``np.array([43], dtype='int32')`` for ``dtype='int32'``.

    Returns
    -------
    audiodata : numpy.ndarray or type(out)
        A two-dimensional (frames x channels) NumPy array is returned.
        If the sound file has only one channel, a one-dimensional array
        is returned.  Use ``always_2d=True`` to return a two-dimensional
        array anyway.

        If `out` was specified, it is returned.  If `out` has more
        frames than available in the file (or if `frames` is smaller
        than the length of `out`) and no `fill_value` is given, then
        only a part of `out` is overwritten and a view containing all
        valid frames is returned.

    Other Parameters
    ----------------
    always_2d : bool, optional
        By default, reading a mono sound file will return a
        one-dimensional array.  With ``always_2d=True``, audio data is
        always returned as a two-dimensional array, even if the audio
        file has only one channel.
    fill_value : float, optional
        If more frames are requested than available in the file, the
        rest of the output is be filled with `fill_value`.  If
        `fill_value` is not specified, a smaller array is returned.
    out : numpy.ndarray or subclass, optional
        If `out` is specified, the data is written into the given array
        instead of creating a new array.  In this case, the arguments
        `dtype` and `always_2d` are silently ignored!  If `frames` is
        not given, it is obtained from the length of `out`.
    samplerate, channels, format, subtype, endian, closefd
        See :class:`SoundFile`.

    Examples
    --------
    >>> from pb_chime5.io import load_audio
    >>> path = '/net/db/timit/pcm/train/dr1/fcjf0/sa1.wav'
    >>> data = load_audio(path)
    >>> data.shape
    (46797,)

    Say you load audio examples from a very long audio, you can provide a
    start position and a duration in samples or seconds.

    >>> path = '/net/db/timit/pcm/train/dr1/fcjf0/sa1.wav'
    >>> signal = load_audio(path, start=0, frames=16_000)
    >>> signal.shape
    (16000,)
    >>> signal = load_audio(path, start=0, frames=1, unit='seconds')
    >>> signal.shape
    (16000,)

    If the audio file is too short, only return the defined part:

    >>> signal = load_audio(path, start=0, frames=160_000)
    >>> signal.shape
    (46797,)

    >>> path = '/net/db/tidigits/tidigits/test/man/ah/111a.wav'
    >>> load_audio(path)  #doctest: +ELLIPSIS
    Traceback (most recent call last):
    ...
    RuntimeError: /net/db/tidigits/tidigits/test/man/ah/111a.wav: NIST SPHERE file
    <BLANKLINE>

    """

    # soundfile does not support pathlib.Path.
    # ToDo: Is this still True?
    path = normalize_path(path, as_str=True)

    if unit == 'samples':
        pass
    elif unit == 'seconds':
        if stop is not None:
            if stop < 0:
                raise NotImplementedError(unit, stop)
        # The sample rate is needed to convert seconds to samples.
        with soundfile.SoundFile(path) as f:
            # total_samples = len(f)
            samplerate = f.samplerate
        start = int(np.round(start * samplerate))
        if frames > 0:
            # A non-positive value keeps its "read the rest" meaning.
            frames = int(np.round(frames * samplerate))
        if stop is not None and stop > 0:
            stop = int(np.round(stop * samplerate))
    else:
        raise ValueError(unit)

    try:
        with soundfile.SoundFile(
                path,
                'r',
        ) as f:
            if dtype is None:
                from pb_chime5.mapping import Dispatcher
                # Counterpart of the subtypes that dump_audio writes.
                subtype_to_dtype = Dispatcher({
                    'PCM_16': np.int16,
                    'PCM_32': np.int32,
                    'FLOAT': np.float32,
                    'DOUBLE': np.float64,
                })
                dtype = subtype_to_dtype[f.subtype]

            # Same preparation that soundfile.read does: seek to start and
            # calculate the number of frames to read.
            frames = f._prepare_read(start=start, stop=stop, frames=frames)
            data = f.read(frames=frames, dtype=dtype, fill_value=fill_value)
            signal, sample_rate = data, f.samplerate
    except RuntimeError as e:
        if isinstance(path, (Path, str)):
            # path is usually a str here, hence convert before using suffix.
            suffix = Path(path).suffix
            if suffix == '.wav':
                # Improve exception msg for NIST SPHERE files.
                from pb_chime5.utils.process_caller import run_process
                cp = run_process(f'file {path}')
                stdout = cp.stdout
                raise RuntimeError(f'{stdout}') from e
            else:
                raise RuntimeError(
                    f'Wrong suffix {suffix} in {path}') from e
        raise

    if expected_sample_rate is not None:
        if expected_sample_rate != sample_rate:
            raise ValueError(
                f'Requested sampling rate is {expected_sample_rate} but the '
                f'audiofile has {sample_rate}')

    # When signal is multichannel, then soundfile returns
    # (samples, channels).
    # At NT it is more common to have the shape (channels, samples)
    # => transpose
    signal = signal.T

    if return_sample_rate:
        return signal, sample_rate
    else:
        return signal