def __init__(self, root_dir: str, split: str, input_features_file_name: str, target_values_input_name: str) \
        -> None:
    """TUT SED Synthetic 2016 dataset class.

    Resolves the split directory under ``root_dir/synthetic`` and eagerly
    loads the input features and target values as numpy arrays.

    :param root_dir: The root directory for the dataset.
    :type root_dir: str
    :param split: The split for the dataset (e.g. training).
    :type split: str
    :param input_features_file_name: Input features file name.
    :type input_features_file_name: str
    :param target_values_input_name: Target values file name.
    :type target_values_input_name: str
    """
    super(TUTSEDSynthetic2016, self).__init__()

    # All split data live under <root>/synthetic/<split>.
    split_dir = Path(root_dir, 'synthetic', split)

    # Load both arrays up front; the dataset is fully in memory afterwards.
    self.x: ndarray = file_io.load_numpy_object(
        split_dir.joinpath(input_features_file_name))
    self.y: ndarray = file_io.load_numpy_object(
        split_dir.joinpath(target_values_input_name))
def _extract(data_file_name, settings_features, settings_data, dir_output_dev, dir_output_eva):
    """Extract features for one serialized data file and dump the result.

    Loads the numpy record array at ``data_file_name``, computes features
    from its audio data, rebuilds a record array (optionally keeping the
    raw audio), and serializes it into the development or evaluation
    output directory depending on the file's parent directory.

    :param data_file_name: Path of the input numpy object.
    :param settings_features: Feature-extraction settings.
    :param settings_data: Data-creation settings (used for directory names).
    :param dir_output_dev: Output directory for the development split.
    :param dir_output_eva: Output directory for the evaluation split.
    """
    rec = load_numpy_object(data_file_name)

    # Compute the features from the raw audio samples.
    features = feature_extraction(rec['audio_data'].item(),
                                  **settings_features['process'])

    # Accumulate record-array values and dtypes in parallel lists.
    values = [rec['file_name'].item()]
    fields = [('file_name', rec['file_name'].dtype)]

    # Optionally carry the raw audio along with the features.
    if settings_features['keep_raw_audio_data']:
        values.append(rec['audio_data'].item())
        fields.append(('audio_data', rec['audio_data'].dtype))

    values.extend([features,
                   rec['caption'].item(),
                   rec['caption_ind'].item(),
                   rec['words_ind'].item(),
                   rec['chars_ind'].item()])
    fields.extend([('features', np.dtype(object)),
                   ('caption', rec['caption'].dtype),
                   ('caption_ind', rec['caption_ind'].dtype),
                   ('words_ind', rec['words_ind'].dtype),
                   ('chars_ind', rec['chars_ind'].dtype)])

    np_rec_array = np.rec.array([tuple(values)], dtype=fields)

    # Route the output by which split the input file came from.
    is_development = (data_file_name.parent.name
                      == settings_data['audio_dirs']['development'])
    parent_path = dir_output_dev if is_development else dir_output_eva

    dump_numpy_object(np_rec_array, parent_path.joinpath(data_file_name.name))
def extract_features(root_dir: str, settings_data: MutableMapping[str, Any],
                     settings_features: MutableMapping[str, Any]) \
        -> None:
    """Extracts features from the audio data of Clotho.

    Walks the development and evaluation audio-data directories, and for
    every ``.npy`` file delegates per-file feature extraction and
    serialization to :func:`_extract`. (The loop body previously duplicated
    ``_extract`` verbatim; it now calls it instead.)

    :param root_dir: Root dir for the data.
    :type root_dir: str
    :param settings_data: Settings for creating data files.
    :type settings_data: dict[str, T]
    :param settings_features: Settings for feature extraction.
    :type settings_features: dict[str, T]
    """
    # Get the root directory.
    dir_root = Path(root_dir)

    # Get the directories of files.
    dir_output = dir_root.joinpath(settings_data['audio_dirs']['output'])
    dir_dev = dir_output.joinpath(settings_data['audio_dirs']['development'])
    dir_eva = dir_output.joinpath(settings_data['audio_dirs']['evaluation'])

    # Get the directories for output.
    dir_output_dev = dir_root.joinpath(
        settings_data['features_dirs']['output'],
        settings_data['features_dirs']['development'])
    dir_output_eva = dir_root.joinpath(
        settings_data['features_dirs']['output'],
        settings_data['features_dirs']['evaluation'])

    # Create the directories.
    dir_output_dev.mkdir(parents=True, exist_ok=True)
    dir_output_eva.mkdir(parents=True, exist_ok=True)

    # Apply the per-file extraction to each .npy file in both splits.
    for data_file_name in filter(lambda _x: _x.suffix == '.npy',
                                 chain(dir_dev.iterdir(),
                                       dir_eva.iterdir())):
        _extract(data_file_name, settings_features, settings_data,
                 dir_output_dev, dir_output_eva)
def check_data_for_split(dir_audio: Path, dir_data: Path, dir_root: Path,
                         csv_split: MutableSequence[MutableMapping[str, str]],
                         settings_ann: MutableMapping[str, Any],
                         settings_audio: MutableMapping[str, Any],
                         settings_cntr: MutableMapping[str, Any]) -> None:
    """Goes through all audio files and checks the created data.

    Gets each audio file and checks if there are associated data. If there are,\
    checks the validity of the raw audio data and the validity of the captions,\
    words, and characters.

    :param dir_audio: Directory with the audio files.
    :type dir_audio: pathlib.Path
    :param dir_data: Directory with the data to be checked.
    :type dir_data: pathlib.Path
    :param dir_root: Root directory.
    :type dir_root: pathlib.Path
    :param csv_split: CSV entries for the data/
    :type csv_split: list[collections.OrderedDict]
    :param settings_ann: Settings for annotations.
    :type settings_ann: dict
    :param settings_audio: Settings for audio.
    :type settings_audio: dict
    :param settings_cntr: Settings for counters.
    :type settings_cntr: dict
    """
    # Load the words and characters lists
    words_list = load_pickle_file(
        dir_root.joinpath(settings_cntr['words_list_file_name']))
    chars_list = load_pickle_file(
        dir_root.joinpath(settings_cntr['characters_list_file_name']))

    for csv_entry in csv_split:
        # Get audio file name
        file_name_audio = Path(csv_entry[settings_ann['audio_file_column']])

        # Check if the audio file existed originally
        # NOTE(review): FileExistsError is raised for a MISSING file;
        # FileNotFoundError looks like the intended type — confirm with callers
        # before changing, as it alters what they catch.
        if not dir_audio.joinpath(file_name_audio).exists():
            raise FileExistsError(
                'Audio file {f_name_audio} not exists in {d_audio}'.format(
                    f_name_audio=file_name_audio, d_audio=dir_audio))

        # Flag for checking if there are data files for the audio file
        audio_has_data_files = False

        # Get the original audio data
        data_audio_original = load_audio_file(audio_file=str(
            dir_audio.joinpath(file_name_audio)),
            sr=int(settings_audio['sr']),
            mono=settings_audio['to_mono'])

        # Scan every created data file; more than one may match (one per caption).
        for data_file in dir_root.joinpath(dir_data).iterdir():
            # Get the stem of the audio file name
            # (data file names are assumed to embed it as
            # ...file_<stem>.wav_... — TODO confirm against the creator code)
            f_stem = str(data_file).split('file_')[-1].split('.wav_')[0]

            if f_stem == file_name_audio.stem:
                audio_has_data_files = True
                # Get the numpy record array
                data_array = load_numpy_object(data_file)

                # Get the audio data from the numpy record array
                data_audio_rec_array = data_array['audio_data'].item()

                # Compare the lengths
                if len(data_audio_rec_array) != len(data_audio_original):
                    raise ValueError(
                        'File {f_audio} was not saved successfully to the numpy '
                        'object {f_np}.'.format(f_audio=file_name_audio,
                                                f_np=data_file))

                # Check all elements, one to one
                # NOTE(review): exact equality on samples — presumably floats;
                # valid only if saved and reloaded data are bit-identical.
                if not all([
                        data_audio_original[i] == data_audio_rec_array[i]
                        for i in range(len(data_audio_original))
                ]):
                    raise ValueError(
                        'Numpy object {} has wrong audio data.'.format(
                            data_file))

                # Get the original caption
                caption_index = data_array['caption_ind'].item()

                # Clean it to remove any spaces before punctuation.
                # (caption_ind is 0-based; CSV caption columns appear 1-based)
                original_caption = clean_sentence(
                    sentence=csv_entry[settings_ann['captions_fields_prefix'].
                                       format(caption_index + 1)],
                    keep_case=True,
                    remove_punctuation=False,
                    remove_specials=not settings_ann['use_special_tokens'])

                # Check with the file caption
                caption_data_array = clean_sentence(
                    sentence=data_array['caption'].item(),
                    keep_case=True,
                    remove_punctuation=False,
                    remove_specials=not settings_ann['use_special_tokens'])

                if not original_caption == caption_data_array:
                    raise ValueError(
                        'Numpy object {} has wrong caption.'.format(data_file))

                # Since caption in the file is OK, we can use it instead of
                # the original, because it already has the special tokens.
                # Re-clean with the word-level settings to compare against
                # the word-index reconstruction below.
                caption_data_array = clean_sentence(
                    sentence=data_array['caption'].item(),
                    keep_case=settings_ann['keep_case'],
                    remove_punctuation=settings_ann[
                        'remove_punctuation_words'],
                    remove_specials=not settings_ann['use_special_tokens'])

                # Check with the indices of words
                words_indices = data_array['words_ind'].item()
                caption_form_words = ' '.join(
                    [words_list[i] for i in words_indices])

                if not caption_data_array == caption_form_words:
                    raise ValueError(
                        'Numpy object {} has wrong words indices.'.format(
                            data_file))

                # Check with the indices of characters
                caption_from_chars = ''.join(
                    [chars_list[i] for i in data_array['chars_ind'].item()])

                # Re-clean with the character-level punctuation setting.
                caption_data_array = clean_sentence(
                    sentence=data_array['caption'].item(),
                    keep_case=settings_ann['keep_case'],
                    remove_punctuation=settings_ann[
                        'remove_punctuation_chars'],
                    remove_specials=not settings_ann['use_special_tokens'])

                if not caption_data_array == caption_from_chars:
                    raise ValueError('Numpy object {} has wrong characters '
                                     'indices.'.format(data_file))

        # NOTE(review): also raised as FileExistsError, though no file
        # "already exists" here — see note above.
        if not audio_has_data_files:
            raise FileExistsError(
                'Audio file {} has no associated data.'.format(
                    file_name_audio))