def _stem_of_data_file(data_file: Path) -> str:
    """Returns the audio file stem encoded in the path of a data file.

    The stem is whatever lies between the last ``file_`` and the first
    following ``.wav_`` in the string form of the path.

    :param data_file: Path of the data file.
    :type data_file: pathlib.Path
    :return: The extracted stem.
    :rtype: str
    """
    return str(data_file).split('file_')[-1].split('.wav_')[0]


def _group_data_files_by_stem(dir_data_files: Path) \
        -> MutableMapping[str, MutableSequence[Path]]:
    """Indexes the entries of a directory by their audio file stem.

    :param dir_data_files: Directory holding the data files.
    :type dir_data_files: pathlib.Path
    :return: Mapping from stem to the entries carrying that stem, in\
             the order the directory yielded them.
    :rtype: dict[str, list[pathlib.Path]]
    """
    grouped = {}
    for data_file in dir_data_files.iterdir():
        grouped.setdefault(_stem_of_data_file(data_file), []).append(data_file)
    return grouped


def _check_data_file(data_file: Path,
                     file_name_audio: Path,
                     data_audio_original: Any,
                     csv_entry: MutableMapping[str, str],
                     words_list: MutableSequence[str],
                     chars_list: MutableSequence[str],
                     settings_ann: MutableMapping[str, Any]) -> None:
    """Validates one data file against its audio file and CSV entry.

    :param data_file: Data file (numpy object) to be checked.
    :type data_file: pathlib.Path
    :param file_name_audio: Name of the audio file (used in messages).
    :type file_name_audio: pathlib.Path
    :param data_audio_original: Audio data loaded from the audio file.
    :type data_audio_original: numpy.ndarray
    :param csv_entry: CSV entry of the audio file.
    :type csv_entry: collections.OrderedDict
    :param words_list: List of the words.
    :type words_list: list[str]
    :param chars_list: List of the characters.
    :type chars_list: list[str]
    :param settings_ann: Settings for annotations.
    :type settings_ann: dict
    :raises ValueError: If the audio data, the caption, the words\
                        indices, or the characters indices stored in\
                        the data file do not match the originals.
    """
    remove_specials = not settings_ann['use_special_tokens']

    # Get the numpy record array
    data_array = load_numpy_object(data_file)

    # Get the audio data from the numpy record array
    data_audio_rec_array = data_array['audio_data'].item()

    # Compare the lengths
    if len(data_audio_rec_array) != len(data_audio_original):
        raise ValueError(
            'File {f_audio} was not saved successfully to the numpy '
            'object {f_np}.'.format(f_audio=file_name_audio,
                                    f_np=data_file))

    # Check all elements at once. A vectorized comparison is used
    # because an element-by-element Python loop is slow and fails with
    # an ambiguous truth value when the audio has more than one channel.
    if not np.array_equal(data_audio_original, data_audio_rec_array):
        raise ValueError(
            'Numpy object {} has wrong audio data.'.format(data_file))

    # Get the original caption
    caption_index = data_array['caption_ind'].item()
    caption_stored = data_array['caption'].item()

    # Clean it to remove any spaces before punctuation.
    original_caption = clean_sentence(
        sentence=csv_entry[
            settings_ann['captions_fields_prefix'].format(caption_index + 1)],
        keep_case=True,
        remove_punctuation=False,
        remove_specials=remove_specials)

    # Check with the file caption
    caption_data_array = clean_sentence(
        sentence=caption_stored,
        keep_case=True,
        remove_punctuation=False,
        remove_specials=remove_specials)

    if not original_caption == caption_data_array:
        raise ValueError(
            'Numpy object {} has wrong caption.'.format(data_file))

    # Since caption in the file is OK, we can use it instead of
    # the original, because it already has the special tokens.
    caption_data_array = clean_sentence(
        sentence=caption_stored,
        keep_case=settings_ann['keep_case'],
        remove_punctuation=settings_ann['remove_punctuation_words'],
        remove_specials=remove_specials)

    # Check with the indices of words
    words_indices = data_array['words_ind'].item()
    caption_form_words = ' '.join([words_list[i] for i in words_indices])

    if not caption_data_array == caption_form_words:
        raise ValueError(
            'Numpy object {} has wrong words indices.'.format(data_file))

    # Check with the indices of characters
    caption_from_chars = ''.join(
        [chars_list[i] for i in data_array['chars_ind'].item()])

    caption_data_array = clean_sentence(
        sentence=caption_stored,
        keep_case=settings_ann['keep_case'],
        remove_punctuation=settings_ann['remove_punctuation_chars'],
        remove_specials=remove_specials)

    if not caption_data_array == caption_from_chars:
        raise ValueError('Numpy object {} has wrong characters '
                         'indices.'.format(data_file))


def check_data_for_split(dir_audio: Path,
                         dir_data: Path,
                         dir_root: Path,
                         csv_split: MutableSequence[MutableMapping[str, str]],
                         settings_ann: MutableMapping[str, Any],
                         settings_audio: MutableMapping[str, Any],
                         settings_cntr: MutableMapping[str, Any]) -> None:
    """Goes through all audio files and checks the created data.

    Gets each audio file and checks if there are associated data. If there\
    are, checks the validity of the raw audio data and the validity of the\
    captions, words, and characters.

    :param dir_audio: Directory with the audio files.
    :type dir_audio: pathlib.Path
    :param dir_data: Directory with the data to be checked.
    :type dir_data: pathlib.Path
    :param dir_root: Root directory.
    :type dir_root: pathlib.Path
    :param csv_split: CSV entries for the data.
    :type csv_split: list[collections.OrderedDict]
    :param settings_ann: Settings for annotations.
    :type settings_ann: dict
    :param settings_audio: Settings for audio.
    :type settings_audio: dict
    :param settings_cntr: Settings for counters.
    :type settings_cntr: dict
    :raises FileExistsError: If an audio file listed in the CSV is not\
                             in `dir_audio`, or if it has no data file.
    :raises ValueError: If a data file does not match its audio file or\
                        its CSV entry.
    """
    # Load the words and characters lists
    words_list = load_pickle_file(
        dir_root.joinpath(settings_cntr['words_list_file_name']))
    chars_list = load_pickle_file(
        dir_root.joinpath(settings_cntr['characters_list_file_name']))

    # The data directory is indexed only once, and only when the first
    # audio file has been found and loaded, so that a missing audio file
    # is still reported before the data directory is touched.
    data_files_by_stem = None

    for csv_entry in csv_split:
        # Get audio file name
        file_name_audio = Path(csv_entry[settings_ann['audio_file_column']])
        path_audio = dir_audio.joinpath(file_name_audio)

        # Check if the audio file existed originally
        # NOTE: FileExistsError is kept (instead of FileNotFoundError)
        # so that existing callers catching it keep working.
        if not path_audio.exists():
            raise FileExistsError(
                'Audio file {f_name_audio} not exists in {d_audio}'.format(
                    f_name_audio=file_name_audio, d_audio=dir_audio))

        # Get the original audio data
        data_audio_original = load_audio_file(
            audio_file=str(path_audio),
            sr=int(settings_audio['sr']),
            mono=settings_audio['to_mono'])

        if data_files_by_stem is None:
            data_files_by_stem = _group_data_files_by_stem(
                dir_root.joinpath(dir_data))

        # Get the data files that belong to this audio file
        data_files = data_files_by_stem.get(file_name_audio.stem, [])

        if not data_files:
            raise FileExistsError(
                'Audio file {} has no associated data.'.format(
                    file_name_audio))

        for data_file in data_files:
            _check_data_file(
                data_file=data_file,
                file_name_audio=file_name_audio,
                data_audio_original=data_audio_original,
                csv_entry=csv_entry,
                words_list=words_list,
                chars_list=chars_list,
                settings_ann=settings_ann)
def extract_features_test(root_dir: str,
                          settings_data: MutableMapping[str, Any],
                          settings_features: MutableMapping[str, Any],
                          settings_audio: MutableMapping[str, Any]) \
        -> None:
    """Extracts test features from the audio data of Clotho.

    For every regular file in the test audio directory, loads the audio,
    extracts the features, and serializes one numpy record array to the
    test features output directory.

    :param root_dir: Root dir for the data.
    :type root_dir: str
    :param settings_data: Settings for creating data files.
    :type settings_data: dict[str, T]
    :param settings_features: Settings for feature extraction.
    :type settings_features: dict[str, T]
    :param settings_audio: Settings for the audio.
    :type settings_audio: dict
    :raises AttributeError: If the test audio directory does not exist\
                            or is empty.
    """
    # Get the root directory.
    dir_root = Path(root_dir)

    # Get the directories of files.
    dir_test = dir_root.joinpath(settings_data['audio_dirs']['downloaded'],
                                 settings_data['audio_dirs']['test'])

    # `any` stops at the first directory entry instead of listing all.
    if not (dir_test.exists() and any(dir_test.iterdir())):
        raise AttributeError(
            'Testing workflow selected, but could not find the test set audio files. '
            'Please download the test set audio before making test predictions.'
        )

    # Get the directories for output.
    dir_output_test = dir_root.joinpath(
        settings_data['features_dirs']['output'],
        settings_data['features_dirs']['test'])

    words_list = load_pickle_file(
        dir_root.joinpath(settings_data['pickle_files_dir'],
                          settings_data['files']['words_list_file_name']))

    # Create the directories.
    dir_output_test.mkdir(parents=True, exist_ok=True)

    # Loop invariants: indices of the special tokens and the file name
    # template (test files have no caption, hence no caption index).
    ind_sos = words_list.index('<sos>')
    ind_eos = words_list.index('<eos>')
    file_template = settings_data['files'][
        'np_file_name_template'].replace('_{caption_index}', '')

    # Apply the function to each file and save the result.
    for data_file_name in filter(lambda _x: _x.is_file(), dir_test.iterdir()):
        # Load the audio
        audio = load_audio_file(audio_file=str(data_file_name),
                                sr=int(settings_audio['sr']),
                                mono=settings_audio['to_mono'])

        # Extract the features.
        features = feature_extraction(audio, **settings_features['process'])

        # Populate the recarray data and dtypes.
        array_data = (data_file_name.name, )
        dtypes = [('file_name', f'U{len(data_file_name.name)}')]

        # Check if we keeping the raw audio data.
        if settings_features['keep_raw_audio_data']:
            # And add them to the recarray data and dtypes. The whole
            # audio array goes in one field, so the field has to be an
            # object field; the scalar dtype of the samples cannot hold
            # an array.
            array_data += (audio, )
            dtypes.append(('audio_data', np.dtype(object)))

        # Add the rest to the recarray.
        # Word indices are required for the dataloader to work
        array_data += (features, np.array([ind_sos, ind_eos]))
        dtypes.extend([('features', np.dtype(object)),
                       ('words_ind', np.dtype(object))])

        # Make the recarray
        np_rec_array = np.rec.array([array_data], dtype=dtypes)

        # Make the path for serializing the recarray.
        file_path = dir_output_test.joinpath(
            file_template.format(audio_file_name=data_file_name.name))

        # Dump it.
        dump_numpy_object(np_rec_array, file_path)
def _first_index_lookup(items: MutableSequence[str]) \
        -> MutableMapping[str, int]:
    """Maps every item to the index of its first occurrence.

    :param items: Items to index.
    :type items: list[str]
    :return: Mapping from item to its first index in `items`, i.e.\
             the value `items.index(item)` would return.
    :rtype: dict[str, int]
    """
    lookup = {}
    for index, item in enumerate(items):
        lookup.setdefault(item, index)
    return lookup


def create_split_data(csv_split: MutableSequence[MutableMapping[str, str]],
                      dir_split: Path,
                      dir_audio: Path,
                      dir_root: Path,
                      words_list: MutableSequence[str],
                      chars_list: MutableSequence[str],
                      settings_ann: MutableMapping[str, Any],
                      settings_audio: MutableMapping[str, Any],
                      settings_output: MutableMapping[str, Any]) -> None:
    """Creates the data for the split.

    For every audio file of the split and every one of its captions,
    one numpy record array is written to `dir_split`. It holds the file
    name, the audio data, the caption, the caption index, and the
    indices of the caption's words and characters.

    :param csv_split: Annotations of the split.
    :type csv_split: list[collections.OrderedDict]
    :param dir_split: Directory for the split.
    :type dir_split: pathlib.Path
    :param dir_audio: Directory of the audio files for the split.
    :type dir_audio: pathlib.Path
    :param dir_root: Root directory of data.
    :type dir_root: pathlib.Path
    :param words_list: List of the words.
    :type words_list: list[str]
    :param chars_list: List of the characters.
    :type chars_list: list[str]
    :param settings_ann: Settings for the annotations.
    :type settings_ann: dict
    :param settings_audio: Settings for the audio.
    :type settings_audio: dict
    :param settings_output: Settings for the output files.
    :type settings_output: dict
    :raises ValueError: If a word or a character of a caption is not\
                        in `words_list` or `chars_list`, respectively.
    """
    # Make sure that the directory exists
    dir_split.mkdir(parents=True, exist_ok=True)

    captions_fields = [
        settings_ann['captions_fields_prefix'].format(i)
        for i in range(1, int(settings_ann['nb_captions']) + 1)
    ]

    # Lookup tables built once, so that each word/character is resolved
    # with a dictionary access instead of a linear scan of the lists.
    word_to_index = _first_index_lookup(words_list)
    char_to_index = _first_index_lookup(chars_list)

    # For each sound:
    for csv_entry in csv_split:
        file_name_audio = csv_entry[settings_ann['audio_file_column']]

        audio = load_audio_file(
            audio_file=str(dir_root.joinpath(dir_audio, file_name_audio)),
            sr=int(settings_audio['sr']),
            mono=settings_audio['to_mono'])

        for caption_ind, caption_field in enumerate(captions_fields):
            caption = csv_entry[caption_field]

            words_caption = get_sentence_words(
                caption,
                unique=settings_ann['use_unique_words_per_caption'],
                keep_case=settings_ann['keep_case'],
                remove_punctuation=settings_ann['remove_punctuation_words'],
                remove_specials=not settings_ann['use_special_tokens'])

            # The cleaned caption is split into its characters. The
            # special tokens are removed here and added back below as
            # whole tokens, so they are not split into characters.
            chars_caption = list(
                clean_sentence(
                    caption,
                    keep_case=settings_ann['keep_case'],
                    remove_punctuation=settings_ann[
                        'remove_punctuation_chars'],
                    remove_specials=True))

            if settings_ann['use_special_tokens']:
                chars_caption = ['<sos>', ' ', *chars_caption, ' ', '<eos>']

            try:
                indices_words = [word_to_index[word]
                                 for word in words_caption]
            except KeyError as err:
                # ValueError is what `list.index` raises for a missing item.
                raise ValueError(
                    f'{err.args[0]!r} is not in the words list') from err

            try:
                indices_chars = [char_to_index[char]
                                 for char in chars_caption]
            except KeyError as err:
                raise ValueError(
                    f'{err.args[0]!r} is not in the characters list') from err

            # create the numpy object with all elements
            np_rec_array = np.rec.array(
                np.array(
                    (file_name_audio, audio, caption, caption_ind,
                     np.array(indices_words), np.array(indices_chars)),
                    dtype=[('file_name', 'U{}'.format(len(file_name_audio))),
                           ('audio_data', np.dtype(object)),
                           ('caption', 'U{}'.format(len(caption))),
                           ('caption_ind', 'i4'),
                           ('words_ind', np.dtype(object)),
                           ('chars_ind', np.dtype(object))]))

            # save the numpy object to disk
            dump_numpy_object(
                np_obj=np_rec_array,
                file_name=str(
                    dir_split.joinpath(
                        settings_output['file_name_template'].format(
                            audio_file_name=file_name_audio,
                            caption_index=caption_ind))))