Пример #1
0
    def __init__(self,
                 root_dir: str,
                 split: str,
                 input_features_file_name: str,
                 target_values_input_name: str) \
            -> None:
        """Load the features and targets of one TUT SED Synthetic 2016 split.

        Both files are read from `<root_dir>/synthetic/<split>/` and stored
        on the instance as `x` (input features) and `y` (target values).

        :param root_dir: The root directory for the dataset.
        :type root_dir: str
        :param split: The split for the dataset (e.g. training).
        :type split: str
        :param input_features_file_name: Input features file name.
        :type input_features_file_name: str
        :param target_values_input_name: Target values file name.
        :type target_values_input_name: str
        """
        super(TUTSEDSynthetic2016, self).__init__()

        # Each split has its own directory below `synthetic`.
        split_dir = Path(root_dir, 'synthetic', split)

        self.x: ndarray = file_io.load_numpy_object(
            split_dir / input_features_file_name)
        self.y: ndarray = file_io.load_numpy_object(
            split_dir / target_values_input_name)
def _extract(data_file_name, settings_features, settings_data, dir_output_dev,
             dir_output_eva):
    """Extract the features of one data file and serialize the result.

    The input record is loaded, features are computed from its
    `audio_data` field, and a one-element record array holding the file
    name, (optionally) the raw audio, the features, and the caption related
    fields is dumped under the same file name in the matching output
    directory.

    :param data_file_name: Path of the input data file; its `parent.name`
                           and `name` attributes are used.
    :param settings_features: Settings for feature extraction; the keys
                              `process` and `keep_raw_audio_data` are read.
    :param settings_data: Settings for the data files; the entry
                          `['audio_dirs']['development']` is read.
    :param dir_output_dev: Output directory used when the input file lies
                           in the development directory.
    :param dir_output_eva: Output directory used otherwise.
    """
    record = load_numpy_object(data_file_name)

    # The features are always computed from the raw audio of the record.
    features = feature_extraction(record['audio_data'].item(),
                                  **settings_features['process'])

    # Collect every output field as a (name, value, dtype) triple, in the
    # order in which the fields appear in the serialized record array.
    columns = [('file_name',
                record['file_name'].item(),
                record['file_name'].dtype)]

    # The raw audio is carried over only when requested by the settings.
    if settings_features['keep_raw_audio_data']:
        columns.append(('audio_data',
                        record['audio_data'].item(),
                        record['audio_data'].dtype))

    columns.append(('features', features, np.dtype(object)))

    # The remaining fields are copied as they are from the input record.
    for field in ('caption', 'caption_ind', 'words_ind', 'chars_ind'):
        columns.append((field, record[field].item(), record[field].dtype))

    names, values, kinds = zip(*columns)
    np_rec_array = np.rec.array([values], dtype=list(zip(names, kinds)))

    # Files from the development directory go to the development output,
    # everything else goes to the evaluation output.
    from_development = (data_file_name.parent.name
                        == settings_data['audio_dirs']['development'])
    target_dir = dir_output_dev if from_development else dir_output_eva

    dump_numpy_object(np_rec_array, target_dir.joinpath(data_file_name.name))
Пример #3
0
def extract_features(root_dir: str,
                     settings_data: MutableMapping[str, Any],
                     settings_features: MutableMapping[str, Any]) \
        -> None:
    """Extracts features from the audio data of Clotho.

    Every `.npy` file in the development and evaluation data directories
    is loaded, features are computed from its `audio_data` field, and a
    one-element record array with the features and the caption related
    fields is written under the same file name to the corresponding
    features output directory.

    :param root_dir: Root dir for the data.
    :type root_dir: str
    :param settings_data: Settings for creating data files.
    :type settings_data: dict[str, T]
    :param settings_features: Settings for feature extraction.
    :type settings_features: dict[str, T]
    """
    root = Path(root_dir)

    # Input directories that hold the previously created data files.
    audio_dirs = settings_data['audio_dirs']
    input_root = root / audio_dirs['output']
    input_dev = input_root / audio_dirs['development']
    input_eva = input_root / audio_dirs['evaluation']

    # Output directories for the extracted features.
    dir_output_dev = root / settings_data['features_dirs']['output'] \
        / settings_data['features_dirs']['development']
    dir_output_eva = root / settings_data['features_dirs']['output'] \
        / settings_data['features_dirs']['evaluation']

    for output_dir in (dir_output_dev, dir_output_eva):
        output_dir.mkdir(parents=True, exist_ok=True)

    for npy_path in chain(input_dev.iterdir(), input_eva.iterdir()):
        # Only the serialized numpy objects are processed.
        if npy_path.suffix != '.npy':
            continue

        record = load_numpy_object(npy_path)

        features = feature_extraction(record['audio_data'].item(),
                                      **settings_features['process'])

        # Values and dtypes of the output record, kept in lockstep.
        values = [record['file_name'].item()]
        dtypes = [('file_name', record['file_name'].dtype)]

        # The raw audio is carried over only when requested.
        if settings_features['keep_raw_audio_data']:
            values.append(record['audio_data'].item())
            dtypes.append(('audio_data', record['audio_data'].dtype))

        values.append(features)
        dtypes.append(('features', np.dtype(object)))

        # The remaining fields are copied as they are from the input.
        for field in ('caption', 'caption_ind', 'words_ind', 'chars_ind'):
            values.append(record[field].item())
            dtypes.append((field, record[field].dtype))

        np_rec_array = np.rec.array([tuple(values)], dtype=dtypes)

        # Files from the development directory go to the development
        # output, everything else goes to the evaluation output.
        if npy_path.parent.name == settings_data['audio_dirs']['development']:
            target_dir = dir_output_dev
        else:
            target_dir = dir_output_eva

        dump_numpy_object(np_rec_array, target_dir.joinpath(npy_path.name))
Пример #4
0
def check_data_for_split(dir_audio: Path, dir_data: Path, dir_root: Path,
                         csv_split: MutableSequence[MutableMapping[str, str]],
                         settings_ann: MutableMapping[str, Any],
                         settings_audio: MutableMapping[str, Any],
                         settings_cntr: MutableMapping[str, Any]) -> None:
    """Goes through all audio files and checks the created data.

    Gets each audio file and checks if there are associated data. If there
    are, checks the validity of the raw audio data and the validity of the
    captions, words, and characters.

    A data file is associated with an audio file when the text between the
    last `file_` and the following `.wav_` in its path equals the stem of
    the audio file name.

    :param dir_audio: Directory with the audio files.
    :type dir_audio: pathlib.Path
    :param dir_data: Directory with the data to be checked.
    :type dir_data: pathlib.Path
    :param dir_root: Root directory.
    :type dir_root: pathlib.Path
    :param csv_split: CSV entries for the data.
    :type csv_split: list[collections.OrderedDict]
    :param settings_ann: Settings for annotations.
    :type settings_ann: dict
    :param settings_audio: Settings for audio.
    :type settings_audio: dict
    :param settings_cntr: Settings for counters.
    :type settings_cntr: dict
    :raises FileExistsError: If an audio file of a CSV entry does not exist\
                             in `dir_audio`, or if it has no associated\
                             data file.
    :raises ValueError: If a data file holds audio data, a caption, words\
                        indices, or characters indices that do not match\
                        the original audio file and CSV entry.
    """
    # Load the words and characters lists
    words_list = load_pickle_file(
        dir_root.joinpath(settings_cntr['words_list_file_name']))
    chars_list = load_pickle_file(
        dir_root.joinpath(settings_cntr['characters_list_file_name']))

    # The directory with the data files is the same for every CSV entry.
    dir_data_files = dir_root.joinpath(dir_data)

    for csv_entry in csv_split:
        # Get audio file name
        file_name_audio = Path(csv_entry[settings_ann['audio_file_column']])
        path_audio = dir_audio.joinpath(file_name_audio)

        # Check if the audio file existed originally.
        # NOTE: FileNotFoundError would describe this case better, but the
        # exception type is kept so that existing callers keep working.
        if not path_audio.exists():
            raise FileExistsError(
                'Audio file {f_name_audio} not exists in {d_audio}'.format(
                    f_name_audio=file_name_audio, d_audio=dir_audio))

        # Flag for checking if there are data files for the audio file
        audio_has_data_files = False

        # Get the original audio data
        data_audio_original = load_audio_file(
            audio_file=str(path_audio),
            sr=int(settings_audio['sr']),
            mono=settings_audio['to_mono'])

        for data_file in dir_data_files.iterdir():
            # Get the stem of the audio file name
            f_stem = str(data_file).split('file_')[-1].split('.wav_')[0]

            # Skip the data files that belong to other audio files.
            if f_stem != file_name_audio.stem:
                continue

            audio_has_data_files = True

            # Get the numpy record array
            data_array = load_numpy_object(data_file)

            # Get the audio data from the numpy record array
            data_audio_rec_array = data_array['audio_data'].item()

            # Compare the lengths
            if len(data_audio_rec_array) != len(data_audio_original):
                raise ValueError(
                    'File {f_audio} was not saved successfully to the numpy '
                    'object {f_np}.'.format(f_audio=file_name_audio,
                                            f_np=data_file))

            # Check all elements, one to one. The generator stops at the
            # first mismatch and avoids building a list with one entry per
            # audio sample.
            if not all(original == stored for original, stored in zip(
                    data_audio_original, data_audio_rec_array)):
                raise ValueError(
                    'Numpy object {} has wrong audio data.'.format(
                        data_file))

            # Get the index of the caption; the CSV caption column is
            # addressed with this index plus one.
            caption_index = data_array['caption_ind'].item()

            # Clean it to remove any spaces before punctuation.
            original_caption = clean_sentence(
                sentence=csv_entry[settings_ann['captions_fields_prefix'].
                                   format(caption_index + 1)],
                keep_case=True,
                remove_punctuation=False,
                remove_specials=not settings_ann['use_special_tokens'])

            # Check with the file caption
            caption_data_array = clean_sentence(
                sentence=data_array['caption'].item(),
                keep_case=True,
                remove_punctuation=False,
                remove_specials=not settings_ann['use_special_tokens'])

            if original_caption != caption_data_array:
                raise ValueError(
                    'Numpy object {} has wrong caption.'.format(data_file))

            # Since caption in the file is OK, we can use it instead of
            # the original, because it already has the special tokens.
            caption_data_array = clean_sentence(
                sentence=data_array['caption'].item(),
                keep_case=settings_ann['keep_case'],
                remove_punctuation=settings_ann[
                    'remove_punctuation_words'],
                remove_specials=not settings_ann['use_special_tokens'])

            # Check with the indices of words
            words_indices = data_array['words_ind'].item()
            caption_form_words = ' '.join(
                [words_list[i] for i in words_indices])

            if caption_data_array != caption_form_words:
                raise ValueError(
                    'Numpy object {} has wrong words indices.'.format(
                        data_file))

            # Check with the indices of characters
            caption_from_chars = ''.join(
                [chars_list[i] for i in data_array['chars_ind'].item()])

            # The characters use their own punctuation setting, so the
            # caption is cleaned again before the comparison.
            caption_data_array = clean_sentence(
                sentence=data_array['caption'].item(),
                keep_case=settings_ann['keep_case'],
                remove_punctuation=settings_ann[
                    'remove_punctuation_chars'],
                remove_specials=not settings_ann['use_special_tokens'])

            if caption_data_array != caption_from_chars:
                raise ValueError('Numpy object {} has wrong characters '
                                 'indices.'.format(data_file))

        # NOTE: same remark as above about the exception type.
        if not audio_has_data_files:
            raise FileExistsError(
                'Audio file {} has no associated data.'.format(
                    file_name_audio))