Exemplo n.º 1
0
def main():
    """Pick random validation files based on the rarity of their words.

    The captions are read from ``clotho_captions_development.csv`` through
    ``csv_functions.read_csv_file``. For every word, the number of entries
    (rows) whose captions contain it is counted. Each entry is then scored
    with the lowest such count over all words of its captions. Entries with
    a score strictly between 10 and 20 are candidates; at most 50 of them
    are picked at random, printed, and pickled to
    ``validation_file_names.pickle`` in the current working directory.
    """
    all_entries = csv_functions.read_csv_file(
        'clotho_captions_development.csv', 'data')

    # Count in how many entries each word shows up. The words of one entry
    # are merged into a set first, so a word repeated across the captions of
    # the same entry is counted only once.
    counter = Counter()
    for entry in all_entries:
        entry_words = [
            captions_functions.get_sentence_words(v, unique=True)
            for k, v in entry.items() if not k.startswith('file')
        ]
        counter.update(set(chain.from_iterable(entry_words)))

    results = []  # Only consumed by the optional histogram below.
    max_min = 0
    files_to_use = []
    max_files = 50

    for entry in all_entries:
        captions = [v for k, v in entry.items() if not k.startswith('file')]
        # Sentinel: stays at 1e6 when none of the captions has any word.
        min_freq = 1e6
        for caption in captions:
            word_freqs = [
                counter[word] for word in
                captions_functions.get_sentence_words(caption, unique=True)
            ]
            # A caption without words would make ``min`` receive a single
            # non-iterable argument and raise TypeError, so skip it.
            if word_freqs:
                min_freq = min(min_freq, min(word_freqs))
        max_min = max(max_min, min_freq)
        results.append({'file': entry.get('file_name'), 'min_freq': min_freq})
        if 10 < min_freq < 20:
            files_to_use.append(entry.get('file_name'))

    print(f'Max minimum freq is {max_min}')
    print(f'Amount of files that I can use is {len(files_to_use)}')
    # plt.hist([k['min_freq'] for k in results],
    #          bins=max_min,
    #          histtype='stepfilled')
    # plt.grid()
    # plt.show()

    # Random subset (without repetition) of at most ``max_files`` candidates.
    x = np.arange(len(files_to_use))
    final_files = [
        files_to_use[i] for i in np.random.permutation(x)[:max_files]
    ]
    for file_nb, file_name in enumerate(final_files, start=1):
        print(f'File {file_nb:02d} is {file_name}')

    p = Path('validation_file_names.pickle')

    print('Saving list of validation files...', end=' ')
    with p.open('wb') as pickle_file:
        pickle.dump(final_files, pickle_file)
    print('done.')
Exemplo n.º 2
0
def get_hist_of_words(files: MutableSequence[Path],
                      data_dir: Path) \
        -> Tuple[List[str], List[int]]:
    """Counts the occurrences of each word in the captions of the files.

    The captions are obtained with `get_all_captions_from_files` and every
    caption is split with `captions_functions.get_sentence_words`, keeping
    repeated words (`unique=False`) so that each occurrence is counted.

    :param files: Files to take the captions from.
    :type files: list[pathlib.Path]
    :param data_dir: Directory handed to `get_all_captions_from_files`.
    :type data_dir: pathlib.Path
    :return: The words and, at the same positions, their counts.
    :rtype: (list[str], list[int])
    """
    word_counts = Counter()

    captions_per_file = get_all_captions_from_files(
        files=files, data_dir=data_dir)

    for file_captions in captions_per_file:
        for caption in file_captions.values():
            caption_words = captions_functions.get_sentence_words(
                caption, unique=False)
            for word in caption_words:
                word_counts[word] += 1

    words = list(word_counts)
    frequencies = list(word_counts.values())

    return words, frequencies
Exemplo n.º 3
0
def create_split_data(csv_split: MutableSequence[MutableMapping[str, str]],
                      dir_split: Path, dir_audio: Path, dir_root: Path,
                      words_list: MutableSequence[str],
                      chars_list: MutableSequence[str],
                      settings_ann: MutableMapping[str, Any],
                      settings_audio: MutableMapping[str, Any],
                      settings_output: MutableMapping[str, Any]) -> None:
    """Writes one record per (audio file, caption) pair of a split.

    Every record holds the audio file name, the loaded audio data, the
    caption text, the index of the caption, and the caption encoded as
    indices in `words_list` and in `chars_list`.

    :param csv_split: Annotations of the split, one mapping per audio file.
    :type csv_split: list[collections.OrderedDict]
    :param dir_split: Directory where the records are written to; it is\
                      created if it does not exist.
    :type dir_split: pathlib.Path
    :param dir_audio: Directory with the audio files of the split,\
                      joined to `dir_root`.
    :type dir_audio: pathlib.Path
    :param dir_root: Root directory of the data.
    :type dir_root: pathlib.Path
    :param words_list: Words used for looking up the word indices.
    :type words_list: list[str]
    :param chars_list: Characters used for looking up the character indices.
    :type chars_list: list[str]
    :param settings_ann: Settings for the annotations.
    :type settings_ann: dict
    :param settings_audio: Settings for the audio.
    :type settings_audio: dict
    :param settings_output: Settings for the output files.
    :type settings_output: dict
    """
    # The output directory may not be there yet
    dir_split.mkdir(parents=True, exist_ok=True)

    # Names of the CSV columns with the captions (numbering starts at 1)
    nb_captions = int(settings_ann['nb_captions'])
    captions_fields = []
    for caption_nb in range(1, nb_captions + 1):
        captions_fields.append(
            settings_ann['captions_fields_prefix'].format(caption_nb))

    for csv_entry in csv_split:
        file_name_audio = csv_entry[settings_ann['audio_file_column']]

        # The audio is loaded once and shared by all captions of the file
        audio_path = dir_root.joinpath(dir_audio, file_name_audio)
        audio = load_audio_file(audio_file=str(audio_path),
                                sr=int(settings_audio['sr']),
                                mono=settings_audio['to_mono'])

        for caption_ind, caption_field in enumerate(captions_fields):
            caption = csv_entry[caption_field]

            words_caption = get_sentence_words(
                caption,
                unique=settings_ann['use_unique_words_per_caption'],
                keep_case=settings_ann['keep_case'],
                remove_punctuation=settings_ann['remove_punctuation_words'],
                remove_specials=not settings_ann['use_special_tokens'])

            # Flatten the cleaned caption into a single list of elements
            cleaned_caption = clean_sentence(
                caption,
                keep_case=settings_ann['keep_case'],
                remove_punctuation=settings_ann['remove_punctuation_chars'],
                remove_specials=True)
            chars_caption = list(chain.from_iterable(cleaned_caption))

            # Surround the characters with the start/end tokens, each one
            # separated from the caption by a space
            if settings_ann['use_special_tokens']:
                chars_caption = ['<sos>', ' ', *chars_caption, ' ', '<eos>']

            indices_words = list(map(words_list.index, words_caption))
            indices_chars = list(map(chars_list.index, chars_caption))

            # Put all elements in a single-record numpy array; the string
            # fields are exactly as wide as their values
            record = (file_name_audio, audio, caption, caption_ind,
                      np.array(indices_words), np.array(indices_chars))
            record_dtype = [
                ('file_name', f'U{len(file_name_audio)}'),
                ('audio_data', np.dtype(object)),
                ('caption', f'U{len(caption)}'),
                ('caption_ind', 'i4'),
                ('words_ind', np.dtype(object)),
                ('chars_ind', np.dtype(object)),
            ]
            np_rec_array = np.rec.array(np.array(record, dtype=record_dtype))

            # Save the record to disk, in the directory of the split
            output_name = settings_output['file_name_template'].format(
                audio_file_name=file_name_audio,
                caption_index=caption_ind)
            dump_numpy_object(np_obj=np_rec_array,
                              file_name=str(dir_split.joinpath(output_name)))