Example #1
def main():

    args = parser.parse_args()
    htk_save_path = mkdir(args.htk_save_path)
    path = Path(data_path=args.data_path,
                config_path=args.config_path)

    # HTK settings
    save_config(audio_file_type='nist',
                feature_type=args.feature_type,
                channels=args.channels,
                config_save_path='./config',
                sampling_rate=16000,
                window=args.window,
                slide=args.slide,
                energy=bool(args.energy),
                delta=bool(args.delta),
                deltadelta=bool(args.deltadelta))
    # NOTE: 123-dim features are extracted by default

    for data_type in ['train', 'dev', 'test']:

        wav_paths = path.wav(data_type=data_type)
        save_path = mkdir_join(htk_save_path, data_type)

        with open('./config/wav2htk_' + data_type + '.scp', 'w') as f:
            for wav_path in wav_paths:
                speaker = wav_path.split('/')[-2]
                utt_index = basename(wav_path).split('.')[0]
                save_path_tmp = mkdir_join(
                    save_path, speaker, utt_index + '.htk')
                f.write(wav_path + '  ' + save_path_tmp + '\n')
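Most of these examples lean on small path helpers (mkdir, mkdir_join) that are not shown in the snippets. Below is a minimal sketch of what they are assumed to do, inferred from the call sites above; the actual implementation in the project's utility module may differ.

import os
from os.path import join, splitext


def mkdir(path):
    # Create the directory if it does not already exist and return its path.
    os.makedirs(path, exist_ok=True)
    return path


def mkdir_join(path, *parts):
    # Join the path components, create the directory portion (everything up to
    # a trailing file name such as "xxx.htk"), and return the full joined path.
    full_path = join(path, *parts)
    dir_part = os.path.dirname(full_path) if splitext(full_path)[1] else full_path
    os.makedirs(dir_part, exist_ok=True)
    return full_path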
Example #2
def main():

    args = parser.parse_args()
    htk_save_path = mkdir(args.htk_save_path)

    # HTK settings
    save_config(audio_file_type='wav',
                feature_type=args.feature_type,
                channels=args.channels,
                config_save_path='./config',
                sampling_rate=8000,
                window=args.window,
                slide=args.slide,
                energy=bool(args.energy),
                delta=bool(args.delta),
                deltadelta=bool(args.deltadelta))
    # NOTE: 120-dim features are extracted by default

    # Switchboard
    with open('./config/wav2htk_swbd.scp', 'w') as f:
        for wav_path in glob(join(args.wav_save_path, 'swbd/*.wav')):
            # ex.) wav_path: wav/swbd/*.wav
            save_path = mkdir_join(htk_save_path, 'swbd',
                                   basename(wav_path).split('.')[0] + '.htk')
            f.write(wav_path + '  ' + save_path + '\n')
            # ex.) htk_path: htk/swbd/*.htk

    # eval2000 (swbd)
    with open('./config/wav2htk_eval2000_swbd.scp', 'w') as f:
        for wav_path in glob(join(args.wav_save_path, 'eval2000/swbd/*.wav')):
            # ex.) wav_path: wav/eval2000/swbd/*.wav
            save_path = mkdir_join(htk_save_path, 'eval2000', 'swbd',
                                   basename(wav_path).split('.')[0] + '.htk')
            f.write(wav_path + '  ' + save_path + '\n')
            # ex.) htk_path: htk/eval2000/swbd/*.htk

    # eval2000 (callhome)
    with open('./config/wav2htk_eval2000_ch.scp', 'w') as f:
        for wav_path in glob(
                join(args.wav_save_path, 'eval2000/callhome/*.wav')):
            # ex.) wav_path: wav/eval2000/callhome/*.wav
            save_path = mkdir_join(htk_save_path, 'eval2000', 'callhome',
                                   basename(wav_path).split('.')[0] + '.htk')
            f.write(wav_path + '  ' + save_path + '\n')
            # ex.) htk_path: htk/eval2000/callhome/*.htk

    # Fisher
    if bool(args.fisher):
        with open('./config/wav2htk_fisher.scp', 'w') as f:
            for wav_path in glob(join(args.wav_save_path, 'fisher/*/*.wav')):
                # ex.) wav_path: wav/fisher/speaker/*.wav
                speaker = wav_path.split('/')[-2]
                save_path = mkdir_join(
                    htk_save_path, 'fisher', speaker,
                    basename(wav_path).split('.')[0] + '.htk')
                f.write(wav_path + '  ' + save_path + '\n')
Example #3
def split_wav(wav_paths, save_path, speaker_dict):
    """Read WAV files & divide them with respect to each utterance.
    Args:
        wav_paths (list): path to WAV files
        save_path (string): path to save WAV files
        speaker_dict (dict): the dictionary of utterances of each speaker
            key => speaker
            value => the dictionary of utterance information of each speaker
                key => utterance index
                value => [start_frame, end_frame, transcript]
    """
    # Read each WAV file
    print('==> Reading WAV files...')
    print(speaker_dict.keys())
    for wav_path in tqdm(wav_paths):
        speaker = basename(wav_path).split('.')[0]

        # NOTE: For Switchboard
        speaker = speaker.replace('sw0', 'sw')
        speaker = speaker.replace('sw_', 'sw')
        speaker = speaker.replace('en_', 'en')

        utt_dict = speaker_dict[speaker]
        wav_utt_save_path = mkdir_join(save_path, speaker)

        # Read a wav file
        audio = Audio(file_path=wav_path)
        audio_data = audio.read()

        # Split per utterance & save as wav files
        audio.split(audio_data, utt_dict, speaker, save_path=wav_utt_save_path)
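For reference, the speaker_dict passed to this function is expected to look roughly like the following. This is a hand-written illustration following the docstring above, not real corpus data.

# Hypothetical speaker_dict layout (speakers, indices, and frames are made up)
speaker_dict = {
    'sw2001-A': {                                  # key: speaker
        '0001': [0, 312, 'hello how are you'],     # [start_frame, end_frame, transcript]
        '0002': [320, 598, 'i am doing fine'],
    },
    'sw2001-B': {
        '0001': [315, 410, 'pretty good thanks'],
    },
}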
Example #4
    def test(self):

        speaker_dict_a, char_set_a, char_capital_set_a, word_count_dict_a = read_trans_fisher(
            label_paths=label_paths_fisher,
            target_speaker='A')
        speaker_dict_b, char_set_b, char_capital_set_b, word_count_dict_b = read_trans_fisher(
            label_paths=label_paths_fisher, target_speaker='B')

        # Merge the two dictionaries
        speaker_dict_fisher = merge_dicts([speaker_dict_a, speaker_dict_b])
        char_set = char_set_a | char_set_b
        char_capital_set = char_capital_set_a | char_capital_set_b
        word_count_dict_fisher = dict(
            Counter(word_count_dict_a) + Counter(word_count_dict_b))

        self.speaker_dict = read_trans_swbd(
            label_paths=label_paths_swbd,
            run_root_path='../',
            vocab_file_save_path=mkdir_join('../config/vocab_files'),
            save_vocab_file=True,
            speaker_dict_fisher=speaker_dict_fisher,
            char_set=char_set,
            char_capital_set=char_capital_set,
            word_count_dict=word_count_dict_fisher)

        self.check(normalize='global', tool='htk')
        self.check(normalize='speaker', tool='htk')
        self.check(normalize='utterance', tool='htk')
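merge_dicts is not defined in this snippet; from its usage it is assumed to fold the per-channel speaker dictionaries into one. A minimal sketch under that assumption:

def merge_dicts(dicts):
    # Assumed behaviour: merge a list of dictionaries left to right
    # (keys from later dictionaries overwrite earlier ones on collision).
    merged = {}
    for d in dicts:
        merged.update(d)
    return merged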
Example #5
def main():

    args = parser.parse_args()
    htk_save_path = mkdir(args.htk_save_path)
    path = Path(data_path=args.data_path, config_path='./config')

    # HTK settings
    save_config(audio_file_type='wav',
                feature_type=args.feature_type,
                channels=args.channels,
                config_save_path='./config',
                sampling_rate=16000,
                window=args.window,
                slide=args.slide,
                energy=bool(args.energy),
                delta=bool(args.delta),
                deltadelta=bool(args.deltadelta))

    data_types = ['eval1', 'eval2', 'eval3']

    if bool(args.subset):
        data_types += ['train_subset']
    if bool(args.fullset):
        data_types += ['train_fullset']

    for data_type in data_types:
        wav_paths = path.wav(data_type=data_type)
        save_path = mkdir_join(htk_save_path, data_type)

        with open('./config/wav2htk_' + data_type + '.scp', 'w') as f:
            for wav_path in wav_paths:
                speaker = basename(wav_path).split('.')[0]
                save_path_tmp = join(save_path, speaker + '.htk')
                f.write(wav_path + '  ' + save_path_tmp + '\n')
Example #6
    def check(self):

        read_trans(
            label_paths=label_paths,
            word_boundary_paths=wb_paths,
            run_root_path='../',
            vocab_file_save_path=mkdir_join('../config/vocab_files'),
            save_vocab_file=True)

    def test(self):

        self.speaker_dict = read_trans(
            label_paths=label_paths,
            word_boundary_paths=wb_paths,
            run_root_path='../',
            vocab_file_save_path=mkdir_join('../config/vocab_files'),
            save_vocab_file=False)

        self.check(normalize='global', tool='htk')
        self.check(normalize='speaker', tool='htk')
        self.check(normalize='utterance', tool='htk')
Example #8
    def check(self):

        for data_type in ['train', 'dev', 'test']:
            save_vocab_file = True if data_type == 'train' else False
            is_test = True if data_type == 'test' else False

            print('---------- %s ----------' % data_type)
            trans_dict = read_phone(
                label_paths=label_paths[data_type],
                vocab_file_save_path=mkdir_join('../config/vocab_files'),
                save_vocab_file=save_vocab_file,
                is_test=is_test)

            print(trans_dict)
Example #9
def posterior_test(session, posteriors_op, network, dataset, label_type,
                   rate=1.0):
    """Visualize label posteriors.
    Args:
        session: session of training model
        posteriors_op: operation for computing posteriors
        network: network to evaluate
        dataset: Dataset class
        label_type: phone39 or phone48 or phone61 or character
        rate: rate of evaluation data to use
    """
    save_path = mkdir_join(network.model_dir, 'ctc_output')
    batch_size = 1
    num_examples = dataset.data_num * rate
    iteration = int(num_examples / batch_size)
    if (num_examples / batch_size) != int(num_examples / batch_size):
        iteration += 1

    for step in range(iteration):
        # Create feed dictionary for next mini batch
        inputs, _, seq_len, input_names = dataset.next_batch(
            batch_size=batch_size)

        feed_dict = {
            network.inputs_pl: inputs,
            network.seq_len_pl: seq_len,
            network.keep_prob_input_pl: 1.0,
            network.keep_prob_hidden_pl: 1.0
        }

        # Visualize
        batch_size_each = len(seq_len)
        max_frame_num = inputs.shape[1]
        posteriors = session.run(posteriors_op, feed_dict=feed_dict)
        for i_batch in range(batch_size_each):
            # Gather all frames of this utterance from the flattened posterior
            # matrix (frame t of utterance b sits at row b + batch_size_each * t)
            posteriors_index = np.array([i_batch + (batch_size_each * j)
                                         for j in range(max_frame_num)])
            if label_type != 'character':
                probs.plot_probs_ctc_phone(probs=posteriors[posteriors_index][:int(seq_len[i_batch]), :],
                                           save_path=save_path,
                                           wav_index=input_names[i_batch],
                                           data_type=dataset.data_type,
                                           label_type=label_type)
Example #10
    def check(self, data_size):

        print('=' * 50)
        print('  data_size: %s' % str(data_size))
        print('=' * 50)

        for data_type in ['train', 'dev_clean', 'dev_other', 'test_clean', 'test_other']:
            if data_type == 'train':
                label_paths = path.trans(data_type='train_' + data_size)
            else:
                label_paths = path.trans(data_type=data_type)
            save_vocab_file = True if data_type == 'train' else False
            is_test = True if 'test' in data_type else False

            print('---------- %s ----------' % data_type)
            read_trans(
                label_paths=label_paths,
                data_size=data_size,
                vocab_file_save_path=mkdir_join('../config/vocab_files'),
                is_test=is_test,
                data_type=data_type)
Example #11
    def check(self, data_size):

        print('=' * 50)
        print('  data_size: %s' % str(data_size))
        print('=' * 50)

        for data_type in ['dev', 'eval1', 'eval2', 'eval3']:
            if data_type == 'train':
                label_paths = path.trans(data_type='train_' + data_size)
            else:
                label_paths = path.trans(data_type=data_type)
            save_vocab_file = True if data_type == 'train' else False
            is_test = True if 'eval' in data_type else False

            print('---------- %s ----------' % data_type)
            read_sdb(
                label_paths=label_paths,
                data_size=data_size,
                vocab_file_save_path=mkdir_join('../config', 'vocab_files'),
                is_test=is_test,
                data_type=data_type)
Example #12
def main():

    args = parser.parse_args()
    htk_save_path = mkdir(args.htk_save_path)

    # HTK settings
    save_config(audio_file_type='wav',
                feature_type=args.feature_type,
                channels=args.channels,
                config_save_path='./config',
                sampling_rate=16000,
                window=args.window,
                slide=args.slide,
                energy=bool(args.energy),
                delta=bool(args.delta),
                deltadelta=bool(args.deltadelta))
    # NOTE: 120-dim features are extracted by default

    parts = [
        'train-clean-100', 'dev-clean', 'dev-other', 'test-clean', 'test-other'
    ]

    if bool(args.large):
        parts += ['train-clean-360', 'train-other-500']
    elif bool(args.medium):
        parts += ['train-clean-360']

    for part in parts:
        # part/speaker/book/*.wav
        wav_paths = glob(join(args.data_path, part, '*/*/*.wav'))
        with open('./config/wav2htk_' + part + '.scp', 'w') as f:
            for wav_path in wav_paths:
                # ex.) wav_path: speaker/book/speaker-book-utt_index.wav
                speaker, book, utt_index = basename(wav_path).split(
                    '.')[0].split('-')
                save_path = mkdir_join(
                    htk_save_path, part, speaker, book,
                    basename(wav_path).split('.')[0] + '.htk')
                f.write(wav_path + '  ' + save_path + '\n')
Example #13
def split_wav(wav_paths, save_path, speaker_dict):
    """Read WAV files & divide them with respect to each utterance.
    Args:
        wav_paths (list): path to WAV files
        save_path (string): path to save WAV files
        speaker_dict (dict): the dictionary of utterances of each speaker
            key => speaker
            value => the dictionary of utterance information of each speaker
                key => utterance index
                value => [start_frame, end_frame, transcript]
    """
    # Read each WAV file
    print('==> Reading WAV files...')
    for wav_path in tqdm(wav_paths):
        speaker = basename(wav_path).split('.')[0]

        # NOTE: For Switchboard
        speaker = speaker.replace('sw0', 'sw')
        speaker = speaker.replace('sw_', 'sw')
        speaker = speaker.replace('en_', 'en')

        if 'subject' in speaker:
            speaker = '_'.join(speaker.split('_')[:2]) + '_U'
        elif 'operator' in speaker:
            speaker = '_'.join(speaker.split('_')[:2]) + '_S'

        utt_dict = speaker_dict[speaker]
        wav_utt_save_path = mkdir_join(save_path, speaker)

        # Read a wav file
        audio = Audio(file_path=wav_path)
        audio_data = audio.read()

        # Split per utterance & save as wav files
        audio.split(audio_data, utt_dict, speaker, save_path=wav_utt_save_path)

    # Save the frame number dictionary
    with open(join(save_path, 'frame_num.pickle'), 'wb') as f:
        pickle.dump(audio.frame_num_dict, f)
Example #14
def main(data_size):

    speaker_dict_dict = {}  # dict of speaker_dict
    for data_type in ['train', 'eval1', 'eval2', 'eval3']:
        print('=' * 50)
        print(' ' * 20 + data_type + ' (' + data_size + ')' + ' ' * 20)
        print('=' * 50)

        ########################################
        # labels
        ########################################
        if data_type == 'train':
            label_paths = path.trans(data_type='train_' + data_size)
        else:
            label_paths = path.trans(data_type=data_type)
        save_vocab_file = True if data_type == 'train' else False
        is_test = True if 'eval' in data_type else False

        print('=> Processing transcripts...')
        speaker_dict_dict[data_type] = read_sdb(
            label_paths=label_paths,
            data_size=data_size,
            vocab_file_save_path=mkdir_join('./config', 'vocab_files'),
            save_vocab_file=save_vocab_file,
            is_test=is_test,
            data_type=data_type)

        ########################################
        # inputs
        ########################################
        print('\n=> Processing input data...')
        input_save_path = mkdir_join(args.feature_save_path, args.save_format,
                                     data_size)
        if isfile(join(input_save_path, data_type, 'complete.txt')):
            print('Already exists.')
        else:
            if args.save_format == 'wav':
                ########################################
                # Split WAV files per utterance
                ########################################
                if data_type == 'train':
                    wav_paths = path.wav(corpus='train' + data_size)
                else:
                    wav_paths = path.wav(corpus=data_type)

                split_wav(wav_paths=wav_paths,
                          speaker_dict=speaker_dict_dict[data_type],
                          save_path=mkdir_join(input_save_path, data_type))
                # NOTE: ex.) save_path:
                # csj/feature/save_format/data_size/data_type/speaker/utt_name.npy

            elif args.save_format in ['numpy', 'htk']:
                if data_type == 'train':
                    if args.tool == 'htk':
                        audio_paths = path.htk(data_type='train_' + data_size)
                    else:
                        audio_paths = path.wav(data_type='train_' + data_size)
                    is_training = True
                    global_mean_male, global_std_male, global_mean_female, global_std_female = None, None, None, None
                else:
                    if args.tool == 'htk':
                        audio_paths = path.htk(data_type=data_type)
                    else:
                        audio_paths = path.wav(data_type=data_type)
                    is_training = False

                    # Load statistics over train dataset
                    global_mean_male = np.load(
                        join(input_save_path, 'train/global_mean_male.npy'))
                    global_std_male = np.load(
                        join(input_save_path, 'train/global_std_male.npy'))
                    global_mean_female = np.load(
                        join(input_save_path, 'train/global_mean_female.npy'))
                    global_std_female = np.load(
                        join(input_save_path, 'train/global_std_female.npy'))

                read_audio(audio_paths=audio_paths,
                           speaker_dict=speaker_dict_dict[data_type],
                           tool=args.tool,
                           config=CONFIG,
                           normalize=args.normalize,
                           is_training=is_training,
                           save_path=mkdir_join(input_save_path, data_type),
                           save_format=args.save_format,
                           global_mean_male=global_mean_male,
                           global_std_male=global_std_male,
                           global_mean_female=global_mean_female,
                           global_std_female=global_std_female)
                # NOTE: ex.) save_path:
                # csj/feature/save_format/data_size/data_type/speaker/*.npy

            # Make a confirmation file to prove that dataset was saved
            # correctly
            with open(join(input_save_path, data_type, 'complete.txt'),
                      'w') as f:
                f.write('')

        ########################################
        # dataset (csv)
        ########################################
        print('\n=> Saving dataset files...')
        dataset_save_path = mkdir_join(args.dataset_save_path,
                                       args.save_format, data_size, data_type)

        df_columns = ['frame_num', 'input_path', 'transcript']
        df_kanji = pd.DataFrame([], columns=df_columns)
        df_kanji_divide = pd.DataFrame([], columns=df_columns)
        df_kana = pd.DataFrame([], columns=df_columns)
        df_kana_divide = pd.DataFrame([], columns=df_columns)
        df_phone = pd.DataFrame([], columns=df_columns)
        df_phone_divide = pd.DataFrame([], columns=df_columns)
        df_word_freq1 = pd.DataFrame([], columns=df_columns)
        df_word_freq5 = pd.DataFrame([], columns=df_columns)
        df_word_freq10 = pd.DataFrame([], columns=df_columns)
        df_word_freq15 = pd.DataFrame([], columns=df_columns)

        with open(join(input_save_path, data_type, 'frame_num.pickle'),
                  'rb') as f:
            frame_num_dict = pickle.load(f)

        utt_count = 0
        df_kanji_list, df_kanji_divide_list = [], []
        df_kana_list, df_kana_divide_list = [], []
        df_phone_list, df_phone_divide_list = [], []
        df_word_freq1_list, df_word_freq5_list = [], []
        df_word_freq10_list, df_word_freq15_list = [], []
        speaker_dict = speaker_dict_dict[data_type]
        for speaker, utt_dict in tqdm(speaker_dict.items()):
            for utt_index, utt_info in utt_dict.items():
                kanji_indices, kanji_divide_indices = utt_info[2:4]
                kana_indices, kana_divide_indices = utt_info[4:6]
                phone_indices, phone_divide_indices = utt_info[6:8]
                word_freq1_indices, word_freq5_indices = utt_info[8:10]
                word_freq10_indices, word_freq15_indices = utt_info[10:12]

                if args.save_format == 'numpy':
                    input_utt_save_path = join(
                        input_save_path, data_type, speaker,
                        speaker + '_' + utt_index + '.npy')
                elif args.save_format == 'htk':
                    input_utt_save_path = join(
                        input_save_path, data_type, speaker,
                        speaker + '_' + utt_index + '.htk')
                elif args.save_format == 'wav':
                    input_utt_save_path = path.utt2wav(utt_index)
                else:
                    raise ValueError("save_format must be 'numpy', 'htk', or 'wav'.")

                assert isfile(input_utt_save_path)
                frame_num = frame_num_dict[speaker + '_' + utt_index]

                df_kanji = add_element(
                    df_kanji, [frame_num, input_utt_save_path, kanji_indices])
                df_kanji_divide = add_element(
                    df_kanji_divide,
                    [frame_num, input_utt_save_path, kanji_divide_indices])
                df_kana = add_element(
                    df_kana, [frame_num, input_utt_save_path, kana_indices])
                df_kana_divide = add_element(
                    df_kana_divide,
                    [frame_num, input_utt_save_path, kana_divide_indices])
                df_phone = add_element(
                    df_phone, [frame_num, input_utt_save_path, phone_indices])
                df_phone_divide = add_element(
                    df_phone_divide,
                    [frame_num, input_utt_save_path, phone_divide_indices])
                df_word_freq1 = add_element(
                    df_word_freq1,
                    [frame_num, input_utt_save_path, word_freq1_indices])
                df_word_freq5 = add_element(
                    df_word_freq5,
                    [frame_num, input_utt_save_path, word_freq5_indices])
                df_word_freq10 = add_element(
                    df_word_freq10,
                    [frame_num, input_utt_save_path, word_freq10_indices])
                df_word_freq15 = add_element(
                    df_word_freq15,
                    [frame_num, input_utt_save_path, word_freq15_indices])
                utt_count += 1

                # Reset
                if utt_count == 10000:
                    df_kanji_list.append(df_kanji)
                    df_kanji_divide_list.append(df_kanji_divide)
                    df_kana_list.append(df_kana)
                    df_kana_divide_list.append(df_kana_divide)
                    df_phone_list.append(df_phone)
                    df_phone_divide_list.append(df_phone_divide)
                    df_word_freq1_list.append(df_word_freq1)
                    df_word_freq5_list.append(df_word_freq5)
                    df_word_freq10_list.append(df_word_freq10)
                    df_word_freq15_list.append(df_word_freq15)

                    df_kanji = pd.DataFrame([], columns=df_columns)
                    df_kanji_divide = pd.DataFrame([], columns=df_columns)
                    df_kana = pd.DataFrame([], columns=df_columns)
                    df_kana_divide = pd.DataFrame([], columns=df_columns)
                    df_phone = pd.DataFrame([], columns=df_columns)
                    df_phone_divide = pd.DataFrame([], columns=df_columns)
                    df_word_freq1 = pd.DataFrame([], columns=df_columns)
                    df_word_freq5 = pd.DataFrame([], columns=df_columns)
                    df_word_freq10 = pd.DataFrame([], columns=df_columns)
                    df_word_freq15 = pd.DataFrame([], columns=df_columns)
                    utt_count = 0

        # Last dataframe
        df_kanji_list.append(df_kanji)
        df_kanji_divide_list.append(df_kanji_divide)
        df_kana_list.append(df_kana)
        df_kana_divide_list.append(df_kana_divide)
        df_phone_list.append(df_phone)
        df_phone_divide_list.append(df_phone_divide)
        df_word_freq1_list.append(df_word_freq1)
        df_word_freq5_list.append(df_word_freq5)
        df_word_freq10_list.append(df_word_freq10)
        df_word_freq15_list.append(df_word_freq15)

        # Concatenate all dataframes
        df_kanji = df_kanji_list[0]
        df_kanji_divide = df_kanji_divide_list[0]
        df_kana = df_kana_list[0]
        df_kana_divide = df_kana_divide_list[0]
        df_phone = df_phone_list[0]
        df_phone_divide = df_phone_divide_list[0]
        df_word_freq1 = df_word_freq1_list[0]
        df_word_freq5 = df_word_freq5_list[0]
        df_word_freq10 = df_word_freq10_list[0]
        df_word_freq15 = df_word_freq15_list[0]

        for df_i in df_kanji_list[1:]:
            df_kanji = pd.concat([df_kanji, df_i], axis=0)
        for df_i in df_kanji_divide_list[1:]:
            df_kanji_divide = pd.concat([df_kanji_divide, df_i], axis=0)
        for df_i in df_kana_list[1:]:
            df_kana = pd.concat([df_kana, df_i], axis=0)
        for df_i in df_kana_divide_list[1:]:
            df_kana_divide = pd.concat([df_kana_divide, df_i], axis=0)
        for df_i in df_phone_list[1:]:
            df_phone = pd.concat([df_phone, df_i], axis=0)
        for df_i in df_phone_divide_list[1:]:
            df_phone_divide = pd.concat([df_phone_divide, df_i], axis=0)
        for df_i in df_word_freq1_list[1:]:
            df_word_freq1 = pd.concat([df_word_freq1, df_i], axis=0)
        for df_i in df_word_freq5_list[1:]:
            df_word_freq5 = pd.concat([df_word_freq5, df_i], axis=0)
        for df_i in df_word_freq10_list[1:]:
            df_word_freq10 = pd.concat([df_word_freq10, df_i], axis=0)
        for df_i in df_word_freq15_list[1:]:
            df_word_freq15 = pd.concat([df_word_freq15, df_i], axis=0)

        df_kanji.to_csv(join(dataset_save_path, 'kanji.csv'))
        df_kanji_divide.to_csv(join(dataset_save_path, 'kanji_divide.csv'))
        df_kana.to_csv(join(dataset_save_path, 'kana.csv'))
        df_kana_divide.to_csv(join(dataset_save_path, 'kana_divide.csv'))
        df_phone.to_csv(join(dataset_save_path, 'phone.csv'))
        df_phone_divide.to_csv(join(dataset_save_path, 'phone_divide.csv'))
        df_word_freq1.to_csv(join(dataset_save_path, 'word_freq1.csv'))
        df_word_freq5.to_csv(join(dataset_save_path, 'word_freq5.csv'))
        df_word_freq10.to_csv(join(dataset_save_path, 'word_freq10.csv'))
        df_word_freq15.to_csv(join(dataset_save_path, 'word_freq15.csv'))
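add_element is also not shown here; judging from the call sites it appends a single [frame_num, input_path, transcript] row to a DataFrame and returns the result. A sketch under that assumption, written in the same DataFrame.append style as Example #15 (the original helper may differ):

import pandas as pd


def add_element(df, elem_list):
    # Append one row ([frame_num, input_path, transcript]) to the DataFrame,
    # keeping its column order.
    series = pd.Series(elem_list, index=df.columns)
    return df.append(series, ignore_index=True)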
Example #15
def main():

    for data_type in ['train', 'dev', 'test']:
        print('=' * 50)
        print(' ' * 20 + data_type + ' ' * 20)
        print('=' * 50)

        ########################################
        # inputs
        ########################################
        print('=> Processing input data...')
        if args.save_format in ['numpy', 'htk']:
            input_save_path = mkdir_join(args.feature_save_path,
                                         args.save_format)
            if isfile(join(input_save_path, data_type, 'complete.txt')):
                print('Already exists.')
            else:
                if args.tool == 'htk':
                    audio_paths = path.htk(data_type=data_type)
                else:
                    audio_paths = path.wav(data_type=data_type)

                if data_type != 'train':
                    is_training = False

                    # Load statistics over train dataset
                    global_mean_male = np.load(
                        join(input_save_path, 'train/global_mean_male.npy'))
                    global_std_male = np.load(
                        join(input_save_path, 'train/global_std_male.npy'))
                    global_mean_female = np.load(
                        join(input_save_path, 'train/global_mean_female.npy'))
                    global_std_female = np.load(
                        join(input_save_path, 'train/global_std_female.npy'))
                else:
                    is_training = True
                    global_mean_male, global_std_male, global_mean_female, global_std_female = None, None, None, None

                # Read htk or wav files, and save input data and frame num dict
                read_audio(audio_paths=audio_paths,
                           tool=args.tool,
                           config=CONFIG,
                           normalize=args.normalize,
                           is_training=is_training,
                           save_path=mkdir_join(input_save_path, data_type),
                           save_format=args.save_format,
                           global_mean_male=global_mean_male,
                           global_std_male=global_std_male,
                           global_mean_female=global_mean_female,
                           global_std_female=global_std_female)
                # NOTE: ex.) save_path:
                # timit/feature/save_format/data_type/*.npy

            # Make a confirmation file to prove that dataset was saved
            # correctly
            with open(join(input_save_path, data_type, 'complete.txt'),
                      'w') as f:
                f.write('')

        ########################################
        # labels (character)
        ########################################
        print('\n=> Processing transcripts (char)...')
        save_vocab_file = True if data_type == 'train' else False
        is_test = True if data_type == 'test' else False
        trans_dict = read_char(label_paths=path.trans(data_type=data_type),
                               vocab_file_save_path=mkdir_join(
                                   './config', 'vocab_files'),
                               save_vocab_file=save_vocab_file,
                               is_test=is_test)

        ########################################
        # dataset (character, csv)
        ########################################
        print('\n=> Saving dataset files (char)...')
        dataset_save_path = mkdir_join(args.dataset_save_path,
                                       args.save_format, data_type)
        df_char = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_char_capital = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        for utt_name, [char_indices,
                       char_indices_capital] in tqdm(trans_dict.items()):
            if args.save_format == 'numpy':
                speaker = utt_name.split('_')[0]
                input_utt_save_path = join(input_save_path, data_type, speaker,
                                           utt_name + '.npy')
                assert isfile(input_utt_save_path)
                input_utt = np.load(input_utt_save_path)
            elif args.save_format == 'htk':
                speaker = utt_name.split('_')[0]
                input_utt_save_path = join(input_save_path, data_type, speaker,
                                           utt_name + '.htk')
                assert isfile(input_utt_save_path)
                input_utt, _, _ = read(input_utt_save_path)
            elif args.save_format == 'wav':
                input_utt_save_path = path.utt2wav(utt_name)
                assert isfile(input_utt_save_path)
                input_utt = w2f_psf(input_utt_save_path,
                                    feature_type=CONFIG['feature_type'],
                                    feature_dim=CONFIG['channels'],
                                    use_energy=CONFIG['energy'],
                                    use_delta1=CONFIG['delta'],
                                    use_delta2=CONFIG['deltadelta'],
                                    window=CONFIG['window'],
                                    slide=CONFIG['slide'])
            else:
                raise ValueError("save_format must be 'numpy', 'htk', or 'wav'.")
            frame_num = input_utt.shape[0]
            del input_utt

            series_char = pd.Series(
                [frame_num, input_utt_save_path, char_indices],
                index=df_char.columns)
            series_char_capital = pd.Series(
                [frame_num, input_utt_save_path, char_indices_capital],
                index=df_char_capital.columns)

            df_char = df_char.append(series_char, ignore_index=True)
            df_char_capital = df_char_capital.append(series_char_capital,
                                                     ignore_index=True)

        df_char.to_csv(join(dataset_save_path, 'character.csv'))
        df_char_capital.to_csv(
            join(dataset_save_path, 'character_capital_divide.csv'))

        ########################################
        # labels (phone)
        ########################################
        print('\n=> Processing transcripts (phone)...')
        trans_dict = read_phone(label_paths=path.phone(data_type=data_type),
                                vocab_file_save_path=mkdir_join(
                                    './config', 'vocab_files'),
                                save_vocab_file=save_vocab_file,
                                is_test=is_test)

        ########################################
        # dataset (phone, csv)
        ########################################
        print('\n=> Saving dataset files (phone)...')
        df_phone61 = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_phone48 = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_phone39 = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        for utt_name, [phone61_indices, phone48_indices,
                       phone39_indices] in tqdm(trans_dict.items()):
            if args.save_format == 'numpy':
                speaker = utt_name.split('_')[0]
                input_utt_save_path = join(input_save_path, data_type, speaker,
                                           utt_name + '.npy')
                assert isfile(input_utt_save_path)
                input_utt = np.load(input_utt_save_path)
            elif args.save_format == 'htk':
                speaker = utt_name.split('_')[0]
                input_utt_save_path = join(input_save_path, data_type, speaker,
                                           utt_name + '.htk')
                assert isfile(input_utt_save_path)
                input_utt, _, _ = read(input_utt_save_path)
            elif args.save_format == 'wav':
                input_utt_save_path = path.utt2wav(utt_name)
                assert isfile(input_utt_save_path)
                input_utt = w2f_psf(input_utt_save_path,
                                    feature_type=CONFIG['feature_type'],
                                    feature_dim=CONFIG['channels'],
                                    use_energy=CONFIG['energy'],
                                    use_delta1=CONFIG['delta'],
                                    use_delta2=CONFIG['deltadelta'],
                                    window=CONFIG['window'],
                                    slide=CONFIG['slide'])
            else:
                raise ValueError("save_format must be 'numpy', 'htk', or 'wav'.")
            frame_num = input_utt.shape[0]
            del input_utt

            series_phone61 = pd.Series(
                [frame_num, input_utt_save_path, phone61_indices],
                index=df_phone61.columns)
            series_phone48 = pd.Series(
                [frame_num, input_utt_save_path, phone48_indices],
                index=df_phone48.columns)
            series_phone39 = pd.Series(
                [frame_num, input_utt_save_path, phone39_indices],
                index=df_phone39.columns)

            df_phone61 = df_phone61.append(series_phone61, ignore_index=True)
            df_phone48 = df_phone48.append(series_phone48, ignore_index=True)
            df_phone39 = df_phone39.append(series_phone39, ignore_index=True)

        df_phone61.to_csv(join(dataset_save_path, 'phone61.csv'))
        df_phone48.to_csv(join(dataset_save_path, 'phone48.csv'))
        df_phone39.to_csv(join(dataset_save_path, 'phone39.csv'))
Example #16
def read_trans(label_paths,
               word_boundary_paths,
               run_root_path,
               vocab_file_save_path,
               save_vocab_file=False,
               speaker_dict_fisher=None,
               char_set=None,
               char_capital_set=None,
               word_count_dict=None):
    """Read transcripts (*_trans.txt) & save files (.npy).
    Args:
        label_paths (list): list of paths to label files
        word_boundary_paths (list): list of paths to word boundary files
        run_root_path (string):
        vocab_file_save_path (string): path to vocabulary files
        save_vocab_file (bool, optional): if True, save vocabulary files
        speaker_dict_fisher (dict):
        char_set (set):
        char_capital_set (set):
        word_count_dict (dict):
    Returns:
        speaker_dict: dictionary of speakers
            key (string) => speaker
            value (dict) => dictionary of utterance information of each speaker
                key (string) => utterance index
                value (list) => [start_frame, end_frame, char_indices, char_indices_capital,
                                word_freq1_indices, word_freq5_indices,
                                word_freq10_indices, word_freq15_indices]
    """
    print('=====> Processing target labels...')
    merge_with_fisher = True if speaker_dict_fisher is not None else False

    if merge_with_fisher:
        speaker_dict = speaker_dict_fisher
        vocab_set = set([])
        for word in word_count_dict.keys():
            vocab_set.add(word)
    else:
        speaker_dict = OrderedDict()
        char_set, char_capital_set = set([]), set([])
        word_count_dict = {}
        vocab_set = set([])

    for label_path, wb_path in zip(tqdm(label_paths), word_boundary_paths):
        assert label_path == wb_path.replace('word', 'trans')
        utterance_dict = OrderedDict()
        segmentation_dict = read_segmentation(wb_path)
        with open(label_path, 'r') as f:
            for line in f:
                line = line.strip().lower().split(' ')
                speaker = line[0].split('-')[0]
                # Fix speaker name
                speaker = speaker.replace('sw0', 'sw').replace('a',
                                                               '-A').replace(
                                                                   'b', '-B')
                utt_index = line[0].split('-')[-1]
                start_frame = int(float(line[1]) * 100 + 0.05)
                end_frame = int(float(line[2]) * 100 + 0.05)
                transcript = ' '.join(line[3:])

                if transcript == '[silence]':
                    continue

                # Divide into short utterances
                length_threshold = 700
                if end_frame - start_frame >= length_threshold:
                    word_info_list = segmentation_dict[utt_index]
                    divide_points = []
                    divided_trans = []
                    partial_word_list = []
                    start_frame_tmp = start_frame
                    for i, word_info in enumerate(word_info_list):
                        if word_info[2] != '':
                            partial_word_list.append(word_info[2])
                        if 0 < i < len(word_info_list) - 1 and word_info[
                                2] == '' and word_info[
                                    1] - start_frame_tmp >= length_threshold:
                            divide_points.append(
                                int((word_info[1] + word_info[0]) / 2))
                            divided_trans.append(' '.join(partial_word_list))
                            partial_word_list = []
                            start_frame_tmp = word_info[0]

                    # Last segment
                    if len(partial_word_list) > 0:
                        divided_trans.append(' '.join(partial_word_list))

                    if len(divide_points) > 0:
                        transcript_list = divided_trans
                    else:
                        transcript_list = [transcript]
                else:
                    divide_points = []
                    transcript_list = [transcript]

                for i_trans, trans in enumerate(transcript_list):
                    # Clean transcript
                    trans = fix_transcript(trans)

                    # Convert space to "_"
                    trans = re.sub(r'\s', SPACE, trans)

                    # Skip silence, laughter, noise, vocalized-noise
                    if trans.replace(NOISE, '').replace(LAUGHTER, '').replace(
                            VOCALIZED_NOISE, '').replace(SPACE, '') == '':
                        continue

                    # Remove the first and last space
                    if trans[0] == SPACE:
                        trans = trans[1:]
                    if trans[-1] == SPACE:
                        trans = trans[:-1]

                    # Count words
                    for word in trans.split(SPACE):
                        vocab_set.add(word)
                        if word not in word_count_dict.keys():
                            word_count_dict[word] = 0
                        word_count_dict[word] += 1

                    # Capital-divided
                    trans_capital = ''
                    for word in trans.split(SPACE):
                        if len(word) == 1:
                            char_capital_set.add(word)
                            trans_capital += word
                        else:
                            # Replace the first character with the capital
                            # letter
                            word = word[0].upper() + word[1:]

                            # Check double-letters
                            for i in range(0, len(word) - 1, 1):
                                if word[i:i + 2] in DOUBLE_LETTERS:
                                    char_capital_set.add(word[i:i + 2])
                                else:
                                    char_capital_set.add(word[i])
                            trans_capital += word

                    for c in list(trans):
                        char_set.add(c)

                    if len(transcript_list) == 1:
                        utterance_dict[utt_index.zfill(4)] = [
                            start_frame, end_frame, trans
                        ]
                    else:
                        assert len(transcript_list) - 1 == len(divide_points)
                        if i_trans == 0:
                            assert start_frame < divide_points[i_trans] - 1
                            utterance_dict[utt_index.zfill(4) + '-' +
                                           str(i_trans + 1)] = [
                                               start_frame,
                                               divide_points[0] - 1, trans
                                           ]
                        elif i_trans == len(transcript_list) - 1:
                            assert start_frame < end_frame
                            utterance_dict[utt_index.zfill(4) + '-' +
                                           str(i_trans + 1)] = [
                                               divide_points[-1], end_frame,
                                               trans
                                           ]
                        else:
                            assert divide_points[
                                i_trans - 1] < divide_points[i_trans] - 1
                            utterance_dict[utt_index.zfill(4) + '-' +
                                           str(i_trans + 1)] = [
                                               divide_points[i_trans - 1],
                                               divide_points[i_trans] - 1,
                                               trans
                                           ]

                    # for debug
                    # print(transcript_original)
                    # print(trans)
                    # print(trans_capital)

            speaker_dict[speaker] = utterance_dict

    # Make vocabulary files
    data_size = '2000h' if merge_with_fisher else '300h'
    char_vocab_file_path = mkdir_join(vocab_file_save_path,
                                      'character_' + data_size + '.txt')
    char_capital_vocab_file_path = mkdir_join(
        vocab_file_save_path, 'character_capital_divide_' + data_size + '.txt')
    word_freq1_vocab_file_path = mkdir_join(vocab_file_save_path,
                                            'word_freq1_' + data_size + '.txt')
    word_freq5_vocab_file_path = mkdir_join(vocab_file_save_path,
                                            'word_freq5_' + data_size + '.txt')
    word_freq10_vocab_file_path = mkdir_join(
        vocab_file_save_path, 'word_freq10_' + data_size + '.txt')
    word_freq15_vocab_file_path = mkdir_join(
        vocab_file_save_path, 'word_freq15_' + data_size + '.txt')

    # Reserve some indices
    for mark in [SPACE, HYPHEN, APOSTROPHE, LAUGHTER, NOISE, VOCALIZED_NOISE]:
        for c in list(mark):
            char_set.discard(c)
    for mark in [SPACE, HYPHEN, APOSTROPHE]:
        for c in list(mark):
            char_capital_set.discard(c)

    # for debug
    # print(sorted(list(char_set)))
    # print(sorted(list(char_capital_set)))

    if save_vocab_file:
        # character-level
        with open(char_vocab_file_path, 'w') as f:
            char_list = sorted(list(char_set)) + \
                [SPACE, APOSTROPHE, HYPHEN, LAUGHTER, NOISE, VOCALIZED_NOISE]
            for char in char_list:
                f.write('%s\n' % char)

        # character-level (capital-divided)
        with open(char_capital_vocab_file_path, 'w') as f:
            char_capital_list = sorted(list(char_capital_set)) + \
                [APOSTROPHE, HYPHEN, LAUGHTER, NOISE, VOCALIZED_NOISE]
            for char in char_capital_list:
                f.write('%s\n' % char)

        # word-level (threshold == 1)
        with open(word_freq1_vocab_file_path, 'w') as f:
            vocab_list = sorted(list(vocab_set)) + [OOV]
            for word in vocab_list:
                f.write('%s\n' % word)

        # word-level (threshold == 5)
        with open(word_freq5_vocab_file_path, 'w') as f:
            vocab_list = sorted([
                word
                for word, freq in list(word_count_dict.items()) if freq >= 5
            ]) + [OOV]
            for word in vocab_list:
                f.write('%s\n' % word)

        # word-level (threshold == 10)
        with open(word_freq10_vocab_file_path, 'w') as f:
            vocab_list = sorted([
                word
                for word, freq in list(word_count_dict.items()) if freq >= 10
            ]) + [OOV]
            for word in vocab_list:
                f.write('%s\n' % word)

        # word-level (threshold == 15)
        with open(word_freq15_vocab_file_path, 'w') as f:
            vocab_list = sorted([
                word
                for word, freq in list(word_count_dict.items()) if freq >= 15
            ]) + [OOV]
            for word in vocab_list:
                f.write('%s\n' % word)

    # Tokenize
    print('=====> Tokenize...')
    char2idx = Char2idx(char_vocab_file_path, double_letter=True)
    char2idx_capital = Char2idx(char_capital_vocab_file_path,
                                capital_divide=True)
    word2idx_freq1 = Word2idx(word_freq1_vocab_file_path)
    word2idx_freq5 = Word2idx(word_freq5_vocab_file_path)
    word2idx_freq10 = Word2idx(word_freq10_vocab_file_path)
    word2idx_freq15 = Word2idx(word_freq15_vocab_file_path)
    for speaker, utt_dict in tqdm(speaker_dict.items()):
        for utt_index, [start_frame, end_frame,
                        transcript] in utt_dict.items():
            char_indices = char2idx(transcript)
            char_indices_capital = char2idx_capital(transcript)
            word_freq1_indices = word2idx_freq1(transcript)
            word_freq5_indices = word2idx_freq5(transcript)
            word_freq10_indices = word2idx_freq10(transcript)
            word_freq15_indices = word2idx_freq15(transcript)

            char_indices = ' '.join(list(map(str, char_indices.tolist())))
            char_indices_capital = ' '.join(
                list(map(str, char_indices_capital.tolist())))
            word_freq1_indices = ' '.join(
                list(map(str, word_freq1_indices.tolist())))
            word_freq5_indices = ' '.join(
                list(map(str, word_freq5_indices.tolist())))
            word_freq10_indices = ' '.join(
                list(map(str, word_freq10_indices.tolist())))
            word_freq15_indices = ' '.join(
                list(map(str, word_freq15_indices.tolist())))

            utt_dict[utt_index] = [
                start_frame, end_frame, char_indices, char_indices_capital,
                word_freq1_indices, word_freq5_indices, word_freq10_indices,
                word_freq15_indices
            ]
        speaker_dict[speaker] = utt_dict

    return speaker_dict
Example #17
def main(config_path):

    # Read a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.load(f)
        corpus = config['corpus']
        feature = config['feature']
        param = config['param']

    # TODO: Solve conflict (batch_norm & layer norm)
    if corpus['label_type'] == 'phone61':
        output_size = 61
    elif corpus['label_type'] == 'phone48':
        output_size = 48
    elif corpus['label_type'] == 'phone39':
        output_size = 39
    elif corpus['label_type'] == 'character':
        output_size = 30

    # Model setting
    CTCModel = load(model_type=config['model_name'])
    network = CTCModel(batch_size=param['batch_size'],
                       input_size=feature['input_size'] * feature['num_stack'],
                       num_cell=param['num_cell'],
                       num_layer=param['num_layer'],
                       output_size=output_size,
                       clip_gradients=param['clip_grad'],
                       clip_activation=param['clip_activation'],
                       dropout_ratio_input=param['dropout_input'],
                       dropout_ratio_hidden=param['dropout_hidden'],
                       num_proj=param['num_proj'],
                       weight_decay=param['weight_decay'])

    network.model_name = config['model_name'].upper()
    network.model_name += '_' + str(param['num_cell'])
    network.model_name += '_' + str(param['num_layer'])
    network.model_name += '_' + param['optimizer']
    network.model_name += '_lr' + str(param['learning_rate'])
    if param['num_proj'] != 0:
        network.model_name += '_proj' + str(param['num_proj'])
    if feature['num_stack'] != 1:
        network.model_name += '_stack' + str(feature['num_stack'])
    if param['weight_decay'] != 0:
        network.model_name += '_weightdecay' + str(param['weight_decay'])

    # Set save path
    network.model_dir = mkdir('/n/sd8/inaguma/result/timit/ctc/')
    network.model_dir = mkdir_join(network.model_dir, corpus['label_type'])
    network.model_dir = mkdir_join(network.model_dir, network.model_name)

    # Reset model directory
    if not isfile(join(network.model_dir, 'complete.txt')):
        tf.gfile.DeleteRecursively(network.model_dir)
        tf.gfile.MakeDirs(network.model_dir)
    else:
        raise ValueError('File exists.')

    # Set process name
    setproctitle('ctc_timit_' +
                 corpus['label_type'] + '_' + param['optimizer'])

    # Save config file
    shutil.copyfile(config_path, join(network.model_dir, 'config.yml'))

    sys.stdout = open(join(network.model_dir, 'train.log'), 'w')
    print(network.model_name)
    do_train(network=network,
             optimizer=param['optimizer'],
             learning_rate=param['learning_rate'],
             batch_size=param['batch_size'],
             epoch_num=param['num_epoch'],
             label_type=corpus['label_type'],
             num_stack=feature['num_stack'],
             num_skip=feature['num_skip'])
    sys.stdout = sys.__stdout__
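The script reads the corpus, feature, and param sections from the YAML file; the keys it actually accesses suggest a configuration roughly shaped like the dictionary below. This is a hypothetical illustration with placeholder values, not the real config.yml for this recipe, which may contain more fields.

# Hypothetical structure of the loaded config (all values are placeholders)
config = {
    'model_name': 'blstm_ctc',
    'corpus': {'label_type': 'phone61'},
    'feature': {'input_size': 40, 'num_stack': 3, 'num_skip': 3},
    'param': {
        'batch_size': 32, 'num_cell': 256, 'num_layer': 5,
        'optimizer': 'adam', 'learning_rate': 1e-3, 'num_epoch': 20,
        'num_proj': 0, 'weight_decay': 0,
        'clip_grad': 5.0, 'clip_activation': 50,
        'dropout_input': 1.0, 'dropout_hidden': 1.0,
    },
}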
Example #18
def main(data_size):

    speaker_dict_dict = {}  # dict of speaker_dict
    for data_type in ['train', 'eval1', 'eval2', 'eval3']:
        print('=' * 50)
        print(' ' * 20 + data_type + ' (' + data_size + ')' + ' ' * 20)
        print('=' * 50)

        ########################################
        # labels
        ########################################
        if data_type == 'train':
            label_paths = path.trans(data_type='train_' + data_size)
        else:
            label_paths = path.trans(data_type=data_type)
        save_vocab_file = True if data_type == 'train' else False
        is_test = True if 'eval' in data_type else False

        print('=> Processing transcripts...')
        speaker_dict_dict[data_type] = read_sdb(
            label_paths=label_paths,
            data_size=data_size,
            vocab_file_save_path=mkdir_join('./config', 'vocab_files'),
            save_vocab_file=save_vocab_file,
            is_test=is_test,
            data_type=data_type)

        ########################################
        # inputs
        ########################################
        print('\n=> Processing input data...')
        input_save_path = mkdir_join(args.feature_save_path, args.save_format,
                                     data_size)
        if isfile(join(input_save_path, data_type, 'complete.txt')):
            print('Already exists.')
        else:
            if args.save_format == 'wav':
                ########################################
                # Split WAV files per utterance
                ########################################
                if data_type == 'train':
                    wav_paths = path.wav(corpus='train' + data_size)
                else:
                    wav_paths = path.wav(corpus=data_type)

                split_wav(wav_paths=wav_paths,
                          speaker_dict=speaker_dict_dict[data_type],
                          save_path=mkdir_join(input_save_path, data_type))
                # NOTE: ex.) save_path:
                # csj/feature/save_format/data_size/data_type/speaker/utt_name.npy

            elif args.save_format in ['numpy', 'htk']:
                if data_type == 'train':
                    if args.tool == 'htk':
                        audio_paths = path.htk(data_type='train_' + data_size)
                    else:
                        audio_paths = path.wav(data_type='train_' + data_size)
                    is_training = True
                    global_mean_male, global_std_male, global_mean_female, global_std_female = None, None, None, None
                else:
                    if args.tool == 'htk':
                        audio_paths = path.htk(data_type=data_type)
                    else:
                        audio_paths = path.wav(data_type=data_type)
                    is_training = False

                    # Load statistics over train dataset
                    global_mean_male = np.load(
                        join(input_save_path, 'train/global_mean_male.npy'))
                    global_std_male = np.load(
                        join(input_save_path, 'train/global_std_male.npy'))
                    global_mean_female = np.load(
                        join(input_save_path, 'train/global_mean_female.npy'))
                    global_std_female = np.load(
                        join(input_save_path, 'train/global_std_female.npy'))

                read_audio(audio_paths=audio_paths,
                           speaker_dict=speaker_dict_dict[data_type],
                           tool=args.tool,
                           config=CONFIG,
                           normalize=args.normalize,
                           is_training=is_training,
                           save_path=mkdir_join(input_save_path, data_type),
                           save_format=args.save_format,
                           global_mean_male=global_mean_male,
                           global_std_male=global_std_male,
                           global_mean_female=global_mean_female,
                           global_std_female=global_std_female)
                # NOTE: ex.) save_path:
                # csj/feature/save_format/data_size/data_type/speaker/*.npy

            # Make an empty confirmation file to record that the dataset
            # was saved correctly
            with open(join(input_save_path, data_type, 'complete.txt'),
                      'w') as f:
                f.write('')

    ########################################
    # dataset (csv)
    ########################################
    print('\n=> Saving dataset files...')
    for data_type in ['train', 'eval1', 'eval2', 'eval3']:
        dataset_save_path = mkdir_join(args.dataset_save_path,
                                       args.save_format, data_size, data_type)

        print('---------- %s ----------' % data_type)
        df_kanji = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_kana = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_phone = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])

        utt_count = 0
        df_kanji_list, df_kana_list, df_phone_list = [], [], []
        for speaker, utt_dict in tqdm(speaker_dict_dict[data_type].items()):
            for utt_index, utt_info in utt_dict.items():
                trans_kanji, trans_kana, trans_phone = utt_info[2:]
                if args.save_format == 'numpy':
                    input_utt_save_path = join(
                        input_save_path, data_type, speaker,
                        speaker + '_' + utt_index + '.npy')
                    assert isfile(input_utt_save_path)
                    input_utt = np.load(input_utt_save_path)
                elif args.save_format == 'htk':
                    input_utt_save_path = join(
                        input_save_path, data_type, speaker,
                        speaker + '_' + utt_index + '.htk')
                    assert isfile(input_utt_save_path)
                    input_utt, _, _ = read(input_utt_save_path)
                elif args.save_format == 'wav':
                    input_utt_save_path = path.utt2wav(utt_index)
                    assert isfile(input_utt_save_path)
                    input_utt = w2f_psf(input_utt_save_path,
                                        feature_type=CONFIG['feature_type'],
                                        feature_dim=CONFIG['channels'],
                                        use_energy=CONFIG['energy'],
                                        use_delta1=CONFIG['delta'],
                                        use_delta2=CONFIG['deltadelta'],
                                        window=CONFIG['window'],
                                        slide=CONFIG['slide'])
                else:
                    raise ValueError('save_format must be "numpy", "htk", or "wav".')
                frame_num = input_utt.shape[0]
                del input_utt

                series_kanji = pd.Series(
                    [frame_num, input_utt_save_path, trans_kanji],
                    index=df_kanji.columns)
                series_kana = pd.Series(
                    [frame_num, input_utt_save_path, trans_kana],
                    index=df_kana.columns)
                series_phone = pd.Series(
                    [frame_num, input_utt_save_path, trans_phone],
                    index=df_phone.columns)

                df_kanji = df_kanji.append(series_kanji, ignore_index=True)
                df_kana = df_kana.append(series_kana, ignore_index=True)
                df_phone = df_phone.append(series_phone, ignore_index=True)

                utt_count += 1

                # Reset
                if utt_count == 50000:
                    df_kanji_list.append(df_kanji)
                    df_kana_list.append(df_kana)
                    df_phone_list.append(df_phone)
                    df_kanji = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    df_kana = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    df_phone = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    utt_count = 0

        # Append the last (partial) dataframes after the speaker loop
        df_kanji_list.append(df_kanji)
        df_kana_list.append(df_kana)
        df_phone_list.append(df_phone)

        # Concatenate all dataframes
        df_kanji = df_kanji_list[0]
        df_kana = df_kana_list[0]
        df_phone = df_phone_list[0]
        for df_i in df_kanji_list[1:]:
            df_kanji = pd.concat([df_kanji, df_i], axis=0)
        for df_i in df_kana_list[1:]:
            df_kana = pd.concat([df_kana, df_i], axis=0)
        for df_i in df_phone_list[1:]:
            df_phone = pd.concat([df_phone, df_i], axis=0)

        df_kanji.to_csv(join(dataset_save_path, 'dataset_kanji.csv'))
        df_kana.to_csv(join(dataset_save_path, 'dataset_kana.csv'))
        df_phone.to_csv(join(dataset_save_path, 'dataset_phone.csv'))

        # Use the first 4000 utterances as the dev set
        if data_type == 'train':
            df_kanji[:4000].to_csv(
                mkdir_join(args.dataset_save_path, args.save_format, data_size,
                           'dev', 'dataset_kanji.csv'))
            df_kana[:4000].to_csv(
                mkdir_join(args.dataset_save_path, args.save_format, data_size,
                           'dev', 'dataset_kana.csv'))
            df_phone[:4000].to_csv(
                mkdir_join(args.dataset_save_path, args.save_format, data_size,
                           'dev', 'dataset_phone.csv'))
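
The 50,000-utterance reset above keeps each in-memory DataFrame small and defers a single concatenation to the end instead of growing one huge frame row by row. A minimal, self-contained sketch of that accumulate-flush-concatenate pattern (chunk size and paths are illustrative; rows are collected as dicts per chunk rather than with DataFrame.append, which newer pandas versions no longer provide):

import pandas as pd

CHUNK_SIZE = 3  # the script above flushes every 50000 utterances
columns = ['frame_num', 'input_path', 'transcript']

rows, chunks = [], []
for i in range(10):  # stand-in for the per-utterance loop
    rows.append({'frame_num': i * 10,
                 'input_path': '/tmp/utt_%d.npy' % i,
                 'transcript': 'dummy transcript'})
    if len(rows) == CHUNK_SIZE:
        chunks.append(pd.DataFrame(rows, columns=columns))
        rows = []

chunks.append(pd.DataFrame(rows, columns=columns))  # flush the last partial chunk
df_all = pd.concat(chunks, axis=0, ignore_index=True)
print(df_all.shape)  # (10, 3)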
Exemplo n.º 19
0
def read_audio(audio_paths,
               tool,
               config,
               normalize,
               is_training,
               speaker_gender_dict,
               save_path=None,
               save_format=None,
               global_mean_male=None,
               global_mean_female=None,
               global_std_male=None,
               global_std_female=None,
               dtype=np.float32):
    """Read audio files.
    Args:
        audio_paths (list): paths to HTK or WAV files
        tool (string): the tool to extract features,
            htk or librosa or python_speech_features
        config (dict): a configuration for feature extraction
        normalize (string):
            no => no normalization is performed
            global => normalize input features by global mean & std over
                      the training set per gender
            speaker => normalize input features by mean & std per speaker
            utterance => normalize input features by mean & std per utterance
        is_training (bool): Set True when processing the training set
        speaker_gender_dict (dict): A dictionary of speakers' gender information
            key (string) => speaker
            value (string) => F or M
        save_path (string): path to save npy files
        save_format (string, optional): numpy or htk
        global_mean_male (np.ndarray, optional): global mean of male over
            the training set
        global_std_male (np.ndarray, optional): global standard deviation
            of male over the training set
        global_mean_female (np.ndarray, optional): global mean of female
            over the training set
        global_std_female (np.ndarray, optional): global standard
            deviation of female over the training set
        dtype (optional): the type of data, default is np.float32
    Returns:
        global_mean_male (np.ndarray): global mean of male over the
            training set
        global_mean_female (np.ndarray): global mean of female over the
            training set
        global_std_male (np.ndarray): global standard deviation of male
            over the training set
        global_std_female (np.ndarray): global standard deviation of
            female over the training set
        frame_num_dict (dict):
            key => utterance name
            value => the number of frames
    """
    if not is_training:
        if global_mean_male is None or global_std_male is None:
            raise ValueError('Set mean & std computed in the training set.')
    if normalize not in ['global', 'speaker', 'utterance', 'no']:
        raise ValueError(
            'normalize must be "utterance" or "speaker" or "global" or "no".')
    if tool not in ['htk', 'python_speech_features', 'librosa']:
        raise ValueError('tool must be "htk" or "python_speech_features"' +
                         ' or "librosa".')

    audio_path_dict = {}
    audio_path_list_male, audio_path_list_female = [], []
    total_frame_num_male, total_frame_num_female = 0, 0
    total_frame_num_dict = {}
    speaker_mean_dict, speaker_std_dict = {}, {}

    # Loop 1: Divide all audio paths into speakers
    print('=====> Reading audio files...')
    for i, audio_path in enumerate(tqdm(audio_paths)):
        # ex.) audio_path: speaker-book-utt_index.***
        speaker, book, utt_index = basename(audio_path).split('.')[0].split(
            '-')
        if speaker not in audio_path_dict.keys():
            audio_path_dict[speaker] = []
        audio_path_dict[speaker].append(audio_path)

        if is_training:
            # Read each audio file
            if tool == 'htk':
                input_utt, sampPeriod, parmKind = read(audio_path)
            elif tool == 'python_speech_features':
                input_utt = w2f_psf(audio_path,
                                    feature_type=config['feature_type'],
                                    feature_dim=config['channels'],
                                    use_energy=config['energy'],
                                    use_delta1=config['delta'],
                                    use_delta2=config['deltadelta'],
                                    window=config['window'],
                                    slide=config['slide'])
            elif tool == 'librosa':
                input_utt = w2f_librosa(audio_path,
                                        feature_type=config['feature_type'],
                                        feature_dim=config['channels'],
                                        use_energy=config['energy'],
                                        use_delta1=config['delta'],
                                        use_delta2=config['deltadelta'],
                                        window=config['window'],
                                        slide=config['slide'])

            input_utt_sum = np.sum(input_utt, axis=0)

            if i == 0:
                # Initialize global statistics
                feature_dim = input_utt.shape[1]
                global_mean_male = np.zeros((feature_dim, ), dtype=dtype)
                global_mean_female = np.zeros((feature_dim, ), dtype=dtype)
                global_std_male = np.zeros((feature_dim, ), dtype=dtype)
                global_std_female = np.zeros((feature_dim, ), dtype=dtype)

            # For computing global mean
            if speaker_gender_dict[speaker] == 'M':
                audio_path_list_male.append(input_utt)
                global_mean_male += input_utt_sum
                total_frame_num_male += input_utt.shape[0]
            elif speaker_gender_dict[speaker] == 'F':
                audio_path_list_female.append(input_utt)
                global_mean_female += input_utt_sum
                total_frame_num_female += input_utt.shape[0]
            else:
                raise ValueError('gender must be "M" or "F".')

            # For computing speaker mean
            if normalize == 'speaker':
                if speaker not in total_frame_num_dict.keys():
                    total_frame_num_dict[speaker] = 0
                    # Initialize speaker statistics
                    speaker_mean_dict[speaker] = np.zeros((feature_dim, ),
                                                          dtype=dtype)
                    speaker_std_dict[speaker] = np.zeros((feature_dim, ),
                                                         dtype=dtype)
                speaker_mean_dict[speaker] += input_utt_sum
                total_frame_num_dict[speaker] += input_utt.shape[0]

    # Loop 2: Computing global mean and stddev
    if is_training and normalize != 'no':
        print('=====> Computing global mean & stddev...')
        # Compute global mean per gender
        global_mean_male /= total_frame_num_male
        global_mean_female /= total_frame_num_female

        for speaker, audio_paths_speaker in tqdm(audio_path_dict.items()):
            if normalize == 'speaker':
                # Compute speaker mean
                speaker_mean_dict[speaker] /= total_frame_num_dict[speaker]

            for audio_path in audio_paths_speaker:
                speaker, book, utt_index = basename(audio_path).split(
                    '.')[0].split('-')

                # Read each audio file
                if tool == 'htk':
                    input_utt, sampPeriod, parmKind = read(audio_path)
                elif tool == 'python_speech_features':
                    input_utt = w2f_psf(audio_path,
                                        feature_type=config['feature_type'],
                                        feature_dim=config['channels'],
                                        use_energy=config['energy'],
                                        use_delta1=config['delta'],
                                        use_delta2=config['deltadelta'],
                                        window=config['window'],
                                        slide=config['slide'])
                elif tool == 'librosa':
                    input_utt = w2f_librosa(
                        audio_path,
                        feature_type=config['feature_type'],
                        feature_dim=config['channels'],
                        use_energy=config['energy'],
                        use_delta1=config['delta'],
                        use_delta2=config['deltadelta'],
                        window=config['window'],
                        slide=config['slide'])

                # For computing global stddev
                if speaker_gender_dict[speaker] == 'M':
                    global_std_male += np.sum(np.abs(input_utt -
                                                     global_mean_male)**2,
                                              axis=0)
                elif speaker_gender_dict[speaker] == 'F':
                    global_std_female += np.sum(np.abs(input_utt -
                                                       global_mean_female)**2,
                                                axis=0)
                else:
                    raise ValueError('gender must be "M" or "F".')

                if normalize == 'speaker':
                    # For computing speaker stddev
                    speaker_std_dict[speaker] += np.sum(
                        np.abs(input_utt - speaker_mean_dict[speaker])**2,
                        axis=0)

            if normalize == 'speaker':
                # Compute speaker stddev
                speaker_std_dict[speaker] = np.sqrt(
                    speaker_std_dict[speaker] /
                    (total_frame_num_dict[speaker] - 1))

        # Compute global stddev per gender
        global_std_male = np.sqrt(global_std_male / (total_frame_num_male - 1))
        global_std_female = np.sqrt(global_std_female /
                                    (total_frame_num_female - 1))

        if save_path is not None:
            # Save global mean & std per gender
            np.save(join(save_path, 'global_mean_male.npy'), global_mean_male)
            np.save(join(save_path, 'global_mean_female.npy'),
                    global_mean_female)
            np.save(join(save_path, 'global_std_male.npy'), global_std_male)
            np.save(join(save_path, 'global_std_female.npy'),
                    global_std_female)

    # Loop 3: Normalization and Saving
    print('=====> Normalization...')
    frame_num_dict = {}
    for speaker, audio_paths_speaker in tqdm(audio_path_dict.items()):
        for audio_path in audio_paths_speaker:
            speaker, book, utt_index = basename(audio_path).split(
                '.')[0].split('-')

            # Read each audio file
            if tool == 'htk':
                input_utt, sampPeriod, parmKind = read(audio_path)
            elif tool == 'python_speech_features':
                input_utt = w2f_psf(audio_path,
                                    feature_type=config['feature_type'],
                                    feature_dim=config['channels'],
                                    use_energy=config['energy'],
                                    use_delta1=config['delta'],
                                    use_delta2=config['deltadelta'],
                                    window=config['window'],
                                    slide=config['slide'])
            elif tool == 'librosa':
                input_utt = w2f_librosa(audio_path,
                                        feature_type=config['feature_type'],
                                        feature_dim=config['channels'],
                                        use_energy=config['energy'],
                                        use_delta1=config['delta'],
                                        use_delta2=config['deltadelta'],
                                        window=config['window'],
                                        slide=config['slide'])

            if normalize == 'no':
                pass
            elif normalize == 'global' or not is_training:
                # Normalize by mean & std over the training set per gender
                if speaker_gender_dict[speaker] == 'M':
                    input_utt -= global_mean_male
                    input_utt /= global_std_male
                elif speaker_gender_dict[speaker] == 'F':
                    input_utt -= global_mean_female
                    input_utt /= global_std_female
                else:
                    raise ValueError('gender must be "M" or "F".')
            elif normalize == 'speaker':
                # Normalize by mean & std per speaker
                input_utt -= speaker_mean_dict[speaker]
                input_utt /= speaker_std_dict[speaker]
            elif normalize == 'utterance':
                # Normalize by mean & std per utterance
                utt_mean = np.mean(input_utt, axis=0, dtype=dtype)
                utt_std = np.std(input_utt, axis=0, dtype=dtype)
                input_utt = (input_utt - utt_mean) / utt_std
            else:
                raise ValueError

            frame_num_dict[speaker + '_' + utt_index] = input_utt.shape[0]

            if save_path is not None:
                # Save input features
                input_name = basename(audio_path).split('.')[0]
                if save_format == 'numpy':
                    input_data_save_path = mkdir_join(save_path, speaker,
                                                      input_name + '.npy')
                    np.save(input_data_save_path, input_utt)
                elif save_format == 'htk':
                    write(input_utt,
                          htk_path=mkdir_join(save_path, speaker,
                                              input_name + '.htk'),
                          sampPeriod=sampPeriod,
                          parmKind=parmKind)
                else:
                    raise ValueError('save_format must be "numpy" or "htk".')

    if save_path is not None:
        # Save the frame number dictionary
        with open(join(save_path, 'frame_num.pickle'), 'wb') as f:
            pickle.dump(frame_num_dict, f)

    return (global_mean_male, global_mean_female, global_std_male,
            global_std_female, frame_num_dict)
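
The three loops above amount to a standard two-pass estimate: pass one sums the features per gender to obtain the mean, pass two sums squared deviations from that mean to obtain the sample standard deviation (N - 1 in the denominator). A minimal numpy sketch of the same computation over toy (gender, feature-matrix) pairs, independent of the HTK/librosa readers used above:

import numpy as np

# toy utterances: (gender, frames x dims feature matrix)
utterances = [('M', np.random.rand(100, 40)),
              ('F', np.random.rand(80, 40)),
              ('M', np.random.rand(120, 40))]

dim = utterances[0][1].shape[1]
mean = {'M': np.zeros(dim), 'F': np.zeros(dim)}
std = {'M': np.zeros(dim), 'F': np.zeros(dim)}
frames = {'M': 0, 'F': 0}

# Pass 1: accumulate sums -> per-gender mean
for gender, feat in utterances:
    mean[gender] += feat.sum(axis=0)
    frames[gender] += feat.shape[0]
for g in mean:
    mean[g] /= frames[g]

# Pass 2: accumulate squared deviations -> per-gender stddev
for gender, feat in utterances:
    std[gender] += np.sum((feat - mean[gender]) ** 2, axis=0)
for g in std:
    std[g] = np.sqrt(std[g] / (frames[g] - 1))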
Exemplo n.º 20
0
def read_audio(audio_paths,
               speaker_dict,
               tool,
               config,
               normalize,
               is_training,
               save_path=None,
               save_format=None,
               global_mean=None,
               global_std=None,
               dtype=np.float32):
    """Read HTK or WAV files.
    Args:
        audio_paths (list): paths to HTK or WAV files
        speaker_dict (dict): A dictionary of utterance information per speaker
            key (string) => speaker
            value (dict) => dictionary of utterance information of the speaker
                key (string) => utterance index
                value (list) => [start_frame, end_frame, transcript]
        tool (string): the tool to extract features,
            htk or librosa or python_speech_features
        config (dict): a configuration for feature extraction
        normalize (string):
            no => no normalization is performed
            global => normalize input features by global mean & std over
                      the training set
            speaker => normalize input features by mean & std per speaker
            utterance => normalize input features by mean & std per utterance
        is_training (bool): Set True when processing the training set
        save_path (string): path to save npy files
        save_format (string, optional): numpy or htk
        global_mean (np.ndarray, optional): global mean over the training set
        global_std (np.ndarray, optional): global standard deviation over the
            training set
        dtype (optional): the type of data, default is np.float32
    Returns:
        global_mean (np.ndarray): global mean over the training set
        global_std (np.ndarray): global standard deviation over the
            training set
        frame_num_dict (dict):
            key => utterance name
            value => the number of frames
    """
    if not is_training:
        if global_mean is None or global_std is None:
            raise ValueError('Set mean & std computed in the training set.')
    if normalize not in ['global', 'speaker', 'utterance', 'no']:
        raise ValueError(
            'normalize must be "utterance" or "speaker" or "global" or "no".')

    total_frame_num = 0
    total_frame_num_dict = {}
    speaker_mean_dict = {}

    # Loop 1: Computing global mean and per-speaker statistics
    if is_training and normalize != 'no':
        print('=====> Reading audio files...')
        for i, audio_path in enumerate(tqdm(audio_paths)):
            speaker = basename(audio_path).split('.')[0]

            # Fix speaker name
            speaker = speaker.replace('sw0', 'sw')
            # ex.) sw04771-A => sw4771-A (LDC97S62)
            speaker = speaker.replace('sw_', 'sw')
            # ex.) sw_4771-A => sw4771-A (eval2000, swbd)
            speaker = speaker.replace('en_', 'en')
            # ex.) en_4156-A => en4156-A (eval2000, ch)

            # Divide each audio file into utterances
            _, input_utt_sum, speaker_mean, _, total_frame_num_speaker = segment(
                audio_path,
                speaker,
                speaker_dict[speaker],
                is_training=True,
                sil_duration=0,
                tool=tool,
                config=config)

            if i == 0:
                # Initialize global statistics
                feature_dim = input_utt_sum.shape[0]
                global_mean = np.zeros((feature_dim, ), dtype=dtype)
                global_std = np.zeros((feature_dim, ), dtype=dtype)

            global_mean += input_utt_sum
            total_frame_num += total_frame_num_speaker

            # For computing speaker stddev
            if normalize == 'speaker':
                speaker_mean_dict[speaker] = speaker_mean
                total_frame_num_dict[speaker] = total_frame_num_speaker
                # NOTE: speaker mean is already computed

        print('=====> Computing global mean & stddev...')
        # Compute global mean
        global_mean /= total_frame_num

        for audio_path in tqdm(audio_paths):
            speaker = basename(audio_path).split('.')[0]

            # Normalize speaker name
            speaker = speaker.replace('sw0', 'sw')
            speaker = speaker.replace('sw_', 'sw')
            speaker = speaker.replace('en_', 'en')

            # Divide each audio into utterances
            input_data_dict_speaker, _, _, _, _ = segment(
                audio_path,
                speaker,
                speaker_dict[speaker],
                is_training=True,
                sil_duration=0,
                tool=tool,
                config=config)

            # For computing global stddev
            for input_utt in input_data_dict_speaker.values():
                global_std += np.sum(np.abs(input_utt - global_mean)**2,
                                     axis=0)

        # Compute global stddev
        global_std = np.sqrt(global_std / (total_frame_num - 1))

        if save_path is not None:
            # Save global mean & std
            np.save(join(save_path, 'global_mean.npy'), global_mean)
            np.save(join(save_path, 'global_std.npy'), global_std)

    # Loop 2: Normalization and Saving
    print('=====> Normalization...')
    frame_num_dict = {}
    sampPeriod, parmKind = None, None
    for audio_path in tqdm(audio_paths):
        speaker = basename(audio_path).split('.')[0]

        # Normalize speaker name
        speaker = speaker.replace('sw0', 'sw')
        speaker = speaker.replace('sw_', 'sw')
        speaker = speaker.replace('en_', 'en')

        if normalize == 'speaker' and is_training:
            speaker_mean = speaker_mean_dict[speaker]
        else:
            speaker_mean = None

        # Divide each audio into utterances
        input_data_dict_speaker, _, speaker_mean, speaker_std, _ = segment(
            audio_path,
            speaker,
            speaker_dict[speaker],
            is_training=is_training,
            sil_duration=0,
            tool=tool,
            config=config,
            mean=speaker_mean)  # for computing speaker stddev
        # NOTE: input_data_dict_speaker has not been normalized yet

        for utt_index, input_utt in input_data_dict_speaker.items():

            if normalize == 'no':
                pass
            elif normalize == 'global' or not is_training:
                # Normalize by mean & std over the training set
                input_utt -= global_mean
                input_utt /= global_std
            elif normalize == 'speaker':
                # Normalize by mean & std per speaker
                input_utt = (input_utt - speaker_mean) / speaker_std
            elif normalize == 'utterance':
                # Normalize by mean & std per utterance
                utt_mean = np.mean(input_utt, axis=0, dtype=dtype)
                utt_std = np.std(input_utt, axis=0, dtype=dtype)
                input_utt = (input_utt - utt_mean) / utt_std
            else:
                raise ValueError

            frame_num_dict[speaker + '_' + utt_index] = input_utt.shape[0]

            if save_path is not None:
                # Save input features
                if save_format == 'numpy':
                    input_data_save_path = mkdir_join(
                        save_path, speaker, speaker + '_' + utt_index + '.npy')
                    np.save(input_data_save_path, input_utt)
                elif save_format == 'htk':
                    if sampPeriod is None:
                        _, sampPeriod, parmKind = read(audio_path)
                    write(input_utt,
                          htk_path=mkdir_join(
                              save_path, speaker,
                              speaker + '_' + utt_index + '.htk'),
                          sampPeriod=sampPeriod,
                          parmKind=parmKind)
                else:
                    raise ValueError('save_format must be "numpy" or "htk".')

    if save_path is not None:
        # Save the frame number dictionary
        with open(join(save_path, 'frame_num.pickle'), 'wb') as f:
            pickle.dump(frame_num_dict, f)

    return global_mean, global_std, frame_num_dict
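
Once the statistics exist, the per-utterance branch above reduces to a small dispatch on the normalize option. A standalone sketch of that dispatch (normalize_utt is an illustrative helper name, not part of the repository):

import numpy as np

def normalize_utt(feat, normalize, global_mean=None, global_std=None,
                  speaker_mean=None, speaker_std=None):
    """Apply one of the normalization modes used throughout these scripts."""
    if normalize == 'no':
        return feat
    elif normalize == 'global':
        return (feat - global_mean) / global_std
    elif normalize == 'speaker':
        return (feat - speaker_mean) / speaker_std
    elif normalize == 'utterance':
        return (feat - feat.mean(axis=0)) / feat.std(axis=0)
    raise ValueError('normalize must be "global", "speaker", "utterance" or "no".')

feat = np.random.rand(50, 40)
print(normalize_utt(feat, 'utterance').mean(axis=0)[:3])  # ~0 per dimension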
Exemplo n.º 21
0
def read_trans(label_paths,
               data_size,
               vocab_file_save_path,
               is_test=False,
               save_vocab_file=False,
               data_type=None):
    """Read transcript.
    Args:
        label_paths (list): list of paths to label files
        data_size (string): 100h or 460h or 960h
        vocab_file_save_path (string): path to vocabulary files
        is_test (bool, optional): if True, compute OOV rate
        save_vocab_file (bool, optional): if True, save vocabulary files
        data_type (string, optional): test_clean or test_other
    Returns:
        speaker_dict (dict):
            key (string) => speaker
            value (dict) => dictionary of utterance information of each speaker
                key (string) => utterance name (speaker-book-utt_index)
                value (list) => [char_indices, char_indices_capital,
                                word_freq1_indices, word_freq5_indices,
                                word_freq10_indices, word_freq15_indices]
    """
    print('=====> Reading target labels...')
    speaker_dict = {}
    char_set, char_capital_set = set([]), set([])
    word_count_dict = {}
    vocab_set = set([])
    for label_path in tqdm(label_paths):
        speaker = label_path.split('/')[-3]
        if speaker not in speaker_dict.keys():
            speaker_dict[speaker] = {}
        with open(label_path, 'r') as f:
            for line in f:
                line = line.strip().lower().split(' ')
                utt_name = line[0]  # ex.) speaker-book-utt_index
                transcript = ' '.join(line[1:])
                word_list = line[1:]

                # Count words
                for word in word_list:
                    vocab_set.add(word)
                    if word not in word_count_dict.keys():
                        word_count_dict[word] = 0
                    word_count_dict[word] += 1

                # Capital-divided
                for word in transcript.split(' '):
                    if len(word) == 1:
                        char_capital_set.add(word.upper())
                    else:
                        # Replace the first character with the capital letter
                        word = word[0].upper() + word[1:]
                        char_capital_set.add(word[0].upper())

                        # Check double-letters
                        skip_flag = False
                        for i in range(1, len(word) - 1, 1):
                            if skip_flag:
                                skip_flag = False
                                continue

                            if not skip_flag and word[i:i +
                                                      2] in DOUBLE_LETTERS:
                                char_capital_set.add(word[i:i + 2])
                                skip_flag = True
                            else:
                                char_capital_set.add(word[i])

                        # Final character
                        if not skip_flag:
                            char_capital_set.add(word[-1])

                # Convert space to "_"
                transcript = re.sub(r'\s', SPACE, transcript)

                for c in list(transcript):
                    char_set.add(c)

                speaker_dict[speaker][utt_name] = transcript

                # for debug
                # print(transcript)
                # print(transcript_capital_divide)
                # print('-----')

    # Make vocabulary files
    char_vocab_file_path = mkdir_join(vocab_file_save_path,
                                      'character_' + data_size + '.txt')
    char_capital_vocab_file_path = mkdir_join(
        vocab_file_save_path, 'character_capital_divide_' + data_size + '.txt')
    word_freq1_vocab_file_path = mkdir_join(vocab_file_save_path,
                                            'word_freq1_' + data_size + '.txt')
    word_freq5_vocab_file_path = mkdir_join(vocab_file_save_path,
                                            'word_freq5_' + data_size + '.txt')
    word_freq10_vocab_file_path = mkdir_join(
        vocab_file_save_path, 'word_freq10_' + data_size + '.txt')
    word_freq15_vocab_file_path = mkdir_join(
        vocab_file_save_path, 'word_freq15_' + data_size + '.txt')

    # Reserve fixed indices for SPACE and APOSTROPHE (appended at the end below)
    char_set.discard(SPACE)
    char_set.discard(APOSTROPHE)
    char_capital_set.discard(APOSTROPHE)

    # for debug
    # print(sorted(list(char_set)))
    # print(sorted(list(char_capital_set)))

    if save_vocab_file:
        # character-level
        with open(char_vocab_file_path, 'w') as f:
            char_list = sorted(list(char_set)) + [SPACE, APOSTROPHE]
            for char in char_list:
                f.write('%s\n' % char)

        # character-level (capital-divided)
        with open(char_capital_vocab_file_path, 'w') as f:
            char_list = sorted(list(char_capital_set)) + [APOSTROPHE]
            for char in char_list:
                f.write('%s\n' % char)

        # word-level (threshold == 1)
        with open(word_freq1_vocab_file_path, 'w') as f:
            vocab_list = sorted(list(vocab_set)) + [OOV]
            for word in vocab_list:
                f.write('%s\n' % word)

        # word-level (threshold == 5)
        with open(word_freq5_vocab_file_path, 'w') as f:
            vocab_list = sorted([
                word
                for word, freq in list(word_count_dict.items()) if freq >= 5
            ]) + [OOV]
            for word in vocab_list:
                f.write('%s\n' % word)

        # word-level (threshold == 10)
        with open(word_freq10_vocab_file_path, 'w') as f:
            vocab_list = sorted([
                word
                for word, freq in list(word_count_dict.items()) if freq >= 10
            ]) + [OOV]
            for word in vocab_list:
                f.write('%s\n' % word)

        # word-level (threshold == 15)
        with open(word_freq15_vocab_file_path, 'w') as f:
            vocab_list = sorted([
                word
                for word, freq in list(word_count_dict.items()) if freq >= 15
            ]) + [OOV]
            for word in vocab_list:
                f.write('%s\n' % word)

    # Compute OOV rate
    if is_test:
        with open(
                join(vocab_file_save_path,
                     '../oov_rate_' + data_type + '_' + data_size + '.txt'),
                'w') as f:

            # word-level (threshold == 1)
            oov_rate = compute_oov_rate(speaker_dict,
                                        word_freq1_vocab_file_path)
            f.write('Word (freq1):\n')
            f.write('  OOV rate (test): %f %%\n' % oov_rate)

            # word-level (threshold == 5)
            oov_rate = compute_oov_rate(speaker_dict,
                                        word_freq5_vocab_file_path)
            f.write('Word (freq5):\n')
            f.write('  OOV rate (test): %f %%\n' % oov_rate)

            # word-level (threshold == 10)
            oov_rate = compute_oov_rate(speaker_dict,
                                        word_freq10_vocab_file_path)
            f.write('Word (freq10):\n')
            f.write('  OOV rate (test): %f %%\n' % oov_rate)

            # word-level (threshold == 15)
            oov_rate = compute_oov_rate(speaker_dict,
                                        word_freq15_vocab_file_path)
            f.write('Word (freq15):\n')
            f.write('  OOV rate (test): %f %%\n' % oov_rate)

    # Tokenize
    print('=====> Tokenize...')
    char2idx = Char2idx(char_vocab_file_path)
    char2idx_capital = Char2idx(char_capital_vocab_file_path,
                                capital_divide=True)
    word2idx_freq1 = Word2idx(word_freq1_vocab_file_path)
    word2idx_freq5 = Word2idx(word_freq5_vocab_file_path)
    word2idx_freq10 = Word2idx(word_freq10_vocab_file_path)
    word2idx_freq15 = Word2idx(word_freq15_vocab_file_path)
    for speaker, utt_dict in tqdm(speaker_dict.items()):
        for utt_name, transcript in utt_dict.items():
            if is_test:
                utt_dict[utt_name] = [transcript] * 6
            else:
                char_indices = char2idx(transcript)
                char_indices_capital = char2idx_capital(transcript)
                word_freq1_indices = word2idx_freq1(transcript)
                word_freq5_indices = word2idx_freq5(transcript)
                word_freq10_indices = word2idx_freq10(transcript)
                word_freq15_indices = word2idx_freq15(transcript)

                char_indices = ' '.join(list(map(str, char_indices.tolist())))
                char_indices_capital = ' '.join(
                    list(map(str, char_indices_capital.tolist())))
                word_freq1_indices = ' '.join(
                    list(map(str, word_freq1_indices.tolist())))
                word_freq5_indices = ' '.join(
                    list(map(str, word_freq5_indices.tolist())))
                word_freq10_indices = ' '.join(
                    list(map(str, word_freq10_indices.tolist())))
                word_freq15_indices = ' '.join(
                    list(map(str, word_freq15_indices.tolist())))

                utt_dict[utt_name] = [
                    char_indices, char_indices_capital, word_freq1_indices,
                    word_freq5_indices, word_freq10_indices,
                    word_freq15_indices
                ]
        speaker_dict[speaker] = utt_dict

    return speaker_dict
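
The four word vocabularies above differ only in the frequency threshold applied to the same counts, and the OOV rate written next to them is just the fraction of test tokens that fall outside a given vocabulary. A small sketch of both steps (build_vocab and compute_oov_rate here are illustrative stand-ins, not the repository's helpers; token-level counting is assumed):

word_counts = {'the': 12, 'cat': 6, 'sat': 4, 'mat': 1}

def build_vocab(counts, min_freq):
    return sorted(w for w, c in counts.items() if c >= min_freq) + ['OOV']

def compute_oov_rate(tokens, vocab):
    vocab = set(vocab)
    num_oov = sum(1 for t in tokens if t not in vocab)
    return 100.0 * num_oov / len(tokens)

vocab_freq5 = build_vocab(word_counts, 5)          # ['cat', 'the', 'OOV']
test_tokens = 'the cat sat on the mat'.split()
print(compute_oov_rate(test_tokens, vocab_freq5))  # 50.0 (sat, on, mat are OOV)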
Exemplo n.º 22
0
def read_char(label_paths, vocab_file_save_path, save_vocab_file=False,
              is_test=False):
    """Read text transcript.
    Args:
        label_paths (list): list of paths to label files
        vocab_file_save_path (string): path to vocabulary files
        save_vocab_file (bool, optional): if True, save vocabulary files
        is_test (bool, optional): set True in case of the test set
    Returns:
        trans_dict (dict):
            key (string) => utterance name
            value (list) => [char_indices, char_indices_capital]
    """
    print('=====> Reading target labels...')
    trans_dict = {}
    char_set, char_capital_set = set([]), set([])
    for label_path in tqdm(label_paths):
        with open(label_path, 'r') as f:
            line = f.readlines()[-1]
            speaker = label_path.split('/')[-2]
            utt_index = basename(label_path).split('.')[0]
            utt_name = speaker + '_' + utt_index

            # Remove 「"」, 「:」, 「;」, 「!」, 「?」, 「,」, 「.」, 「-」
            # Convert to lowercase
            line = re.sub(r'[\":;!?,.-]+', '', line.strip().lower())

            transcript = ' '.join(line.split(' ')[2:])

            # Remove double spaces
            while '  ' in transcript:
                transcript = re.sub(r'  ', ' ', transcript)

            # Remove first and last space
            if transcript[0] == ' ':
                transcript = transcript[1:]
            if transcript[-1] == ' ':
                transcript = transcript[:-1]

            # Capital-divided
            for word in transcript.split(' '):
                if len(word) == 1:
                    char_capital_set.add(word.upper())
                else:
                    # Replace the first character with the capital letter
                    word = word[0].upper() + word[1:]
                    char_capital_set.add(word[0].upper())

                    # Check double-letters
                    skip_flag = False
                    for i in range(1, len(word) - 1, 1):
                        if skip_flag:
                            skip_flag = False
                            continue

                        if not skip_flag and word[i:i + 2] in DOUBLE_LETTERS:
                            char_capital_set.add(word[i:i + 2])
                            skip_flag = True
                        else:
                            char_capital_set.add(word[i])

                    # Final character
                    if not skip_flag:
                        char_capital_set.add(word[-1])

            # Convert space to "_"
            transcript = re.sub(r'\s', SPACE, transcript)

            for c in list(transcript):
                char_set.add(c)

            trans_dict[utt_name] = transcript

            # for debug
            # print(transcript)
            # print(trans_char_capital_divide)

    # Make vocabulary files
    char_vocab_file_path = mkdir_join(vocab_file_save_path, 'character.txt')
    char_capital_vocab_file_path = mkdir_join(
        vocab_file_save_path, 'character_capital_divide.txt')

    # Reserve fixed indices for SPACE and APOSTROPHE (appended at the end below)
    char_set.discard(SPACE)
    char_set.discard(APOSTROPHE)
    char_capital_set.discard(APOSTROPHE)

    # for debug
    # print(sorted(list(char_set)))
    # print(sorted(list(char_capital_set)))

    if save_vocab_file:
        # character-level
        with open(char_vocab_file_path, 'w') as f:
            char_list = sorted(list(char_set)) + [SPACE, APOSTROPHE]
            for char in char_list:
                f.write('%s\n' % char)

        # character-level (capital-divided)
        with open(char_capital_vocab_file_path, 'w') as f:
            char_capital_list = sorted(list(char_capital_set)) + [APOSTROPHE]
            for char in char_capital_list:
                f.write('%s\n' % char)

    # Tokenize
    print('=====> Tokenize...')
    char2idx = Char2idx(char_vocab_file_path)
    char2idx_capital = Char2idx(
        char_capital_vocab_file_path, capital_divide=True)
    for utt_name, transcript in tqdm(trans_dict.items()):
        if is_test:
            trans_dict[utt_name] = [transcript, transcript]
            # NOTE: save as it is
        else:
            char_indices = char2idx(transcript)
            char_indices_capital = char2idx_capital(transcript)

            char_indices = ' '.join(list(map(str, char_indices.tolist())))
            char_indices_capital = ' '.join(
                list(map(str, char_indices_capital.tolist())))

            trans_dict[utt_name] = [char_indices, char_indices_capital]

    return trans_dict
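
The capital-divide scheme above uppercases the first letter of each word, so the capital itself marks the word boundary and no explicit space symbol is needed, and it keeps known double letters as single units. A compact sketch of that scan over a single word (DOUBLE_LETTERS below is a small illustrative subset of the constant used in the repository):

DOUBLE_LETTERS = ['ll', 'tt', 'ss', 'ee', 'oo']  # illustrative subset

def capital_divide(word):
    """Split a word into capital-divide units, e.g. 'hello' -> ['H', 'e', 'll', 'o']."""
    if len(word) == 1:
        return [word.upper()]
    units = [word[0].upper()]
    i = 1
    while i < len(word):
        if word[i:i + 2] in DOUBLE_LETTERS:
            units.append(word[i:i + 2])
            i += 2
        else:
            units.append(word[i])
            i += 1
    return units

print(capital_divide('hello'))  # ['H', 'e', 'll', 'o']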
Exemplo n.º 23
0
def read_audio(audio_paths,
               tool,
               config,
               normalize,
               is_training,
               save_path=None,
               save_format=None,
               global_mean_male=None,
               global_std_male=None,
               global_mean_female=None,
               global_std_female=None,
               dtype=np.float32):
    """Read audio files.
    Args:
        audio_paths (list): paths to audio files
        tool (string): the tool to extract features,
            htk or librosa or python_speech_features
        config (dict): a configuration for feature extraction
        normalize (string):
            no => no normalization is performed
            global => normalize input features by global mean & std over
                      the training set per gender
            speaker => normalize input features by mean & std per speaker
            utterance => normalize input features by mean & std per utterance
        is_training (bool): Set True when processing the training set
        save_path (string): path to save npy files
        save_format (string, optional): numpy or htk
        global_mean_male (np.ndarray, optional): global mean of male over
            the training set
        global_std_male (np.ndarray, optional): global standard deviation
            of male over the training set
        global_mean_female (np.ndarray, optional): global mean of female
            over the training set
        global_std_female (np.ndarray, optional): global standard
            deviation of female over the training set
        dtype (optional): the type of data, default is np.float32
    Returns:
        global_mean_male (np.ndarray): global mean of male over the
            training set
        global_std_male (np.ndarray): global standard deviation of male
            over the training set
        global_mean_female (np.ndarray): global mean of female over the
            training set
        global_std_female (np.ndarray): global standard deviation of
            female over the training set
        frame_num_dict (dict):
            key => utterance name
            value => the number of frames
    """
    if not is_training:
        if global_mean_male is None or global_std_male is None:
            raise ValueError(
                'Set global mean & std computed over the training set.')
    if normalize not in ['global', 'speaker', 'utterance', 'no']:
        raise ValueError(
            'normalize must be "utterance" or "speaker" or "global" or "no".')

    # Read each audio file
    print('=====> Reading audio files...')
    audio_paths_male, audio_paths_female = [], []
    input_data_list_male, input_data_list_female = [], []
    total_frame_num_male, total_frame_num_female = 0, 0
    total_frame_num_dict = {}
    speaker_mean_dict, speaker_std_dict = {}, {}
    for audio_path in tqdm(audio_paths):
        speaker = audio_path.split('/')[-2]
        gender = speaker[0]  # f (female) or m (male)
        utt_index = basename(audio_path).split('.')[0]

        if tool == 'htk':
            input_utt, sampPeriod, parmKind = read(audio_path)
            # NOTE: audio_path is a htk file path in this case
        elif tool == 'python_speech_features':
            input_utt = w2f_psf(audio_path,
                                feature_type=config['feature_type'],
                                feature_dim=config['channels'],
                                use_energy=config['energy'],
                                use_delta1=config['delta'],
                                use_delta2=config['deltadelta'],
                                window=config['window'],
                                slide=config['slide'])
        elif tool == 'librosa':
            input_utt = w2f_librosa(audio_path,
                                    feature_type=config['feature_type'],
                                    feature_dim=config['channels'],
                                    use_energy=config['energy'],
                                    use_delta1=config['delta'],
                                    use_delta2=config['deltadelta'],
                                    window=config['window'],
                                    slide=config['slide'])

        # for debug
        # print(input_utt.shape)

        if gender == 'm':
            input_data_list_male.append(input_utt)
            audio_paths_male.append(audio_path)
        elif gender == 'f':
            input_data_list_female.append(input_utt)
            audio_paths_female.append(audio_path)
        else:
            raise ValueError('gender must be "m" or "f".')

        if is_training:
            speaker = audio_path.split('/')[-2]
            gender = speaker[0]
            frame_num_utt, feat_dim = input_utt.shape

            if gender == 'm':
                total_frame_num_male += frame_num_utt
            elif gender == 'f':
                total_frame_num_female += frame_num_utt
            else:
                raise ValueError('gender must be "m" or "f".')

            if normalize == 'speaker':
                # Initialization
                if speaker not in total_frame_num_dict.keys():
                    total_frame_num_dict[speaker] = 0
                    speaker_mean_dict[speaker] = np.zeros((feat_dim, ),
                                                          dtype=dtype)
                    speaker_std_dict[speaker] = np.zeros((feat_dim, ),
                                                         dtype=dtype)

                total_frame_num_dict[speaker] += frame_num_utt
                speaker_mean_dict[speaker] += np.sum(input_utt, axis=0)
    # NOTE: Load all data in advance because TIMIT is a small dataset.

    if is_training and normalize != 'no':
        # Compute speaker mean
        if normalize == 'speaker':
            for speaker in speaker_mean_dict.keys():
                speaker_mean_dict[speaker] /= total_frame_num_dict[speaker]

        # Compute global mean & std per gender
        print('=====> Computing global mean & std over the training set...')
        frame_offset = 0
        feat_dim = input_data_list_male[0].shape[1]
        train_data_male = np.empty((total_frame_num_male, feat_dim))
        train_data_female = np.empty((total_frame_num_female, feat_dim))
        # male
        for input_utt, audio_path in zip(tqdm(input_data_list_male),
                                         audio_paths_male):
            speaker = audio_path.split('/')[-2]
            frame_num_utt = input_utt.shape[0]
            train_data_male[frame_offset:frame_offset +
                            frame_num_utt] = input_utt
            frame_offset += frame_num_utt

            if normalize == 'speaker':
                speaker_std_dict[speaker] += np.sum(
                    np.abs(input_utt - speaker_mean_dict[speaker])**2, axis=0)
        # female
        frame_offset = 0
        for input_utt, audio_path in zip(tqdm(input_data_list_female),
                                         audio_paths_female):
            speaker = audio_path.split('/')[-2]
            frame_num_utt = input_utt.shape[0]
            train_data_female[frame_offset:frame_offset +
                              frame_num_utt] = input_utt
            frame_offset += frame_num_utt

            if normalize == 'speaker':
                speaker_std_dict[speaker] += np.sum(
                    np.abs(input_utt - speaker_mean_dict[speaker])**2, axis=0)

        # Compute speaker std
        if normalize == 'speaker':
            for speaker in speaker_std_dict.keys():
                speaker_std_dict[speaker] = np.sqrt(
                    speaker_std_dict[speaker] /
                    (total_frame_num_dict[speaker] - 1))

        global_mean_male = np.mean(train_data_male, axis=0)
        global_std_male = np.std(train_data_male, axis=0)
        global_mean_female = np.mean(train_data_female, axis=0)
        global_std_female = np.std(train_data_female, axis=0)

        if save_path is not None:
            # Save global mean & std
            np.save(join(save_path, 'global_mean_male.npy'), global_mean_male)
            np.save(join(save_path, 'global_std_male.npy'), global_std_male)
            np.save(join(save_path, 'global_mean_female.npy'),
                    global_mean_female)
            np.save(join(save_path, 'global_std_female.npy'),
                    global_std_female)

    # Save input features as npy files
    print('=====> Normalization...')
    frame_num_dict = {}
    for input_utt, audio_path in zip(
            tqdm(input_data_list_male + input_data_list_female),
            audio_paths_male + audio_paths_female):
        speaker = audio_path.split('/')[-2]
        utt_index = basename(audio_path).split('.')[0]
        gender = speaker[0]

        if normalize == 'no':
            pass
        elif normalize == 'global' or not is_training:
            # Normalize by global mean & std over the training set
            if gender == 'm':
                input_utt -= global_mean_male
                input_utt /= global_std_male
            elif gender == 'f':
                input_utt -= global_mean_female
                input_utt /= global_std_female
            else:
                raise ValueError('gender must be "m" or "f".')
        elif normalize == 'speaker':
            # Normalize by mean & std per speaker
            input_utt -= speaker_mean_dict[speaker]
            input_utt /= speaker_std_dict[speaker]
        elif normalize == 'utterance':
            # Normalize by mean & std per utterance
            utt_mean = np.mean(input_utt, axis=0, dtype=dtype)
            utt_std = np.std(input_utt, axis=0, dtype=dtype)
            input_utt = (input_utt - utt_mean) / utt_std
        else:
            raise ValueError

        frame_num_dict[speaker + '_' + utt_index] = input_utt.shape[0]

        if save_path is not None:
            # Save input features
            if save_format == 'numpy':
                np.save(
                    mkdir_join(save_path, speaker,
                               speaker + '_' + utt_index + '.npy'), input_utt)
            elif save_format == 'htk':
                write(input_utt,
                      htk_path=mkdir_join(save_path, speaker,
                                          speaker + '_' + utt_index + '.htk'),
                      sampPeriod=sampPeriod,
                      parmKind=parmKind)
            else:
                raise ValueError('save_format must be "numpy" or "htk".')

    if save_path is not None:
        # Save the frame number dictionary
        with open(join(save_path, 'frame_num.pickle'), 'wb') as f:
            pickle.dump(frame_num_dict, f)

    return (global_mean_male, global_std_male, global_mean_female,
            global_std_female, frame_num_dict)
Exemplo n.º 24
0
def main(config_path):

    # Read a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.load(f)
        corpus = config['corpus']
        feature = config['feature']
        param = config['param']

    if corpus['label_type_main'] == 'character':
        output_size_main = 147
    elif corpus['label_type_main'] == 'kanji':
        output_size_main = 3386

    if corpus['label_type_second'] == 'phone':
        output_size_second = 38
    elif corpus['label_type_second'] == 'character':
        output_size_second = 147

    # Model setting
    CTCModel = load(model_type=config['model_name'])
    network = CTCModel(
        batch_size=param['batch_size'],
        input_size=feature['input_size'] * feature['num_stack'],
        num_cell=param['num_cell'],
        num_layer=param['num_layer'],
        num_layer2=param['num_layer2'],
        #    bottleneck_dim=param['bottleneck_dim'],
        output_size=output_size_main,
        output_size2=output_size_second,
        main_task_weight=param['main_task_weight'],
        clip_gradients=param['clip_grad'],
        clip_activation=param['clip_activation'],
        dropout_ratio_input=param['dropout_input'],
        dropout_ratio_hidden=param['dropout_hidden'],
        num_proj=param['num_proj'],
        weight_decay=param['weight_decay'])

    network.model_name = config['model_name'].upper()
    network.model_name += '_' + str(param['num_cell'])
    network.model_name += '_' + str(param['num_layer'])
    network.model_name += '_' + str(param['num_layer2'])
    network.model_name += '_' + param['optimizer']
    network.model_name += '_lr' + str(param['learning_rate'])
    if param['num_proj'] != 0:
        network.model_name += '_proj' + str(param['num_proj'])
    if feature['num_stack'] != 1:
        network.model_name += '_stack' + str(feature['num_stack'])
    if param['weight_decay'] != 0:
        network.model_name += '_weightdecay' + str(param['weight_decay'])
    network.model_name += '_taskweight' + str(param['main_task_weight'])

    # Set save path
    network.model_dir = mkdir('/n/sd8/inaguma/result/csj/monolog/')
    network.model_dir = mkdir_join(network.model_dir, 'ctc')
    network.model_dir = mkdir_join(
        network.model_dir,
        corpus['label_type_main'] + '_' + corpus['label_type_second'])
    network.model_dir = mkdir_join(network.model_dir,
                                   corpus['train_data_size'])
    network.model_dir = mkdir_join(network.model_dir, network.model_name)

    # Reset model directory
    if not isfile(join(network.model_dir, 'complete.txt')):
        tf.gfile.DeleteRecursively(network.model_dir)
        tf.gfile.MakeDirs(network.model_dir)
    else:
        raise ValueError('File exists.')

    # Set process name
    setproctitle('multitaskctc_csj_' + corpus['label_type_main'] + '_' +
                 corpus['label_type_second'] + '_' + corpus['train_data_size'])

    # Save config file
    shutil.copyfile(config_path, join(network.model_dir, 'config.yml'))

    sys.stdout = open(join(network.model_dir, 'train.log'), 'w')
    print(network.model_name)
    do_train(network=network,
             optimizer=param['optimizer'],
             learning_rate=param['learning_rate'],
             batch_size=param['batch_size'],
             epoch_num=param['num_epoch'],
             label_type_main=corpus['label_type_main'],
             label_type_second=corpus['label_type_second'],
             num_stack=feature['num_stack'],
             num_skip=feature['num_skip'],
             train_data_size=corpus['train_data_size'])
    sys.stdout = sys.__stdout__
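The function above expects the YAML path as its only argument, so a typical entry point would read it from the command line. A minimal sketch (the usage string is an assumption, not taken from the original script):

if __name__ == '__main__':
    import sys
    if len(sys.argv) != 2:
        raise ValueError('Usage: python <script>.py <config_path (.yml)>')
    main(config_path=sys.argv[1])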
Exemplo n.º 25
0
def main(data_size):

    for data_type in [
            'train', 'dev_clean', 'dev_other', 'test_clean', 'test_other'
    ]:
        print('=' * 50)
        print(' ' * 20 + data_type + ' (' + data_size + ')' + ' ' * 20)
        print('=' * 50)

        ########################################
        # inputs
        ########################################
        print('=> Processing input data...')
        if args.save_format in ['numpy', 'htk']:
            input_save_path = mkdir_join(args.feature_save_path,
                                         args.save_format, data_size)
            if isfile(join(input_save_path, data_type, 'complete.txt')):
                print('Already exists.')
            else:
                if data_type == 'train':
                    if args.tool == 'htk':
                        audio_paths = path.htk(data_type='train' + data_size)
                    else:
                        audio_paths = path.wav(data_type='train' + data_size)
                    is_training = True
                    global_mean_male, global_std_male, global_mean_female, global_std_female = None, None, None, None
                else:
                    if args.tool == 'htk':
                        audio_paths = path.htk(data_type=data_type)
                    else:
                        audio_paths = path.wav(data_type=data_type)
                    is_training = False

                    # Load statistics over train dataset
                    global_mean_male = np.load(
                        join(input_save_path, 'train/global_mean_male.npy'))
                    global_std_male = np.load(
                        join(input_save_path, 'train/global_std_male.npy'))
                    global_mean_female = np.load(
                        join(input_save_path, 'train/global_mean_female.npy'))
                    global_std_female = np.load(
                        join(input_save_path, 'train/global_std_female.npy'))

                read_audio(audio_paths=audio_paths,
                           tool=args.tool,
                           config=CONFIG,
                           normalize=args.normalize,
                           speaker_gender_dict=path.speaker_gender_dict,
                           is_training=is_training,
                           save_path=mkdir_join(input_save_path, data_type),
                           save_format=args.save_format,
                           global_mean_male=global_mean_male,
                           global_mean_female=global_mean_female,
                           global_std_male=global_std_male,
                           global_std_female=global_std_female)
                # NOTE: ex.) save_path:
                # librispeech/feature/save_format/data_size/data_type/speaker/*.npy

            # Make a confirmation file to prove that dataset was saved
            # correctly
            with open(join(input_save_path, data_type, 'complete.txt'),
                      'w') as f:
                f.write('')

        ########################################
        # labels
        ########################################
        print('\n=> Processing transcripts...')
        if data_type == 'train':
            label_paths = path.trans(data_type='train' + data_size)
        else:
            label_paths = path.trans(data_type=data_type)
        save_vocab_file = True if data_type == 'train' else False
        is_test = True if 'test' in data_type else False

        speaker_dict = read_trans(label_paths=label_paths,
                                  data_size=data_size,
                                  vocab_file_save_path=mkdir_join(
                                      './config', 'vocab_files'),
                                  save_vocab_file=save_vocab_file,
                                  is_test=is_test,
                                  data_type=data_type)

        ########################################
        # dataset (csv)
        ########################################
        print('\n=> Saving dataset files...')
        dataset_save_path = mkdir_join(args.dataset_save_path,
                                       args.save_format, data_size, data_type)
        df_columns = ['frame_num', 'input_path', 'transcript']
        df_char = pd.DataFrame([], columns=df_columns)
        df_char_capital = pd.DataFrame([], columns=df_columns)
        df_word_freq1 = pd.DataFrame([], columns=df_columns)
        df_word_freq5 = pd.DataFrame([], columns=df_columns)
        df_word_freq10 = pd.DataFrame([], columns=df_columns)
        df_word_freq15 = pd.DataFrame([], columns=df_columns)

        with open(join(input_save_path, data_type, 'frame_num.pickle'),
                  'rb') as f:
            frame_num_dict = pickle.load(f)

        utt_count = 0
        df_char_list, df_char_capital_list = [], []
        df_word_freq1_list, df_word_freq5_list = [], []
        df_word_freq10_list, df_word_freq15_list = [], []
        for speaker, utt_dict in tqdm(speaker_dict.items()):
            for utt_name, indices_list in utt_dict.items():
                if args.save_format == 'numpy':
                    input_utt_save_path = join(input_save_path, data_type,
                                               speaker, utt_name + '.npy')
                elif args.save_format == 'htk':
                    input_utt_save_path = join(input_save_path, data_type,
                                               speaker, utt_name + '.htk')
                elif args.save_format == 'wav':
                    input_utt_save_path = path.utt2wav(utt_name)
                else:
                    raise ValueError(
                        'save_format must be "numpy", "htk" or "wav".')

                assert isfile(input_utt_save_path)
                frame_num = frame_num_dict[utt_name]

                char_indices, char_indices_capital, word_freq1_indices = \
                    indices_list[:3]
                word_freq5_indices, word_freq10_indices, word_freq15_indices = \
                    indices_list[3:6]

                df_char = add_element(
                    df_char, [frame_num, input_utt_save_path, char_indices])
                df_char_capital = add_element(
                    df_char_capital,
                    [frame_num, input_utt_save_path, char_indices_capital])
                df_word_freq1 = add_element(
                    df_word_freq1,
                    [frame_num, input_utt_save_path, word_freq1_indices])
                df_word_freq5 = add_element(
                    df_word_freq5,
                    [frame_num, input_utt_save_path, word_freq5_indices])
                df_word_freq10 = add_element(
                    df_word_freq10,
                    [frame_num, input_utt_save_path, word_freq10_indices])
                df_word_freq15 = add_element(
                    df_word_freq15,
                    [frame_num, input_utt_save_path, word_freq15_indices])
                utt_count += 1

                # Reset
                if utt_count == 50000:
                    df_char_list.append(df_char)
                    df_char_capital_list.append(df_char_capital)
                    df_word_freq1_list.append(df_word_freq1)
                    df_word_freq5_list.append(df_word_freq5)
                    df_word_freq10_list.append(df_word_freq10)
                    df_word_freq15_list.append(df_word_freq15)

                    df_char = pd.DataFrame([], columns=df_columns)
                    df_char_capital = pd.DataFrame([], columns=df_columns)
                    df_word_freq1 = pd.DataFrame([], columns=df_columns)
                    df_word_freq5 = pd.DataFrame([], columns=df_columns)
                    df_word_freq10 = pd.DataFrame([], columns=df_columns)
                    df_word_freq15 = pd.DataFrame([], columns=df_columns)
                    utt_count = 0

        # Last dataframe
        df_char_list.append(df_char)
        df_char_capital_list.append(df_char_capital)
        df_word_freq1_list.append(df_word_freq1)
        df_word_freq5_list.append(df_word_freq5)
        df_word_freq10_list.append(df_word_freq10)
        df_word_freq15_list.append(df_word_freq15)

        # Concatenate all dataframes
        df_char = df_char_list[0]
        df_char_capital = df_char_capital_list[0]
        df_word_freq1 = df_word_freq1_list[0]
        df_word_freq5 = df_word_freq5_list[0]
        df_word_freq10 = df_word_freq10_list[0]
        df_word_freq15 = df_word_freq15_list[0]

        for df_i in df_char_list[1:]:
            df_char = pd.concat([df_char, df_i], axis=0)
        for df_i in df_char_capital_list[1:]:
            df_char_capital = pd.concat([df_char_capital, df_i], axis=0)
        for df_i in df_word_freq1_list[1:]:
            df_word_freq1 = pd.concat([df_word_freq1, df_i], axis=0)
        for df_i in df_word_freq5_list[1:]:
            df_word_freq5 = pd.concat([df_word_freq5, df_i], axis=0)
        for df_i in df_word_freq10_list[1:]:
            df_word_freq10 = pd.concat([df_word_freq10, df_i], axis=0)
        for df_i in df_word_freq15_list[1:]:
            df_word_freq15 = pd.concat([df_word_freq15, df_i], axis=0)

        df_char.to_csv(join(dataset_save_path, 'character.csv'))
        df_char_capital.to_csv(
            join(dataset_save_path, 'character_capital_divide.csv'))
        df_word_freq1.to_csv(join(dataset_save_path, 'word_freq1.csv'))
        df_word_freq5.to_csv(join(dataset_save_path, 'word_freq5.csv'))
        df_word_freq10.to_csv(join(dataset_save_path, 'word_freq10.csv'))
        df_word_freq15.to_csv(join(dataset_save_path, 'word_freq15.csv'))
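Each CSV written above stores one utterance per row with frame_num, input_path and a transcript column of space-separated label indices (raw text for the test sets, following the is_test convention). A minimal sketch of reading one back with pandas (the parsing helper is illustrative):

import pandas as pd

df = pd.read_csv('character.csv', index_col=0)
# For non-test sets the transcript column holds space-separated indices
df['indices'] = df['transcript'].apply(
    lambda s: [int(i) for i in str(s).split()])
print(df[['frame_num', 'input_path', 'indices']].head())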
Exemplo n.º 26
0
def read_audio(audio_paths,
               speaker_dict,
               tool,
               config,
               normalize,
               is_training,
               save_path=None,
               save_format='numpy',
               global_mean_male=None,
               global_mean_female=None,
               global_std_male=None,
               global_std_female=None,
               dtype=np.float32):
    """Read HTK or WAV files.
    Args:
        audio_paths (list): paths to HTK or WAV files
        speaker_dict (dict): dictionary of speakers
            key => speaker
            value => dictionary of utterance information of each speaker
                key => utterance index
                value => [start_frame, end_frame, trans_kana, trans_kanji]
        tool (string): the tool to extract features,
            htk or librosa or python_speech_features
        config (dict): a configuration for feature extraction
        normalize (string):
            no => no normalization is conducted
            global => normalize input features by global mean & std over
                      the training set per gender
            speaker => normalize input features by mean & std per speaker
            utterance => normalize input features by mean & std per utterance
        is_training (bool): whether the given audio files belong to the
            training set
        save_path (string, optional): path to save the extracted features
        save_format (string, optional): numpy or htk
        global_mean_male (np.ndarray, optional): global mean of male over the
            training set
        global_std_male (np.ndarray, optional): global standard deviation of
            male over the training set
        global_mean_female (np.ndarray, optional): global mean of female over
            the training set
        global_std_female (np.ndarray, optional): global standard deviation of
            female over the training set
        dtype (optional): the type of data, default is np.float32
    Returns:
        global_mean_male (np.ndarray): global mean of male speakers over
            the training set
        global_mean_female (np.ndarray): global mean of female speakers
            over the training set
        global_std_male (np.ndarray): global standard deviation of male
            speakers over the training set
        global_std_female (np.ndarray): global standard deviation of
            female speakers over the training set
        frame_num_dict (dict):
            key => utterance name
            value => the number of frames
    """
    if not is_training:
        if global_mean_male is None or global_mean_female is None:
            raise ValueError(
                'Provide the mean & std computed over the training set.')
    if normalize not in ['global', 'speaker', 'utterance', 'no']:
        raise ValueError(
            'normalize must be "utterance" or "speaker" or "global" or "no".')
    if tool not in ['htk', 'python_speech_features', 'librosa']:
        raise TypeError('tool must be "htk" or "python_speech_features"' +
                        ' or "librosa".')

    audio_path_list_male, audio_path_list_female = [], []
    total_frame_num_male, total_frame_num_female = 0, 0
    total_frame_num_dict = {}
    speaker_mean_dict = {}

    # NOTE: each lecture is treated as a separate speaker

    # Loop 1: Computing global mean and statistics
    if is_training and normalize != 'no':
        print('=====> Reading audio files...')
        for i, audio_path in enumerate(tqdm(audio_paths)):
            speaker = basename(audio_path).split('.')[0]

            # Divide each audio file into utterances
            _, input_utt_sum, speaker_mean, _, total_frame_num_speaker = segment(
                audio_path,
                speaker,
                speaker_dict[speaker],
                is_training=True,
                sil_duration=0,
                tool=tool,
                config=config)

            if i == 0:
                # Initialize global statistics
                feature_dim = input_utt_sum.shape[0]
                global_mean_male = np.zeros((feature_dim, ), dtype=dtype)
                global_mean_female = np.zeros((feature_dim, ), dtype=dtype)
                global_std_male = np.zeros((feature_dim, ), dtype=dtype)
                global_std_female = np.zeros((feature_dim, ), dtype=dtype)

            # For computing global mean
            if speaker[3] == 'M':
                audio_path_list_male.append(audio_path)
                global_mean_male += input_utt_sum
                total_frame_num_male += total_frame_num_speaker
            elif speaker[3] == 'F':
                audio_path_list_female.append(audio_path)
                global_mean_female += input_utt_sum
                total_frame_num_female += total_frame_num_speaker
            else:
                raise ValueError

            # For computing speaker stddev
            if normalize == 'speaker':
                speaker_mean_dict[speaker] = speaker_mean
                total_frame_num_dict[speaker] = total_frame_num_speaker
                # NOTE: speaker mean is already computed

        print('=====> Computing global mean & stddev...')
        # Compute global mean per gender
        global_mean_male /= total_frame_num_male
        global_mean_female /= total_frame_num_female

        for audio_path in tqdm(audio_paths):
            speaker = basename(audio_path).split('.')[0]

            # Divide each audio into utterances
            input_data_dict_speaker, _, _, _, _ = segment(
                audio_path,
                speaker,
                speaker_dict[speaker],
                is_training=True,
                sil_duration=0,
                tool=tool,
                config=config)

            # For computing global stddev
            if speaker[3] == 'M':
                for input_utt in input_data_dict_speaker.values():
                    global_std_male += np.sum(np.abs(input_utt -
                                                     global_mean_male)**2,
                                              axis=0)
            elif speaker[3] == 'F':
                for input_utt in input_data_dict_speaker.values():
                    global_std_female += np.sum(np.abs(input_utt -
                                                       global_mean_female)**2,
                                                axis=0)
            else:
                raise ValueError

        # Compute global stddev per gender
        global_std_male = np.sqrt(global_std_male / (total_frame_num_male - 1))
        global_std_female = np.sqrt(global_std_female /
                                    (total_frame_num_female - 1))

        if save_path is not None:
            # Save global mean & std per gender
            np.save(join(save_path, 'global_mean_male.npy'), global_mean_male)
            np.save(join(save_path, 'global_mean_female.npy'),
                    global_mean_female)
            np.save(join(save_path, 'global_std_male.npy'), global_std_male)
            np.save(join(save_path, 'global_std_female.npy'),
                    global_std_female)

    # Loop 2: Normalization and Saving
    print('=====> Normalization...')
    frame_num_dict = {}
    sampPeriod, parmKind = None, None
    for audio_path in tqdm(audio_paths):
        speaker = basename(audio_path).split('.')[0]

        if normalize == 'speaker' and is_training:
            speaker_mean = speaker_mean_dict[speaker]
        else:
            speaker_mean = None

        # Divide each audio into utterances
        input_data_dict_speaker, _, speaker_mean, speaker_std, _ = segment(
            audio_path,
            speaker,
            speaker_dict[speaker],
            is_training=is_training,
            sil_duration=0,
            tool=tool,
            config=config,
            mean=speaker_mean)  # used for computing the speaker stddev
        # NOTE: input_data_dict_speaker has not been normalized yet

        for utt_index, input_utt in input_data_dict_speaker.items():

            if normalize == 'no':
                pass
            elif normalize == 'global' or not is_training:
                # Normalize by mean & std over the training set per gender
                if speaker[3] == 'M':
                    input_utt -= global_mean_male
                    input_utt /= global_std_male
                elif speaker[3] == 'F':
                    input_utt -= global_mean_female
                    input_utt /= global_std_female
                else:
                    raise ValueError
            elif normalize == 'speaker':
                # Normalize by mean & std per speaker
                input_utt = (input_utt - speaker_mean) / speaker_std
            elif normalize == 'utterance':
                # Normalize by mean & std per utterance
                utt_mean = np.mean(input_utt, axis=0, dtype=dtype)
                utt_std = np.std(input_utt, axis=0, dtype=dtype)
                input_utt = (input_utt - utt_mean) / utt_std
            else:
                raise ValueError

            frame_num_dict[speaker + '_' + utt_index] = input_utt.shape[0]

            if save_path is not None:
                # Save input features
                if save_format == 'numpy':
                    input_data_save_path = mkdir_join(
                        save_path, speaker, speaker + '_' + utt_index + '.npy')
                    np.save(input_data_save_path, input_utt)
                elif save_format == 'htk':
                    if sampPeriod is None:
                        _, sampPeriod, parmKind = read(audio_path)
                    write(input_utt,
                          htk_path=mkdir_join(
                              save_path, speaker,
                              speaker + '_' + utt_index + '.htk'),
                          sampPeriod=sampPeriod,
                          parmKind=parmKind)
                else:
                    raise ValueError(
                        'save_format must be "numpy" or "htk".')

    if save_path is not None:
        # Save the frame number dictionary
        with open(join(save_path, 'frame_num.pickle'), 'wb') as f:
            pickle.dump(frame_num_dict, f)

    return (global_mean_male, global_mean_female, global_std_male,
            global_std_female, frame_num_dict)
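A quick consistency check on the features saved with save_format='numpy': the first dimension of every saved array should match the entry recorded in frame_num.pickle. A minimal sketch, assuming the save_path/speaker/<speaker>_<utt_index>.npy layout used above (the directory path is hypothetical):

import pickle
from glob import glob
from os.path import basename, join

import numpy as np

save_path = './feature/train'  # hypothetical save_path
with open(join(save_path, 'frame_num.pickle'), 'rb') as f:
    frame_num_dict = pickle.load(f)

for npy_path in glob(join(save_path, '*/*.npy')):
    utt_name = basename(npy_path).split('.')[0]  # speaker + '_' + utt_index
    assert np.load(npy_path).shape[0] == frame_num_dict[utt_name]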
Exemplo n.º 27
0
def main():

    for data_type in ['train', 'dev', 'test']:
        print('=' * 50)
        print(' ' * 20 + data_type + ' ' * 20)
        print('=' * 50)

        ########################################
        # inputs
        ########################################
        print('=> Processing input data...')
        if args.save_format in ['numpy', 'htk']:
            input_save_path = mkdir_join(
                args.feature_save_path, args.save_format)
            if isfile(join(input_save_path, data_type, 'complete.txt')):
                print('Already exists.')
            else:
                if args.tool == 'htk':
                    audio_paths = path.htk(data_type=data_type)
                else:
                    audio_paths = path.wav(data_type=data_type)

                if data_type != 'train':
                    is_training = False

                    # Load statistics over train dataset
                    global_mean_male = np.load(
                        join(input_save_path, 'train/global_mean_male.npy'))
                    global_std_male = np.load(
                        join(input_save_path, 'train/global_std_male.npy'))
                    global_mean_female = np.load(
                        join(input_save_path, 'train/global_mean_female.npy'))
                    global_std_female = np.load(
                        join(input_save_path, 'train/global_std_female.npy'))
                else:
                    is_training = True
                    global_mean_male, global_std_male, global_mean_female, global_std_female = None, None, None, None

                # Read htk or wav files, and save input data and frame num dict
                read_audio(audio_paths=audio_paths,
                           tool=args.tool,
                           config=CONFIG,
                           normalize=args.normalize,
                           is_training=is_training,
                           save_path=mkdir_join(input_save_path, data_type),
                           save_format=args.save_format,
                           global_mean_male=global_mean_male,
                           global_std_male=global_std_male,
                           global_mean_female=global_mean_female,
                           global_std_female=global_std_female)
                # NOTE: ex.) save_path:
                # timit/feature/save_format/data_type/*.npy

            # Make a confirmation file to prove that dataset was saved
            # correctly
            with open(join(input_save_path, data_type, 'complete.txt'), 'w') as f:
                f.write('')

        ########################################
        # labels (phone)
        ########################################
        print('\n=> Processing transcripts (phone)...')
        save_vocab_file = True if data_type == 'train' else False
        is_test = True if data_type == 'test' else False
        trans_dict = read_phone(
            label_paths=path.phone(data_type=data_type),
            vocab_file_save_path=mkdir_join('./config', 'vocab_files'),
            save_vocab_file=save_vocab_file,
            is_test=is_test)

        ########################################
        # dataset (phone, csv)
        ########################################
        print('\n=> Saving dataset files (phone)...')
        dataset_save_path = mkdir_join(
            args.dataset_save_path, args.save_format, data_type)
        df_columns = ['frame_num', 'input_path', 'transcript']
        df_phone61 = pd.DataFrame([], columns=df_columns)
        df_phone48 = pd.DataFrame([], columns=df_columns)
        df_phone39 = pd.DataFrame([], columns=df_columns)

        with open(join(input_save_path, data_type, 'frame_num.pickle'), 'rb') as f:
            frame_num_dict = pickle.load(f)

        for utt_name, trans_list in tqdm(trans_dict.items()):
            if args.save_format == 'numpy':
                speaker = utt_name.split('_')[0]
                input_utt_save_path = join(
                    input_save_path, data_type, speaker, utt_name + '.npy')
            elif args.save_format == 'htk':
                speaker = utt_name.split('_')[0]
                input_utt_save_path = join(
                    input_save_path, data_type, speaker, utt_name + '.htk')
            elif args.save_format == 'wav':
                input_utt_save_path = path.utt2wav(utt_name)
            else:
                raise ValueError(
                    'save_format must be "numpy", "htk" or "wav".')

            assert isfile(input_utt_save_path)
            frame_num = frame_num_dict[utt_name]

            phone61_indices, phone48_indices, phone39_indices = trans_list

            df_phone61 = add_element(
                df_phone61, [frame_num, input_utt_save_path, phone61_indices])
            df_phone48 = add_element(
                df_phone48, [frame_num, input_utt_save_path, phone48_indices])
            df_phone39 = add_element(
                df_phone39, [frame_num, input_utt_save_path, phone39_indices])

        df_phone61.to_csv(join(dataset_save_path, 'phone61.csv'))
        df_phone48.to_csv(join(dataset_save_path, 'phone48.csv'))
        df_phone39.to_csv(join(dataset_save_path, 'phone39.csv'))
Exemplo n.º 28
0
def read_sdb(label_paths,
             data_size,
             vocab_file_save_path,
             is_test=False,
             save_vocab_file=False,
             data_type=None):
    """Read transcripts (.sdb) & save files (.npy).
    Args:
        label_paths (list): list of paths to label files
        data_size (string): fullset or subset
        vocab_file_save_path (string): path to vocabulary files
        is_test (bool, optional): set True when processing the test set
        save_vocab_file (bool, optional): if True, save vocabulary files
        data_type (string, optional): eval1 or eval2 or eval3
    Returns:
        speaker_dict (dict): the dictionary of utterances of each speaker
            key (string) => speaker
            value (dict) => the dictionary of utterance information of each speaker
                key (string) => utterance index
                value (list) => [start_frame, end_frame,
                                kanji_indices, kanji_div_indices,
                                kana_indices, kana_div_indices,
                                phone_indices, phone_div_indices,
                                word_freq1_indices, word_freq5_indices,
                                word_freq10_indices, word_freq15_indices]
    """
    # Make mapping dictionary from kana to phone
    kana_list = []
    kana2phone_dict = {}
    phone_set = set([])
    with open(join(vocab_file_save_path, '../kana2phone.txt'), 'r') as f:
        for line in f:
            line = line.strip().split('+')
            kana, phone_seq = line
            kana_list.append(kana)
            kana2phone_dict[kana] = phone_seq
            for phone in phone_seq.split(' '):
                phone_set.add(phone)
        kana2phone_dict[SPACE] = SIL

    print('=====> Reading target labels...')
    speaker_dict = OrderedDict()
    char_set = set([])
    word_count_dict = {}
    vocab_set = set([])
    for label_path in tqdm(label_paths):
        col_names = list(range(25))
        df = pd.read_csv(label_path,
                         names=col_names,
                         encoding='SHIFT-JIS',
                         delimiter='\t',
                         header=None)

        utt_dict = OrderedDict()
        utt_index_pre = 1
        start_frame_pre, end_frame_pre = None, None
        trans_kana, trans_kanji, trans_pos = '', '', ''
        speaker = basename(label_path).split('.')[0]
        for key, row in df.iterrows():

            # From kaldi
            time = row[3]  # Time information for segment
            word = row[5]  # Word
            # num = row[9]  # Number and point
            # About morpheme
            if isinstance(row[11], str):
                pos = row[11]  # Part Of Speech
            else:
                pos = ''
            # acf = row[12]  # A Conjugated Form
            # kacf = row[13]  # Kind of A Conjugated Form
            # kav = row[14]  # Kind of Auxiliary Verb
            # ec = row[15]  # Euphonic Change
            # other = row[16]  # Other information
            pron = row[10]  # Pronunciation for lexicon

            utt_index = int(time.split(' ')[0])
            segment = time.split(' ')[1].split('-')
            start_frame = int(float(segment[0]) * 100 + 0.5)
            end_frame = int(float(segment[1]) * 100 + 0.5)
            if start_frame_pre is None:
                start_frame_pre = start_frame
            if end_frame_pre is None:
                end_frame_pre = end_frame

            # Stack word in the same utterance
            if utt_index == utt_index_pre:
                trans_kanji += word + ' '
                trans_kana += pron + ' '
                if pos != '':
                    trans_pos += pos + ' '
                utt_index_pre = utt_index
                end_frame_pre = end_frame
                continue

            # If brackets are unbalanced, keep stacking words into the
            # current utterance
            if trans_kanji.count('(') != trans_kanji.count(')'):
                trans_kanji += word + ' '
                trans_kana += pron + ' '
                if pos != '':
                    trans_pos += pos + ' '
                utt_index_pre = utt_index
                end_frame_pre = end_frame
                continue

            if trans_kana.count('(') != trans_kana.count(')'):
                trans_kanji += word + ' '
                trans_kana += pron + ' '
                if pos != '':
                    trans_pos += pos + ' '
                utt_index_pre = utt_index
                end_frame_pre = end_frame
                continue

            # if '<P:' in trans_kana:
            #     print(label_path)
            #     print(trans_kanji)
            #     print(trans_kana)

            # Clean transcript
            trans_kanji = fix_transcript(trans_kanji)
            trans_kana = fix_transcript(trans_kana)

            # Remove double space
            while '  ' in trans_kanji:
                trans_kanji = re.sub(r'[\s]+', ' ', trans_kanji)
            while '  ' in trans_kana:
                trans_kana = re.sub(r'[\s]+', ' ', trans_kana)
            while '  ' in trans_pos:
                trans_pos = re.sub(r'[\s]+', ' ', trans_pos)

            # Skip silence-only utterances
            if trans_kanji.replace(' ', '') != '' and len(trans_pos) > 0:

                # Remove the first and last space
                if len(trans_kanji) > 0 and trans_kanji[0] == ' ':
                    trans_kanji = trans_kanji[1:]
                if len(trans_kana) > 0 and trans_kana[0] == ' ':
                    trans_kana = trans_kana[1:]
                if len(trans_kanji) > 0 and trans_kanji[-1] == ' ':
                    trans_kanji = trans_kanji[:-1]
                if len(trans_kana) > 0 and trans_kana[-1] == ' ':
                    trans_kana = trans_kana[:-1]

                # Convert space to "_"
                trans_kanji = re.sub(r'\s', SPACE, trans_kanji)
                trans_kana = re.sub(r'\s', SPACE, trans_kana)

                # Exceptional case: drop a leading 'Z_' from the kana
                if trans_kana[0:2] == 'Z_':
                    trans_kana = trans_kana[2:]

                for c in list(trans_kanji):
                    char_set.add(c)

                # Count words
                word_list = trans_kanji.split(SPACE)
                for w in word_list:
                    vocab_set.add(w)
                    if w not in word_count_dict.keys():
                        word_count_dict[w] = 0
                    word_count_dict[w] += 1

                # Convert kana character to phone
                trans_phone = ' '.join(kana2phone(trans_kana, kana2phone_dict))

                utt_dict[str(utt_index - 1).zfill(4)] = [
                    start_frame_pre, end_frame_pre, trans_kanji, trans_kana,
                    trans_phone
                ]

                # for debug
                # print(trans_kanji)
                # print(trans_kana)
                # print(trans_phone)
                # print('-----')

            # Initialization
            trans_kanji = word + ' '
            trans_kana = pron + ' '
            if pos == '':
                trans_pos = ''
            else:
                trans_pos = pos + ' '
            utt_index_pre = utt_index
            start_frame_pre = start_frame
            end_frame_pre = end_frame

        # Register all utterances of each speaker
        speaker_dict[speaker] = utt_dict

    # Make vocabulary files
    kanji_vocab_file_path = mkdir_join(vocab_file_save_path,
                                       'kanji_' + data_size + '.txt')
    kanji_div_vocab_file_path = mkdir_join(
        vocab_file_save_path, 'kanji_divide_' + data_size + '.txt')
    kana_vocab_file_path = mkdir_join(vocab_file_save_path,
                                      'kana_' + data_size + '.txt')
    kana_div_vocab_file_path = mkdir_join(vocab_file_save_path,
                                          'kana_divide_' + data_size + '.txt')
    phone_vocab_file_path = mkdir_join(vocab_file_save_path,
                                       'phone_' + data_size + '.txt')
    phone_div_vocab_file_path = mkdir_join(
        vocab_file_save_path, 'phone_divide_' + data_size + '.txt')
    word_freq1_vocab_file_path = mkdir_join(vocab_file_save_path,
                                            'word_freq1_' + data_size + '.txt')
    word_freq5_vocab_file_path = mkdir_join(vocab_file_save_path,
                                            'word_freq5_' + data_size + '.txt')
    word_freq10_vocab_file_path = mkdir_join(
        vocab_file_save_path, 'word_freq10_' + data_size + '.txt')
    word_freq15_vocab_file_path = mkdir_join(
        vocab_file_save_path, 'word_freq15_' + data_size + '.txt')

    # SPACE is excluded here; it is appended only to the *_divide vocabularies
    char_set.discard(SPACE)

    # for debug
    # print(sorted(list(char_set)))

    if save_vocab_file:
        # character-level (kanji, kanji_divide)
        kanji_set = set([])
        for char in char_set:
            if (not is_hiragana(char)) and (not is_katakana(char)):
                kanji_set.add(char)
        for kana in kana_list:
            kanji_set.add(kana)
            kanji_set.add(jaconv.kata2hira(kana))
        with open(kanji_vocab_file_path,
                  'w') as f, open(kanji_div_vocab_file_path, 'w') as f_div:
            kanji_list = sorted(list(kanji_set))
            for kanji in kanji_list:
                f.write('%s\n' % kanji)
            for kanji in kanji_list + [SPACE]:
                f_div.write('%s\n' % kanji)

        # character-level (kana, kana_divide)
        with open(kana_vocab_file_path,
                  'w') as f, open(kana_div_vocab_file_path, 'w') as f_div:
            kana_list_tmp = sorted(kana_list)
            for kana in kana_list_tmp:
                f.write('%s\n' % kana)
            for kana in kana_list_tmp + [SPACE]:
                f_div.write('%s\n' % kana)

        # phone-level (phone, phone_divide)
        with open(phone_vocab_file_path,
                  'w') as f, open(phone_div_vocab_file_path, 'w') as f_div:
            phone_list = sorted(list(phone_set))
            for phone in phone_list:
                f.write('%s\n' % phone)
            for phone in phone_list + [SIL]:
                f_div.write('%s\n' % phone)

        # word-level (threshold == 1)
        with open(word_freq1_vocab_file_path, 'w') as f:
            vocab_list = sorted(list(vocab_set)) + [OOV]
            for word in vocab_list:
                f.write('%s\n' % word)

        # word-level (threshold == 5)
        with open(word_freq5_vocab_file_path, 'w') as f:
            vocab_list = sorted([
                word
                for word, freq in list(word_count_dict.items()) if freq >= 5
            ]) + [OOV]
            for word in vocab_list:
                f.write('%s\n' % word)

        # word-level (threshold == 10)
        with open(word_freq10_vocab_file_path, 'w') as f:
            vocab_list = sorted([
                word
                for word, freq in list(word_count_dict.items()) if freq >= 10
            ]) + [OOV]
            for word in vocab_list:
                f.write('%s\n' % word)

        # word-level (threshold == 15)
        with open(word_freq15_vocab_file_path, 'w') as f:
            vocab_list = sorted([
                word
                for word, freq in list(word_count_dict.items()) if freq >= 15
            ]) + [OOV]
            for word in vocab_list:
                f.write('%s\n' % word)

    # Compute OOV rate
    if is_test:
        with open(
                join(vocab_file_save_path,
                     '../oov_rate_' + data_type + '_' + data_size + '.txt'),
                'w') as f:

            # word-level (threshold == 1)
            oov_rate = compute_oov_rate(speaker_dict,
                                        word_freq1_vocab_file_path)
            f.write('Word (freq1):\n')
            f.write('  OOV rate (test): %f %%\n' % oov_rate)

            # word-level (threshold == 5)
            oov_rate = compute_oov_rate(speaker_dict,
                                        word_freq5_vocab_file_path)
            f.write('Word (freq5):\n')
            f.write('  OOV rate (test): %f %%\n' % oov_rate)

            # word-level (threshold == 10)
            oov_rate = compute_oov_rate(speaker_dict,
                                        word_freq10_vocab_file_path)
            f.write('Word (freq10):\n')
            f.write('  OOV rate (test): %f %%\n' % oov_rate)

            # word-level (threshold == 15)
            oov_rate = compute_oov_rate(speaker_dict,
                                        word_freq15_vocab_file_path)
            f.write('Word (freq15):\n')
            f.write('  OOV rate (test): %f %%\n' % oov_rate)

    # Tokenize
    print('=====> Tokenize...')
    kanji2idx = Char2idx(kanji_vocab_file_path, double_letter=True)
    kanji2idx_div = Char2idx(kanji_div_vocab_file_path, double_letter=True)
    kana2idx = Char2idx(kana_vocab_file_path, double_letter=True)
    kana2idx_div = Char2idx(kana_div_vocab_file_path, double_letter=True)
    phone2idx = Phone2idx(phone_vocab_file_path)
    phone2idx_div = Phone2idx(phone_div_vocab_file_path)
    word2idx_freq1 = Word2idx(word_freq1_vocab_file_path)
    word2idx_freq5 = Word2idx(word_freq5_vocab_file_path)
    word2idx_freq10 = Word2idx(word_freq10_vocab_file_path)
    word2idx_freq15 = Word2idx(word_freq15_vocab_file_path)
    for speaker, utt_dict in tqdm(speaker_dict.items()):
        for utt_index, utt_info in utt_dict.items():
            start_frame, end_frame, trans_kanji, trans_kana, trans_phone = utt_info
            if is_test:
                utt_dict[utt_index] = [
                    start_frame, end_frame,
                    trans_kanji.replace(SPACE, ''), trans_kanji,
                    trans_kana.replace(SPACE, ''), trans_kana,
                    trans_phone.replace(SIL, '').replace('  ',
                                                         ' '), trans_phone,
                    trans_kanji, trans_kanji, trans_kanji, trans_kanji
                ]
            else:
                kanji_indices = kanji2idx(trans_kanji.replace(SPACE, ''))
                kanji_div_indices = kanji2idx_div(trans_kanji)
                kana_indices = kana2idx(trans_kana.replace(SPACE, ''))
                kana_div_indices = kana2idx_div(trans_kana)
                phone_indices = phone2idx(
                    trans_phone.replace(SIL, '').replace('  ', ' '))
                phone_div_indices = phone2idx_div(trans_phone)
                word_freq1_indices = word2idx_freq1(trans_kanji)
                word_freq5_indices = word2idx_freq5(trans_kanji)
                word_freq10_indices = word2idx_freq10(trans_kanji)
                word_freq15_indices = word2idx_freq15(trans_kanji)

                kanji_indices = int2str(kanji_indices)
                kanji_div_indices = int2str(kanji_div_indices)
                kana_indices = int2str(kana_indices)
                kana_div_indices = int2str(kana_div_indices)
                phone_indices = int2str(phone_indices)
                phone_div_indices = int2str(phone_div_indices)
                word_freq1_indices = int2str(word_freq1_indices)
                word_freq5_indices = int2str(word_freq5_indices)
                word_freq10_indices = int2str(word_freq10_indices)
                word_freq15_indices = int2str(word_freq15_indices)

                utt_dict[utt_index] = [
                    start_frame, end_frame, kanji_indices, kanji_div_indices,
                    kana_indices, kana_div_indices, phone_indices,
                    phone_div_indices, word_freq1_indices, word_freq5_indices,
                    word_freq10_indices, word_freq15_indices
                ]

        speaker_dict[speaker] = utt_dict

    return speaker_dict
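compute_oov_rate is not shown in this example; a minimal sketch of what such a helper could look like, assuming it splits the word-segmented kanji transcript (index 2 of each utterance entry before tokenization) on the word delimiter and counts words missing from the vocabulary file ('_' is assumed to be the SPACE delimiter used above):

def compute_oov_rate_sketch(speaker_dict, vocab_file_path, space='_'):
    with open(vocab_file_path, 'r') as f:
        vocab = set(line.strip() for line in f)

    num_words, num_oov = 0, 0
    for utt_dict in speaker_dict.values():
        for utt_info in utt_dict.values():
            trans_kanji = utt_info[2]  # word-segmented kanji transcript
            for word in trans_kanji.split(space):
                num_words += 1
                if word not in vocab:
                    num_oov += 1
    return num_oov * 100.0 / num_words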
Exemplo n.º 29
0
def read_phone(label_paths,
               vocab_file_save_path,
               save_vocab_file=False,
               is_test=False):
    """Read phone transcript.
    Args:
        label_paths (list): list of paths to label files
        vocab_file_save_path (string): path to vocabulary files
        save_vocab_file (bool, optional): if True, save vocabulary files
        is_test (bool, optional): set True in case of the test set
    Returns:
        text_dict (dict):
            key (string) => utterance name
            value (list) => list of [phone61_indices, phone48_indices, phone39_indices]
    """
    print('=====> Reading target labels...')

    # Make the mapping file (from phone to index)
    phone2phone_map_file_path = join(vocab_file_save_path,
                                     '../phone2phone.txt')
    phone61_set, phone48_set, phone39_set = set([]), set([]), set([])
    with open(phone2phone_map_file_path, 'r') as f:
        for line in f:
            line = line.strip().split()
            if line[1] != 'nan':
                phone61_set.add(line[0])
                phone48_set.add(line[1])
                phone39_set.add(line[2])
            else:
                # Ignore "q" if phone39 or phone48
                phone61_set.add(line[0])

    phone61_vocab_map_file_path = mkdir_join(vocab_file_save_path,
                                             'phone61.txt')
    phone48_vocab_map_file_path = mkdir_join(vocab_file_save_path,
                                             'phone48.txt')
    phone39_vocab_map_file_path = mkdir_join(vocab_file_save_path,
                                             'phone39.txt')

    # Save mapping file
    if save_vocab_file:
        with open(phone61_vocab_map_file_path, 'w') as f:
            for phone in sorted(list(phone61_set)):
                f.write('%s\n' % phone)
        with open(phone48_vocab_map_file_path, 'w') as f:
            for phone in sorted(list(phone48_set)):
                f.write('%s\n' % phone)
        with open(phone39_vocab_map_file_path, 'w') as f:
            for phone in sorted(list(phone39_set)):
                f.write('%s\n' % phone)

    trans_dict = {}
    for label_path in tqdm(label_paths):
        speaker = label_path.split('/')[-2]
        utt_index = basename(label_path).split('.')[0]
        utt_name = speaker + '_' + utt_index

        phone61_list = []
        with open(label_path, 'r') as f:
            for line in f:
                line = line.strip().split(' ')
                # start_frame = line[0]
                # end_frame = line[1]
                phone61_list.append(line[2])

        # Map from 61 phones to the corresponding phones
        phone48_list = map_phone2phone(phone61_list, 'phone48',
                                       phone2phone_map_file_path)
        phone39_list = map_phone2phone(phone61_list, 'phone39',
                                       phone2phone_map_file_path)

        # Convert to string
        trans_phone61 = ' '.join(phone61_list)
        trans_phone48 = ' '.join(phone48_list)
        trans_phone39 = ' '.join(phone39_list)

        # for debug
        # print(trans_phone61)
        # print(trans_phone48)
        # print(trans_phone39)
        # print('-----')

        trans_dict[utt_name] = [trans_phone61, trans_phone48, trans_phone39]

    # Tokenize
    print('=====> Tokenize...')
    phone2idx_61 = Phone2idx(phone61_vocab_map_file_path)
    phone2idx_48 = Phone2idx(phone48_vocab_map_file_path)
    phone2idx_39 = Phone2idx(phone39_vocab_map_file_path)
    for utt_name, [trans_phone61, trans_phone48,
                   trans_phone39] in tqdm(trans_dict.items()):
        if is_test:
            trans_dict[utt_name] = [
                trans_phone61, trans_phone48, trans_phone39
            ]
            # NOTE: save as it is
        else:
            phone61_indices = phone2idx_61(trans_phone61)
            phone48_indices = phone2idx_48(trans_phone48)
            phone39_indices = phone2idx_39(trans_phone39)

            phone61_indices = ' '.join(list(map(str,
                                                phone61_indices.tolist())))
            phone48_indices = ' '.join(list(map(str,
                                                phone48_indices.tolist())))
            phone39_indices = ' '.join(list(map(str,
                                                phone39_indices.tolist())))

            trans_dict[utt_name] = [
                phone61_indices, phone48_indices, phone39_indices
            ]
    return trans_dict
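map_phone2phone is assumed here to follow the phone2phone.txt format read above (three whitespace-separated columns: phone61, phone48, phone39, with 'nan' marking phones that disappear in the smaller sets). A minimal sketch of such a mapping, not the original helper:

def map_phone2phone_sketch(phone61_list, target, map_file_path):
    # target: 'phone48' or 'phone39'
    col = 1 if target == 'phone48' else 2
    mapping = {}
    with open(map_file_path, 'r') as f:
        for line in f:
            cols = line.strip().split()
            if cols[1] != 'nan':
                mapping[cols[0]] = cols[col]
    # Phones mapped to 'nan' (e.g. "q") are dropped
    return [mapping[p] for p in phone61_list if p in mapping]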
Exemplo n.º 30
0
def main(data_size):

    print('=' * 50)
    print('  data_size: %s' % data_size)
    print('=' * 50)

    ########################################
    # labels
    ########################################
    print('=> Processing transcripts...')
    speaker_dict_dict = {}  # dict of speaker_dict
    print('---------- train ----------')
    if data_size == '300h':
        speaker_dict_dict['train'] = read_trans(
            label_paths=path.trans(corpus='swbd'),
            word_boundary_paths=path.word(corpus='swbd'),
            run_root_path='./',
            vocab_file_save_path=mkdir_join('./config/vocab_files'),
            save_vocab_file=True)
    elif data_size == '2000h':
        speaker_dict_a, char_set_a, char_capital_set_a, word_count_dict_a = read_trans_fisher(
            label_paths=path.trans(corpus='fisher'),
            target_speaker='A')
        speaker_dict_b, char_set_b, char_capital_set_b, word_count_dict_b = read_trans_fisher(
            label_paths=path.trans(corpus='fisher'),
            target_speaker='B')

        # Merge the two dictionaries
        speaker_dict = merge_dicts([speaker_dict_a, speaker_dict_b])
        char_set = char_set_a | char_set_b
        char_capital_set = char_capital_set_a | char_capital_set_b
        word_count_dict_fisher = dict(
            Counter(word_count_dict_a) + Counter(word_count_dict_b))

        speaker_dict_dict['train'] = read_trans(
            label_paths=path.trans(corpus='swbd'),
            word_boundary_paths=path.word(corpus='swbd'),
            run_root_path='./',
            vocab_file_save_path=mkdir_join('./config/vocab_files'),
            save_vocab_file=True,
            speaker_dict_fisher=speaker_dict,
            char_set=char_set,
            char_capital_set=char_capital_set,
            word_count_dict=word_count_dict_fisher)
        del speaker_dict

    print('---------- eval2000 (swbd + ch) ----------')
    speaker_dict_dict['eval2000_swbd'], speaker_dict_dict['eval2000_ch'] = read_stm(
        stm_path=path.stm_path,
        pem_path=path.pem_path,
        glm_path=path.glm_path,
        run_root_path='./')

    ########################################
    # inputs
    ########################################
    print('\n=> Processing input data...')
    input_save_path = mkdir_join(
        args.feature_save_path, args.save_format, data_size)
    for data_type in ['train', 'eval2000_swbd', 'eval2000_ch']:
        print('---------- %s ----------' % data_type)
        if isfile(join(input_save_path, data_type, 'complete.txt')):
            print('Already exists.')
        else:
            if args.save_format == 'wav':
                ########################################
                # Split WAV files per utterance
                ########################################
                if data_type == 'train':
                    wav_paths = path.wav(corpus='swbd')
                    if data_size == '2000h':
                        wav_paths += path.wav(corpus='fisher')
                else:
                    wav_paths = path.wav(corpus=data_type)

                split_wav(wav_paths=wav_paths,
                          speaker_dict=speaker_dict_dict[data_type],
                          save_path=mkdir_join(input_save_path, data_type))
                # NOTE: ex.) save_path:
                # swbd/feature/save_format/data_size/data_type/speaker/utt_name.npy

            elif args.save_format in ['numpy', 'htk']:
                if data_type == 'train':
                    if args.tool == 'htk':
                        audio_paths = path.htk(corpus='swbd')
                        if data_size == '2000h':
                            audio_paths += path.htk(corpus='fisher')
                    else:
                        audio_paths = path.wav(corpus='swbd')
                        if data_size == '2000h':
                            audio_paths += path.wav(corpus='fisher')
                    is_training = True
                    global_mean, global_std = None, None
                else:
                    if args.tool == 'htk':
                        audio_paths = path.htk(corpus=data_type)
                    else:
                        audio_paths = path.wav(corpus=data_type)
                    is_training = False

                    # Load statistics over train dataset
                    global_mean = np.load(
                        join(input_save_path, 'train/global_mean.npy'))
                    global_std = np.load(
                        join(input_save_path, 'train/global_std.npy'))

                read_audio(audio_paths=audio_paths,
                           tool=args.tool,
                           config=CONFIG,
                           normalize=args.normalize,
                           speaker_dict=speaker_dict_dict[data_type],
                           is_training=is_training,
                           save_path=mkdir_join(input_save_path, data_type),
                           save_format=args.save_format,
                           global_mean=global_mean,
                           global_std=global_std)
                # NOTE: ex.) save_path:
                # swbd/feature/save_format/data_size/data_type/speaker/*.npy

            # Make a confirmation file to prove that dataset was saved
            # correctly
            with open(join(input_save_path, data_type, 'complete.txt'), 'w') as f:
                f.write('')

        ########################################
        # dataset (csv)
        ########################################
        print('\n=> Saving dataset files...')
        dataset_save_path = mkdir_join(
            args.dataset_save_path, args.save_format, data_size, data_type)

        print('---------- %s ----------' % data_type)
        df_char = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_char_capital = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_word_freq1 = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_word_freq5 = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_word_freq10 = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])
        df_word_freq15 = pd.DataFrame(
            [], columns=['frame_num', 'input_path', 'transcript'])

        with open(join(input_save_path, data_type, 'frame_num.pickle'), 'rb') as f:
            frame_num_dict = pickle.load(f)

        utt_count = 0
        df_char_list, df_char_capital_list = [], []
        df_word_freq1_list, df_word_freq5_list = [], []
        df_word_freq10_list, df_word_freq15_list = [], []
        speaker_dict = speaker_dict_dict[data_type]
        for speaker, utt_dict in tqdm(speaker_dict.items()):
            for utt_index, utt_info in utt_dict.items():
                if args.save_format == 'numpy':
                    input_utt_save_path = join(
                        input_save_path, data_type, speaker, speaker + '_' + utt_index + '.npy')
                elif args.save_format == 'htk':
                    input_utt_save_path = join(
                        input_save_path, data_type, speaker, speaker + '_' + utt_index + '.htk')
                elif args.save_format == 'wav':
                    input_utt_save_path = path.utt2wav(utt_index)
                else:
                    raise ValueError(
                        'save_format must be "numpy", "htk" or "wav".')

                assert isfile(input_utt_save_path)
                frame_num = frame_num_dict[speaker + '_' + utt_index]

                char_indices, char_indices_capital, word_freq1_indices = utt_info[2:5]
                word_freq5_indices, word_freq10_indices, word_freq15_indices = utt_info[5:8]

                series_char = pd.Series(
                    [frame_num, input_utt_save_path, char_indices],
                    index=df_char.columns)
                series_char_capital = pd.Series(
                    [frame_num, input_utt_save_path, char_indices_capital],
                    index=df_char_capital.columns)
                series_word_freq1 = pd.Series(
                    [frame_num, input_utt_save_path, word_freq1_indices],
                    index=df_word_freq1.columns)
                series_word_freq5 = pd.Series(
                    [frame_num, input_utt_save_path, word_freq5_indices],
                    index=df_word_freq5.columns)
                series_word_freq10 = pd.Series(
                    [frame_num, input_utt_save_path, word_freq10_indices],
                    index=df_word_freq10.columns)
                series_word_freq15 = pd.Series(
                    [frame_num, input_utt_save_path, word_freq15_indices],
                    index=df_word_freq15.columns)

                df_char = df_char.append(series_char, ignore_index=True)
                df_char_capital = df_char_capital.append(
                    series_char_capital, ignore_index=True)
                df_word_freq1 = df_word_freq1.append(
                    series_word_freq1, ignore_index=True)
                df_word_freq5 = df_word_freq5.append(
                    series_word_freq5, ignore_index=True)
                df_word_freq10 = df_word_freq10.append(
                    series_word_freq10, ignore_index=True)
                df_word_freq15 = df_word_freq15.append(
                    series_word_freq15, ignore_index=True)
                utt_count += 1

                # Reset
                if utt_count == 10000:
                    df_char_list.append(df_char)
                    df_char_capital_list.append(df_char_capital)
                    df_word_freq1_list.append(df_word_freq1)
                    df_word_freq5_list.append(df_word_freq5)
                    df_word_freq10_list.append(df_word_freq10)
                    df_word_freq15_list.append(df_word_freq15)

                    df_char = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    df_char_capital = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    df_word_freq1 = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    df_word_freq5 = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    df_word_freq10 = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    df_word_freq15 = pd.DataFrame(
                        [], columns=['frame_num', 'input_path', 'transcript'])
                    utt_count = 0

        # Last dataframe
        df_char_list.append(df_char)
        df_char_capital_list.append(df_char_capital)
        df_word_freq1_list.append(df_word_freq1)
        df_word_freq5_list.append(df_word_freq5)
        df_word_freq10_list.append(df_word_freq10)
        df_word_freq15_list.append(df_word_freq15)

        # Concatenate all dataframes
        df_char = df_char_list[0]
        df_char_capital = df_char_capital_list[0]
        df_word_freq1 = df_word_freq1_list[0]
        df_word_freq5 = df_word_freq5_list[0]
        df_word_freq10 = df_word_freq10_list[0]
        df_word_freq15 = df_word_freq15_list[0]

        for df_i in df_char_list[1:]:
            df_char = pd.concat([df_char, df_i], axis=0)
        for df_i in df_char_capital_list[1:]:
            df_char_capital = pd.concat([df_char_capital, df_i], axis=0)
        for df_i in df_word_freq1_list[1:]:
            df_word_freq1 = pd.concat([df_word_freq1, df_i], axis=0)
        for df_i in df_word_freq5_list[1:]:
            df_word_freq5 = pd.concat([df_word_freq5, df_i], axis=0)
        for df_i in df_word_freq10_list[1:]:
            df_word_freq10 = pd.concat([df_word_freq10, df_i], axis=0)
        for df_i in df_word_freq15_list[1:]:
            df_word_freq15 = pd.concat([df_word_freq15, df_i], axis=0)

        df_char.to_csv(join(dataset_save_path, 'character.csv'))
        df_char_capital.to_csv(
            join(dataset_save_path, 'character_capital_divide.csv'))
        df_word_freq1.to_csv(join(dataset_save_path, 'word_freq1.csv'))
        df_word_freq5.to_csv(join(dataset_save_path, 'word_freq5.csv'))
        df_word_freq10.to_csv(join(dataset_save_path, 'word_freq10.csv'))
        df_word_freq15.to_csv(join(dataset_save_path, 'word_freq15.csv'))
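Row-by-row DataFrame.append is quadratic in the number of rows (and has since been removed from pandas), which is why the loop above flushes to a list every 10,000 utterances and concatenates at the end. A simpler equivalent, shown as a sketch rather than a drop-in replacement, is to collect plain tuples and build each DataFrame once:

import pandas as pd


def build_dataset_df(rows):
    # rows: list of (frame_num, input_path, transcript) tuples collected
    # inside the utterance loop instead of calling DataFrame.append
    return pd.DataFrame(rows,
                        columns=['frame_num', 'input_path', 'transcript'])

# usage: df_char = build_dataset_df(rows_char); df_char.to_csv(...)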