Python mkdir 예제들, utils.directory.mkdir Python 예제들

예제 #1

0

파일 보기

def plot_probs(model, dataset, eval_batch_size, save_path=None):
    """Save a CTC posterior figure for every utterance of one epoch.

    Args:
        model: model to evaluate; its `posteriors` method is called per batch
        dataset: An instance of a `Dataset` class, iterated as
            `(batch, is_new_epoch)` pairs
        eval_batch_size (int): the batch size when evaluating the model
            (not read inside this function)
        save_path (string): directory that receives `<input_name>.png`
    """
    # Wipe a pre-existing output directory and recreate it empty
    has_old_dir = save_path is not None and isdir(save_path)
    if has_old_dir:
        shutil.rmtree(save_path)
        mkdir(save_path)

    for batch, is_new_epoch in dataset:
        xs = batch['xs']
        x_lens = batch['x_lens']

        # NOTE: probs: '[B, T, num_classes]'
        probs = model.posteriors(xs, x_lens, temperature=1)

        # One figure per utterance in the batch
        for b in range(len(xs)):
            n_frames = x_lens[b]
            fig_path = join(save_path, batch['input_names'][b] + '.png')
            plot_ctc_probs(
                probs[b, :n_frames, :],
                frame_num=n_frames,
                num_stack=dataset.num_stack,
                spectrogram=xs[b, :, :40],
                save_path=fig_path,
                figsize=(14, 7))

        # Stop once the dataset reports the end of the epoch
        if is_new_epoch:
            break

예제 #2

0

파일 보기

def plot(model, dataset, beam_width,
         eval_batch_size=None, save_path=None):
    """Visualize attention weights of an attention-based model.

    Args:
        model: model to evaluate; its `attention_weights` method is called
            per batch
        dataset: An instance of a `Dataset` class, iterated as
            `(batch, is_new_epoch)` pairs
        beam_width: (int): the size of beam
        eval_batch_size (int, optional): the batch size when evaluating the
            model (not read inside this function)
        save_path (string, optional): path to save attention weights plotting
    """
    # Wipe a pre-existing output directory and recreate it empty
    if save_path is not None and isdir(save_path):
        shutil.rmtree(save_path)
        mkdir(save_path)

    # Pick the index-to-token converter and the decoding length limit
    label_type = dataset.label_type
    if 'char' not in label_type:
        map_fn = Idx2word(dataset.vocab_file_path, return_list=True)
        max_decode_len = MAX_DECODE_LEN_WORD
    else:
        divide_capitals = label_type == 'character_capital_divide'
        map_fn = Idx2char(dataset.vocab_file_path,
                          capital_divide=divide_capitals,
                          return_list=True)
        max_decode_len = MAX_DECODE_LEN_CHAR

    for batch, is_new_epoch in dataset:
        xs = batch['xs']
        x_lens = batch['x_lens']

        # Decode
        best_hyps, aw, perm_idx = model.attention_weights(
            xs, x_lens,
            beam_width=beam_width,
            max_decode_len=max_decode_len)

        # Reorder the references with the permutation returned by the model
        ys = batch['ys'][perm_idx]
        y_lens = batch['y_lens'][perm_idx]

        for b in range(len(xs)):
            # Reference: test sets carry the transcript itself
            # (separated by '_'), other sets carry label indices
            if dataset.is_test:
                str_ref = ys[b][0]
            else:
                str_ref = map_fn(ys[b][:y_lens[b]])

            token_list = map_fn(best_hyps[b])

            # Figures are grouped by the first two '_'-separated name fields
            utt_name = batch['input_names'][b]
            speaker = '_'.join(utt_name.split('_')[:2])
            fig_path = mkdir_join(save_path, speaker, utt_name + '.png')

            plot_attention_weights(
                aw[b, :len(token_list), :x_lens[b]],
                label_list=token_list,
                spectrogram=xs[b, :, :dataset.input_freq],
                str_ref=str_ref,
                save_path=fig_path,
                figsize=(20, 8))

        # Stop once the dataset reports the end of the epoch
        if is_new_epoch:
            break

예제 #3

0

파일 보기

파일: plot_hierarchical_ctc_probs.py 프로젝트: dsp6414/pytorch_end2end_speech_recognition

def plot(model, dataset, eval_batch_size, save_path=None):
    """Plot main-task and sub-task CTC posteriors for every utterance.

    Args:
        model: the model to evaluate; `posteriors` and `decode` are called
            once for the main task and once with `is_sub_task=True`
        dataset: An instance of a `Dataset` class, iterated as
            `(batch, is_new_epoch)` pairs
        eval_batch_size (int): the batch size when evaluating the model;
            when not None it overwrites `dataset.batch_size`
        save_path (string): path to save figures of CTC posteriors
            NOTE(review): passed to `isdir` without a None check even though
            the default is None -- confirm callers always pass a path
    """
    # Set batch size in the evaluation
    if eval_batch_size is not None:
        dataset.batch_size = eval_batch_size

    # Wipe a pre-existing output directory and recreate it empty
    if isdir(save_path):
        shutil.rmtree(save_path)
        mkdir(save_path)

    idx2word = Idx2word(dataset.vocab_file_path)
    split_capitals = dataset.label_type_sub == 'character_capital_divide'
    idx2char = Idx2char(dataset.vocab_file_path,
                        capital_divide=split_capitals)

    for batch, is_new_epoch in dataset:
        xs = batch['xs']
        x_lens = batch['x_lens']

        # Get CTC probs
        # NOTE: probs: '[B, T, num_classes]'
        # NOTE: probs_sub: '[B, T, num_classes_sub]'
        probs = model.posteriors(xs, x_lens, temperature=1)
        probs_sub = model.posteriors(xs, x_lens,
                                     is_sub_task=True,
                                     temperature=1)

        # Decode
        best_hyps = model.decode(xs, x_lens, beam_width=1)
        best_hyps_sub = model.decode(xs, x_lens,
                                     beam_width=1,
                                     is_sub_task=True)

        # Visualize
        for b in range(len(xs)):
            # Convert from list of index to string
            str_hyp = idx2word(best_hyps[b])
            str_hyp_sub = idx2char(best_hyps_sub[b])

            # Figures are grouped by the first '_'-separated name field
            utt_name = batch['input_names'][b]
            speaker = utt_name.split('_')[0]
            n_frames = x_lens[b]

            plot_hierarchical_ctc_probs(
                probs[b, :n_frames, :],
                probs_sub[b, :n_frames, :],
                frame_num=n_frames,
                num_stack=dataset.num_stack,
                str_hyp=str_hyp,
                str_hyp_sub=str_hyp_sub,
                save_path=mkdir_join(save_path, speaker, utt_name + '.png'))

        # Stop once the dataset reports the end of the epoch
        if is_new_epoch:
            break

예제 #4

0

파일 보기

파일: plot_attention_weights.py 프로젝트: dsp6414/pytorch_end2end_speech_recognition

def plot_attention(model,
                   dataset,
                   eval_batch_size,
                   beam_width,
                   length_penalty,
                   save_path=None):
    """Visualize attention weights of the attention-based model.

    Args:
        model: model to evaluate; its `decode` method is called per batch
        dataset: An instance of a `Dataset` class, iterated as
            `(batch, is_new_epoch)` pairs
        eval_batch_size (int): the batch size when evaluating the model
            (not read inside this function)
        beam_width: (int): the size of beam
        length_penalty (float): forwarded unchanged to `model.decode`
        save_path (string, optional): path to save attention weights plotting
    """
    # Wipe a pre-existing output directory and recreate it empty
    if save_path is not None and isdir(save_path):
        shutil.rmtree(save_path)
        mkdir(save_path)

    idx2phone = Idx2phone(dataset.vocab_file_path)

    for batch, is_new_epoch in dataset:
        xs = batch['xs']
        x_lens = batch['x_lens']

        # Decode
        best_hyps, aw, perm_idx = model.decode(
            xs,
            x_lens,
            beam_width=beam_width,
            max_decode_len=MAX_DECODE_LEN_PHONE,
            length_penalty=length_penalty)

        # Reorder the references with the permutation returned by the model
        ys = batch['ys'][perm_idx]
        y_lens = batch['y_lens'][perm_idx]

        for b in range(len(xs)):
            # Reference: test sets carry the transcript itself
            # (separated by ' '), other sets carry label indices
            if dataset.is_test:
                str_ref = ys[b][0]
            else:
                str_ref = idx2phone(ys[b][:y_lens[b]])

            token_list = idx2phone(best_hyps[b])
            fig_path = join(save_path, batch['input_names'][b] + '.png')

            plot_attention_weights(
                aw[b][:len(token_list), :x_lens[b]],
                label_list=token_list,
                spectrogram=xs[b, :, :40],
                str_ref=str_ref,
                save_path=fig_path,
                figsize=(20, 8))

        # Stop once the dataset reports the end of the epoch
        if is_new_epoch:
            break

예제 #5

0

파일 보기

파일: plot_ctc_probs.py 프로젝트: dsp6414/pytorch_end2end_speech_recognition

def plot(model, dataset, eval_batch_size=None, save_path=None,
         space_index=None):
    """Plot CTC posteriors together with the decoded hypothesis.

    Args:
        model: the model to evaluate; `posteriors` and `decode` are called
            per batch
        dataset: An instance of a `Dataset` class, iterated as
            `(batch, is_new_epoch)` pairs
        eval_batch_size (int, optional): the batch size when evaluating the
            model; when not None it overwrites `dataset.batch_size`
        save_path (string): path to save figures of CTC posteriors
        space_index (int, optional): forwarded unchanged to `plot_ctc_probs`
    """
    # Set batch size in the evaluation
    if eval_batch_size is not None:
        dataset.batch_size = eval_batch_size

    # Clean directory
    if isdir(save_path):
        shutil.rmtree(save_path)
        mkdir(save_path)

    # The vocabulary file is chosen from the label type and the data size
    vocab_file_path = '../metrics/vocab_files/' + \
        dataset.label_type + '_' + dataset.data_size + '.txt'
    if dataset.label_type == 'character':
        map_fn = Idx2char(vocab_file_path)
    elif dataset.label_type == 'character_capital_divide':
        map_fn = Idx2char(vocab_file_path, capital_divide=True)
    else:
        map_fn = Idx2word(vocab_file_path)

    for batch, is_new_epoch in dataset:

        # Get CTC probs
        probs = model.posteriors(batch['xs'], batch['x_lens'], temperature=1)
        # NOTE: probs: '[B, T, num_classes]'

        # Decode; the second returned value is not needed here.
        # (The original line lacked the comma between the two targets,
        # which is a syntax error.)
        best_hyps, _ = model.decode(
            batch['xs'], batch['x_lens'], beam_width=1)

        # Visualize
        for b in range(len(batch['xs'])):

            # Convert from list of index to string
            str_pred = map_fn(best_hyps[b])

            # Figures are grouped by the first two '-'-separated name fields
            speaker, book = batch['input_names'][b].split('-')[:2]
            plot_ctc_probs(
                probs[b, :batch['x_lens'][b], :],
                frame_num=batch['x_lens'][b],
                num_stack=dataset.num_stack,
                space_index=space_index,
                str_pred=str_pred,
                save_path=mkdir_join(save_path, speaker, book,
                                     batch['input_names'][b] + '.png'))

        # Stop once the dataset reports the end of the epoch
        if is_new_epoch:
            break

예제 #6

0

파일 보기

def plot(model, dataset, eval_batch_size, beam_width, beam_width_sub,
         length_penalty, save_path=None):
    """Visualize main-task and sub-task attention weights.

    Args:
        model: model to evaluate; `decode` is called once for the main task
            and once with `task_index=1`
        dataset: An instance of a `Dataset` class, iterated as
            `(batch, is_new_epoch)` pairs
        eval_batch_size (int): the batch size when evaluating the model
            (not read inside this function)
        beam_width: (int): the size of beam in the main task
        beam_width_sub: (int): the size of beam in the sub task
        length_penalty (float): not read inside this function
        save_path (string, optional): path to save attention weights plotting
    """
    # Wipe a pre-existing output directory and recreate it empty
    if save_path is not None and isdir(save_path):
        shutil.rmtree(save_path)
        mkdir(save_path)

    map_fn_main = Idx2word(dataset.vocab_file_path, return_list=True)
    map_fn_sub = Idx2char(dataset.vocab_file_path_sub, return_list=True)

    for batch, is_new_epoch in dataset:
        xs = batch['xs']
        x_lens = batch['x_lens']

        # Decode both tasks; the third returned value is unused here
        best_hyps, aw, _ = model.decode(
            xs, x_lens,
            beam_width=beam_width,
            max_decode_len=MAX_DECODE_LEN_WORD)
        best_hyps_sub, aw_sub, _ = model.decode(
            xs, x_lens,
            beam_width=beam_width_sub,
            max_decode_len=MAX_DECODE_LEN_CHAR,
            task_index=1)

        for b in range(len(xs)):
            word_list = map_fn_main(best_hyps[b])
            char_list = map_fn_sub(best_hyps_sub[b])

            # Figures are grouped by the first '_'-separated name field
            utt_name = batch['input_names'][b]
            speaker = utt_name.split('_')[0]
            n_frames = x_lens[b]

            plot_hierarchical_attention_weights(
                aw[b][:len(word_list), :n_frames],
                aw_sub[b][:len(char_list), :n_frames],
                label_list=word_list,
                label_list_sub=char_list,
                spectrogram=xs[b, :, :dataset.input_freq],
                save_path=mkdir_join(save_path, speaker, utt_name + '.png'),
                figsize=(40, 8))

        # Stop once the dataset reports the end of the epoch
        if is_new_epoch:
            break

예제 #7

0

파일 보기

파일: base.py 프로젝트: dsp6414/pytorch_end2end_speech_recognition

 def set_save_path(self, save_path):
     """Choose an unused model directory based on `save_path` and store it.

     A candidate directory counts as taken when it already contains
     `complete.txt` or `config.yml`. While the candidate is taken, the
     next one is `save_path + '_' + <counter>` with the counter starting
     at 1. The value returned by `mkdir(candidate)` is assigned to
     `self.save_path`.

     Args:
         save_path (string): preferred directory for the model
     """
     suffix = 0
     candidate = save_path
     # Both marker files lead to the same action, so they share one test
     while (isfile(join(candidate, 'complete.txt')) or
            isfile(join(candidate, 'config.yml'))):
         suffix += 1
         candidate = save_path + '_' + str(suffix)
     self.save_path = mkdir(candidate)

예제 #8

0

파일 보기

파일: train_multitask_ctc.py 프로젝트: sky1170447398/tensorflow_end2end_speech_recognition

def main(config_path, model_save_path):
    """Build a multitask CTC model from a YAML config file and train it.

    Standard output is redirected to `train.log` inside the model
    directory while `do_train` runs. The previous `sys.stdout` is put back
    and the log file is closed afterwards, even when training raises.

    Args:
        config_path (string): path to the config file (.yml); its 'param'
            section supplies every hyperparameter read below
        model_save_path (string): root directory under which the model
            directory is created
    """
    # Load a config file (.yml)
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary Python objects and is rejected by PyYAML >= 6. Consider
    # yaml.safe_load if the config files use no custom tags.
    with open(config_path, "r") as f:
        config = yaml.load(f)
        params = config['param']

    # Except for a blank class
    if params['label_type_main'] == 'character':
        params['num_classes_main'] = 28
    elif params['label_type_main'] == 'character_capital_divide':
        params['num_classes_main'] = 72

    if params['label_type_sub'] == 'phone61':
        params['num_classes_sub'] = 61
    elif params['label_type_sub'] == 'phone48':
        params['num_classes_sub'] = 48
    elif params['label_type_sub'] == 'phone39':
        params['num_classes_sub'] = 39

    # Model setting
    model = Multitask_CTC(encoder_type=params['encoder_type'],
                          input_size=params['input_size'] *
                          params['num_stack'],
                          num_units=params['num_units'],
                          num_layers_main=params['num_layers_main'],
                          num_layers_sub=params['num_layers_sub'],
                          num_classes_main=params['num_classes_main'],
                          num_classes_sub=params['num_classes_sub'],
                          main_task_weight=params['main_task_weight'],
                          lstm_impl=params['lstm_impl'],
                          use_peephole=params['use_peephole'],
                          parameter_init=params['weight_init'],
                          clip_grad=params['clip_grad'],
                          clip_activation=params['clip_activation'],
                          num_proj=params['num_proj'],
                          weight_decay=params['weight_decay'])

    # Set process name (uses the model name before the suffixes below)
    setproctitle('timit_' + model.name + '_' + params['label_type_main'] +
                 '_' + params['label_type_sub'])

    # Encode the hyperparameters into the model name; settings left at
    # their neutral value (0 or 1) are skipped
    model.name += '_' + str(params['num_units'])
    model.name += '_main' + str(params['num_layers_main'])
    model.name += '_sub' + str(params['num_layers_sub'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    if params['num_proj'] != 0:
        model.name += '_proj' + str(params['num_proj'])
    if params['dropout_input'] != 1:
        model.name += '_dropi' + str(params['dropout_input'])
    if params['dropout_hidden'] != 1:
        model.name += '_droph' + str(params['dropout_hidden'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += '_wd' + str(params['weight_decay'])
    model.name += '_main' + str(params['main_task_weight'])

    # Set save path
    model.save_path = mkdir_join(model_save_path, 'ctc',
                                 'char_' + params['label_type_sub'],
                                 model.name)

    # Reset model directory: a directory holding 'complete.txt' (finished
    # run) or 'config.yml' (unfinished run) is never reused; a numeric
    # suffix is appended until a free directory is found
    model_index = 0
    new_model_path = model.save_path
    while (isfile(join(new_model_path, 'complete.txt')) or
           isfile(join(new_model_path, 'config.yml'))):
        model_index += 1
        new_model_path = model.save_path + '_' + str(model_index)
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    # TODO(hirofumi): change to logger
    # Redirect stdout into the training log, then restore it and close
    # the file so the handle does not leak and later prints are not lost
    previous_stdout = sys.stdout
    with open(join(model.save_path, 'train.log'), 'w') as log_file:
        sys.stdout = log_file
        try:
            do_train(model=model, params=params)
        finally:
            sys.stdout = previous_stdout

예제 #9

0

파일 보기

파일: plot_nested_attention_weights.py 프로젝트: dsp6414/pytorch_end2end_speech_recognition

def plot(model,
         dataset,
         beam_width,
         beam_width_sub,
         eval_batch_size=None,
         a2c_oracle=False,
         save_path=None):
    """Visualize attention weights of a nested attention-based model.

    Two figures are written per utterance: word/character attention over
    the acoustic frames, and word-to-character attention.

    Args:
        model: model to evaluate; its `attention_weights` method is called
            per batch
        dataset: An instance of a `Dataset` class, iterated as
            `(batch, is_new_epoch)` pairs
        beam_width: (int): the size of beam in the main task
        beam_width_sub: (int): the size of beam in the sub task
        eval_batch_size (int, optional): the batch size when evaluating the
            model (not read inside this function)
        a2c_oracle (bool, optional): when True, sub-task references are
            passed to the model and `teacher_forcing` is enabled
        save_path (string, optional): path to save attention weights plotting
    """
    # Wipe a pre-existing output directory and recreate it empty
    if save_path is not None and isdir(save_path):
        shutil.rmtree(save_path)
        mkdir(save_path)

    idx2word = Idx2word(dataset.vocab_file_path, return_list=True)
    idx2char = Idx2char(dataset.vocab_file_path_sub, return_list=True)

    for batch, is_new_epoch in dataset:
        batch_size = len(batch['xs'])

        # Sub-task references are only needed in oracle mode
        ys_sub = None
        y_lens_sub = None
        if a2c_oracle:
            if not dataset.is_test:
                ys_sub = batch['ys_sub']
                y_lens_sub = batch['y_lens_sub']
            else:
                # Test sets carry transcripts (separated by '_'), so they
                # are converted to index arrays here
                transcripts = [batch['ys_sub'][b][0]
                               for b in range(batch_size)]
                max_label_num = max(
                    [0] + [len(list(text)) for text in transcripts])

                # Pad with -1
                ys_sub = np.full((batch_size, max_label_num), -1,
                                 dtype=np.int32)
                y_lens_sub = np.zeros((batch_size, ), dtype=np.int32)
                for b, text in enumerate(transcripts):
                    # NOTE(review): `char2idx` is not defined in this
                    # function -- presumably a module-level converter
                    indices = char2idx(text)
                    ys_sub[b, :len(indices)] = indices
                    y_lens_sub[b] = len(indices)

        best_hyps, best_hyps_sub, aw, aw_sub, aw_dec = model.attention_weights(
            batch['xs'],
            batch['x_lens'],
            beam_width=beam_width,
            beam_width_sub=beam_width_sub,
            max_decode_len=MAX_DECODE_LEN_WORD,
            max_decode_len_sub=MAX_DECODE_LEN_CHAR,
            teacher_forcing=a2c_oracle,
            ys_sub=ys_sub,
            y_lens_sub=y_lens_sub)

        for b in range(batch_size):
            word_list = idx2word(best_hyps[b])
            # The sub task may itself be word-level
            if 'word' in dataset.label_type_sub:
                char_list = idx2word(best_hyps_sub[b])
            else:
                char_list = idx2char(best_hyps_sub[b])

            utt_name = batch['input_names'][b]
            n_words = len(word_list)
            n_chars = len(char_list)
            n_frames = batch['x_lens'][b]

            # word to acoustic & character to acoustic
            plot_hierarchical_attention_weights(
                aw[b][:n_words, :n_frames],
                aw_sub[b][:n_chars, :n_frames],
                label_list=word_list,
                label_list_sub=char_list,
                spectrogram=batch['xs'][b, :, :dataset.input_freq],
                save_path=mkdir_join(save_path, utt_name + '.png'),
                figsize=(40, 8))

            # word to character
            plot_word2char_attention_weights(
                aw_dec[b][:n_words, :n_chars],
                label_list=word_list,
                label_list_sub=char_list,
                save_path=mkdir_join(save_path, utt_name + '_word2char.png'),
                figsize=(40, 8))

        # Stop once the dataset reports the end of the epoch
        if is_new_epoch:
            break

예제 #10

0

파일 보기

파일: train_multitask_ctc.py 프로젝트: PuchoDeepLearningLabs/tensorflow_end2end_speech_recognition

def main(config_path):
    """Build a multitask CTC network from a YAML config file and train it.

    Standard output is redirected to `train.log` inside the model
    directory while training runs. It is reset to `sys.__stdout__` and the
    log file is closed afterwards, even when training raises.

    Args:
        config_path (string): path to the config file (.yml) with the
            'corpus', 'feature', 'param' and 'model_name' entries read below

    Raises:
        ValueError: if the model directory already contains 'complete.txt'
    """
    # Read a config file (.yml)
    # NOTE(review): yaml.load without an explicit Loader can construct
    # arbitrary Python objects and is rejected by PyYAML >= 6. Consider
    # yaml.safe_load if the config files use no custom tags.
    with open(config_path, "r") as f:
        config = yaml.load(f)
        corpus = config['corpus']
        feature = config['feature']
        param = config['param']

    # Output layer sizes are fixed per label type
    if corpus['label_type_main'] == 'character':
        output_size_main = 147
    elif corpus['label_type_main'] == 'kanji':
        output_size_main = 3386

    if corpus['label_type_second'] == 'phone':
        output_size_second = 38
    elif corpus['label_type_second'] == 'character':
        output_size_second = 147

    # Model setting
    CTCModel = load(model_type=config['model_name'])
    network = CTCModel(
        batch_size=param['batch_size'],
        input_size=feature['input_size'] * feature['num_stack'],
        num_unit=param['num_unit'],
        num_layer_main=param['num_layer_main'],
        num_layer_second=param['num_layer_second'],
        #    bottleneck_dim=param['bottleneck_dim'],
        output_size_main=output_size_main,
        output_size_second=output_size_second,
        main_task_weight=param['main_task_weight'],
        parameter_init=param['weight_init'],
        clip_grad=param['clip_grad'],
        clip_activation=param['clip_activation'],
        dropout_ratio_input=param['dropout_input'],
        dropout_ratio_hidden=param['dropout_hidden'],
        num_proj=param['num_proj'],
        weight_decay=param['weight_decay'])

    # Encode the hyperparameters into the model name; settings left at
    # their neutral value (0 or 1) are skipped
    network.model_name = config['model_name'].upper()
    network.model_name += '_' + str(param['num_unit'])
    network.model_name += '_main' + str(param['num_layer_main'])
    network.model_name += '_second' + str(param['num_layer_second'])
    network.model_name += '_' + param['optimizer']
    network.model_name += '_lr' + str(param['learning_rate'])
    if param['bottleneck_dim'] != 0:
        network.model_name += '_bottoleneck' + str(param['bottleneck_dim'])
    if param['num_proj'] != 0:
        network.model_name += '_proj' + str(param['num_proj'])
    if feature['num_stack'] != 1:
        network.model_name += '_stack' + str(feature['num_stack'])
    if param['weight_decay'] != 0:
        network.model_name += '_weightdecay' + str(param['weight_decay'])
    network.model_name += '_taskweight' + str(param['main_task_weight'])
    if corpus['train_data_size'] == 'large':
        network.model_name += '_large'

    # Set save path
    network.model_dir = mkdir('/n/sd8/inaguma/result/csj/monolog/')
    network.model_dir = mkdir_join(network.model_dir, 'ctc')
    network.model_dir = mkdir_join(
        network.model_dir,
        corpus['label_type_main'] + '_' + corpus['label_type_second'])
    network.model_dir = mkdir_join(network.model_dir, network.model_name)

    # Reset model directory: an unfinished run is wiped, a finished one
    # (marked by 'complete.txt') is never overwritten
    if not isfile(join(network.model_dir, 'complete.txt')):
        tf.gfile.DeleteRecursively(network.model_dir)
        tf.gfile.MakeDirs(network.model_dir)
    else:
        raise ValueError('File exists.')

    # Set process name
    setproctitle('multitaskctc_csj_' + corpus['label_type_main'] + '_' +
                 corpus['label_type_second'] + '_' + corpus['train_data_size'])

    # Save config file
    shutil.copyfile(config_path, join(network.model_dir, 'config.yml'))

    # Redirect stdout into the training log. The `with` block closes the
    # file and the `finally` resets stdout even if training fails.
    with open(join(network.model_dir, 'train.log'), 'w') as log_file:
        sys.stdout = log_file
        try:
            print(network.model_name)
            do_train(network=network,
                     optimizer=param['optimizer'],
                     learning_rate=param['learning_rate'],
                     batch_size=param['batch_size'],
                     epoch_num=param['num_epoch'],
                     label_type_main=corpus['label_type_main'],
                     label_type_second=corpus['label_type_second'],
                     num_stack=feature['num_stack'],
                     num_skip=feature['num_skip'],
                     train_data_size=corpus['train_data_size'])
        finally:
            sys.stdout = sys.__stdout__

예제 #11

0

파일 보기

파일: plot_hierarchical_attention_weights.py 프로젝트: carolinebear/pytorch_end2end_speech_recognition

def main():
    """Plot word- and character-level attention weights on the 'eval1' set.

    Reads the command-line arguments through the module-level `parser`,
    restores the model stored under `args.model_path`, and writes one
    figure per utterance below `<model_path>/att_weights/<speaker>/`.
    """
    args = parser.parse_args()
    model_path = args.model_path

    # Load a config file (.yml)
    params = load_config(join(model_path, 'config.yml'), is_eval=True)

    # Load dataset (other available splits: 'eval2', 'eval3')
    dataset = Dataset(
        data_save_path=args.data_save_path,
        backend=params['backend'],
        input_freq=params['input_freq'],
        use_delta=params['use_delta'],
        use_double_delta=params['use_double_delta'],
        data_type='eval1',
        data_size=params['data_size'],
        label_type=params['label_type'],
        label_type_sub=params['label_type_sub'],
        batch_size=args.eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=False,
        reverse=False,
        tool=params['tool'])

    # The class counts come from the dataset, not from the config file
    params['num_classes'] = dataset.num_classes
    params['num_classes_sub'] = dataset.num_classes_sub

    # Load model
    model = load(model_type=params['model_type'],
                 params=params,
                 backend=params['backend'])

    # Restore the saved parameters
    model.load_checkpoint(save_path=model_path, epoch=args.epoch)

    # GPU setting
    model.set_cuda(deterministic=False, benchmark=True)

    save_path = mkdir_join(model_path, 'att_weights')

    # Wipe a pre-existing output directory and recreate it empty
    if save_path is not None and isdir(save_path):
        shutil.rmtree(save_path)
        mkdir(save_path)

    # Options shared by the main-task and sub-task decoding calls
    shared_decode_opts = dict(
        length_penalty=args.length_penalty,
        coverage_penalty=args.coverage_penalty)

    for batch, is_new_epoch in dataset:
        xs = batch['xs']
        x_lens = batch['x_lens']

        # Decode both tasks; the third returned value is unused here
        best_hyps, aw, _ = model.decode(
            xs,
            x_lens,
            beam_width=args.beam_width,
            max_decode_len=MAX_DECODE_LEN_WORD,
            min_decode_len=MIN_DECODE_LEN_WORD,
            **shared_decode_opts)
        best_hyps_sub, aw_sub, _ = model.decode(
            xs,
            x_lens,
            beam_width=args.beam_width_sub,
            max_decode_len=MAX_DECODE_LEN_CHAR,
            min_decode_len=MIN_DECODE_LEN_CHAR,
            task_index=1,
            **shared_decode_opts)

        for b in range(len(xs)):
            word_list = dataset.idx2word(best_hyps[b], return_list=True)
            char_list = dataset.idx2char(best_hyps_sub[b], return_list=True)

            # Figures are grouped by the first '_'-separated name field
            utt_name = batch['input_names'][b]
            speaker = utt_name.split('_')[0]
            n_frames = x_lens[b]

            plot_hierarchical_attention_weights(
                aw[b][:len(word_list), :n_frames],
                aw_sub[b][:len(char_list), :n_frames],
                label_list=word_list,
                label_list_sub=char_list,
                spectrogram=xs[b, :, :dataset.input_freq],
                save_path=mkdir_join(save_path, speaker, utt_name + '.png'),
                figsize=(40, 8))

        # Stop once the dataset reports the end of the epoch
        if is_new_epoch:
            break

예제 #12

0

파일 보기

파일: plot_attention_weights.py 프로젝트: dsp6414/pytorch_end2end_speech_recognition

def plot_attention(model,
                   dataset,
                   max_decode_len,
                   eval_batch_size=None,
                   save_path=None):
    """Visualize attention weights of an attention-based model.

    Args:
        model: model to evaluate; its `attention_weights` method is called
            per batch
        dataset: An instance of a `Dataset` class, iterated as
            `(batch, is_new_epoch)` pairs
        max_decode_len (int): the length of output sequences
            to stop prediction when EOS token has not been emitted.
        eval_batch_size (int, optional): the batch size when evaluating the
            model; when not None it overwrites `dataset.batch_size`
        save_path (string, optional): path to save attention weights plotting
            NOTE(review): passed to `isdir` without a None check even though
            the default is None -- confirm callers always pass a path
    """
    # Set batch size in the evaluation
    if eval_batch_size is not None:
        dataset.batch_size = eval_batch_size

    # Wipe a pre-existing output directory and recreate it empty
    if isdir(save_path):
        shutil.rmtree(save_path)
        mkdir(save_path)

    # The vocabulary file is chosen from the label type and the data size
    label_type = dataset.label_type
    vocab_file_path = ('../metrics/vocab_files/' +
                       label_type + '_' + dataset.data_size + '.txt')
    converter_cls = Idx2char if 'char' in label_type else Idx2word
    map_fn = converter_cls(vocab_file_path)

    for batch, is_new_epoch in dataset:

        # Decode
        # NOTE: att_weights: `[B, T_out, T_in]`
        best_hyps, att_weights = model.attention_weights(
            batch['xs'], batch['x_lens'], max_decode_len=max_decode_len)

        # Visualize
        for b in range(len(batch['xs'])):
            str_pred = map_fn(best_hyps[b])
            eos = '>' in str_pred

            # NOTE: Truncate at <EOS>
            str_pred = str_pred.split('>')[0]

            # Remove the last space
            if str_pred.endswith('_'):
                str_pred = str_pred[:-1]

            # Put the end mark back as its own label
            if eos:
                str_pred += '_>'

            labels = str_pred.split('_')

            # Figures are grouped by the first '_'-separated name field
            utt_name = batch['input_names'][b]
            speaker = utt_name.split('_')[0]

            plot_attention_weights(
                attention_weights=att_weights[
                    b, :len(labels), :batch['x_lens'][b]],
                label_list=labels,
                save_path=mkdir_join(save_path, speaker, utt_name + '.png'),
                figsize=(20, 8))

        # Stop once the dataset reports the end of the epoch
        if is_new_epoch:
            break

예제 #13

0

파일 보기

파일: plot_attention_weights.py 프로젝트: carolinebear/pytorch_end2end_speech_recognition

def main():
    """Plot attention weights of a single-task model on the 'eval1' set.

    Reads the command-line arguments through the module-level `parser`,
    restores the model stored under `args.model_path`, and writes one
    figure per utterance below `<model_path>/att_weights/<speaker>/`.
    """
    args = parser.parse_args()
    model_path = args.model_path

    # Load a config file (.yml)
    params = load_config(join(model_path, 'config.yml'), is_eval=True)

    # Load dataset (other available splits: 'eval2', 'eval3')
    dataset = Dataset(
        data_save_path=args.data_save_path,
        backend=params['backend'],
        input_freq=params['input_freq'],
        use_delta=params['use_delta'],
        use_double_delta=params['use_double_delta'],
        data_type='eval1',
        data_size=params['data_size'],
        label_type=params['label_type'],
        batch_size=args.eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=False,
        reverse=False,
        tool=params['tool'])

    # The class count comes from the dataset, not from the config file
    params['num_classes'] = dataset.num_classes

    # Load model
    model = load(model_type=params['model_type'],
                 params=params,
                 backend=params['backend'])

    # Restore the saved parameters
    model.load_checkpoint(save_path=model_path, epoch=args.epoch)

    # GPU setting
    model.set_cuda(deterministic=False, benchmark=True)

    save_path = mkdir_join(model_path, 'att_weights')

    # Wipe a pre-existing output directory and recreate it empty
    if save_path is not None and isdir(save_path):
        shutil.rmtree(save_path)
        mkdir(save_path)

    # Word labels get the word converter and limits; anything else is
    # handled as characters
    if dataset.label_type != 'word':
        map_fn = dataset.idx2char
        max_decode_len = MAX_DECODE_LEN_CHAR
        min_decode_len = MIN_DECODE_LEN_CHAR
    else:
        map_fn = dataset.idx2word
        max_decode_len = MAX_DECODE_LEN_WORD
        min_decode_len = MIN_DECODE_LEN_WORD

    for batch, is_new_epoch in dataset:
        xs = batch['xs']
        x_lens = batch['x_lens']

        # Decode
        best_hyps, aw, perm_idx = model.decode(
            xs, x_lens,
            beam_width=args.beam_width,
            max_decode_len=max_decode_len,
            min_decode_len=min_decode_len,
            length_penalty=args.length_penalty,
            coverage_penalty=args.coverage_penalty)

        # Reorder the references with the permutation returned by the model
        ys = batch['ys'][perm_idx]
        y_lens = batch['y_lens'][perm_idx]

        for b in range(len(xs)):
            # Reference: test sets carry the transcript itself
            # (separated by '_'), other sets carry label indices
            if dataset.is_test:
                str_ref = ys[b][0]
            else:
                str_ref = map_fn(ys[b][:y_lens[b]])

            token_list = map_fn(best_hyps[b], return_list=True)

            # Figures are grouped by the first '_'-separated name field
            utt_name = batch['input_names'][b]
            speaker = utt_name.split('_')[0]

            plot_attention_weights(
                aw[b][:len(token_list), :x_lens[b]],
                label_list=token_list,
                spectrogram=xs[b, :, :dataset.input_freq],
                str_ref=str_ref,
                save_path=mkdir_join(save_path, speaker, utt_name + '.png'),
                figsize=(20, 8))

        # Stop once the dataset reports the end of the epoch
        if is_new_epoch:
            break

예제 #14

0

파일 보기

파일: train_ctc.py 프로젝트: zz12375/tensorflow_end2end_speech_recognition

def main(config_path, model_save_path):
    """Build a CTC model from a config file and start training.
    Args:
        config_path (string): path to the config file (.yml); the
            hyperparameters are read from its `param` section
        model_save_path (string): root directory under which the model
            directory is created
    Raises:
        ValueError: if `feature` is not supported, or if `label_type` is
            not supported and the config does not give `num_classes`
    """
    # Load a config file (.yml)
    # NOTE: `yaml.load` without an explicit Loader is deprecated and is an
    # error on recent PyYAML; `safe_load` is enough for a plain config.
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    params = config['param']

    if params['feature'] == 'fbank':
        input_size = 123
    elif params['feature'] == 'is13':
        input_size = 141
    else:
        # Fail here instead of with a NameError when the model is built
        raise ValueError('Unsupported feature: %s' % params['feature'])

    # Except for a blank class
    if params['label_type'] in ['original', 'phone3']:
        params['num_classes'] = 3
    elif params['label_type'] == 'phone4':
        params['num_classes'] = 4
    elif params['label_type'] == 'phone43':
        params['num_classes'] = 43
    if 'num_classes' not in params:
        # Neither derived from label_type nor supplied by the config
        raise ValueError('Unsupported label_type: %s' % params['label_type'])

    # Model setting
    model = CTC(encoder_type=params['encoder_type'],
                input_size=input_size * params['num_stack'],
                splice=params['splice'],
                num_units=params['num_units'],
                num_layers=params['num_layers'],
                num_classes=params['num_classes'],
                lstm_impl=params['lstm_impl'],
                use_peephole=params['use_peephole'],
                parameter_init=params['weight_init'],
                clip_grad_norm=params['clip_grad_norm'],
                clip_activation=params['clip_activation'],
                num_proj=params['num_proj'],
                weight_decay=params['weight_decay'])

    # Set process name (uses the model name before the suffixes below)
    setproctitle('tf_svc_' + model.name + '_' + params['label_type'])

    # Encode the main hyperparameters into the model name
    model.name += '_' + str(params['num_units'])
    model.name += '_' + str(params['num_layers'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    if params['num_proj'] != 0:
        model.name += '_proj' + str(params['num_proj'])
    if params['dropout'] != 0:
        model.name += '_drop' + str(params['dropout'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += '_wd' + str(params['weight_decay'])

    # Set save path
    model.save_path = mkdir_join(
        model_save_path, 'ctc', params['label_type'], model.name)

    # Reset model directory
    # A directory that already holds `complete.txt` (finished run) or
    # `config.yml` (run started earlier) is taken: append an increasing
    # index until an unused directory name is found.
    model_index = 0
    new_model_path = model.save_path
    while (isfile(join(new_model_path, 'complete.txt')) or
           isfile(join(new_model_path, 'config.yml'))):
        model_index += 1
        new_model_path = model.save_path + '_' + str(model_index)
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    # Redirect stdout to the training log while training runs; the log is
    # closed and stdout restored even if training raises.
    # TODO(hirofumi): change to logger
    stdout_orig = sys.stdout
    with open(join(model.save_path, 'train.log'), 'w') as log_file:
        sys.stdout = log_file
        try:
            do_train(model=model, params=params)
        finally:
            sys.stdout = stdout_orig

예제 #15

0

파일 보기

파일: train_joint_ctc_attention.py 프로젝트: sky1170447398/tensorflow_end2end_speech_recognition

def main(config_path, model_save_path):
    """Build a joint CTC-attention model from a config file and train it.
    Args:
        config_path (string): path to the config file (.yml); the
            hyperparameters are read from its `param` section
        model_save_path (string): root directory under which the model
            directory is created
    Raises:
        ValueError: if `label_type` is not supported and the config does
            not give `att_num_classes` and `ctc_num_classes`
    """
    # Load a config file (.yml)
    # NOTE: `yaml.load` without an explicit Loader is deprecated and is an
    # error on recent PyYAML; `safe_load` is enough for a plain config.
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    params = config['param']

    params['sos_index'] = 0
    params['eos_index'] = 1
    if params['label_type'] == 'phone61':
        params['att_num_classes'] = 63
        params['ctc_num_classes'] = 61
    elif params['label_type'] == 'phone48':
        params['att_num_classes'] = 50
        params['ctc_num_classes'] = 48
    elif params['label_type'] == 'phone39':
        params['att_num_classes'] = 41
        params['ctc_num_classes'] = 39
    elif params['label_type'] == 'character':
        params['att_num_classes'] = 30
        params['ctc_num_classes'] = 28
    if 'att_num_classes' not in params or 'ctc_num_classes' not in params:
        # Neither derived from label_type nor supplied by the config
        raise ValueError('Unsupported label_type: %s' % params['label_type'])

    # Model setting
    model = JointCTCAttention(
        input_size=params['input_size'],
        encoder_num_unit=params['encoder_num_unit'],
        encoder_num_layer=params['encoder_num_layer'],
        attention_dim=params['attention_dim'],
        attention_type=params['attention_type'],
        decoder_num_unit=params['decoder_num_unit'],
        decoder_num_layer=params['decoder_num_layer'],
        embedding_dim=params['embedding_dim'],
        att_num_classes=params['att_num_classes'],
        ctc_num_classes=params['ctc_num_classes'],
        att_task_weight=params['att_task_weight'],
        sos_index=params['sos_index'],
        eos_index=params['eos_index'],
        max_decode_length=params['max_decode_length'],
        attention_weights_tempareture=params['attention_weights_tempareture'],
        logits_tempareture=params['logits_tempareture'],
        parameter_init=params['weight_init'],
        clip_grad=params['clip_grad'],
        clip_activation_encoder=params['clip_activation_encoder'],
        clip_activation_decoder=params['clip_activation_decoder'],
        weight_decay=params['weight_decay'])

    # Set process name (uses the name given by the model class; the name
    # is replaced right below)
    setproctitle('timit_' + model.name + '_' + params['label_type'])

    # Encode the main hyperparameters into the model name
    model.name = params['model']
    model.name += '_encoder' + str(params['encoder_num_unit'])
    model.name += '_' + str(params['encoder_num_layer'])
    model.name += '_attdim' + str(params['attention_dim'])
    model.name += '_decoder' + str(params['decoder_num_unit'])
    model.name += '_' + str(params['decoder_num_layer'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    model.name += '_' + params['attention_type']
    if params['attention_weights_tempareture'] != 1:
        model.name += '_sharpening' + \
            str(params['attention_weights_tempareture'])
    if params['weight_decay'] != 0:
        model.name += '_weightdecay' + str(params['weight_decay'])

    # Set save path
    model.save_path = mkdir_join(model_save_path, 'attention',
                                 params['label_type'], model.name)

    # Reset model directory
    # A directory that already holds `complete.txt` (finished run) or
    # `config.yml` (run started earlier) is taken: append an increasing
    # index until an unused directory name is found.
    model_index = 0
    new_model_path = model.save_path
    while (isfile(join(new_model_path, 'complete.txt')) or
           isfile(join(new_model_path, 'config.yml'))):
        model_index += 1
        new_model_path = model.save_path + '_' + str(model_index)
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    # NOTE: stdout is not redirected to a log file in this script
    # TODO(hirofumi): change to logger
    do_train(model=model, params=params)

예제 #16

0

파일 보기

파일: plot_ctc_prob.py 프로젝트: seasky100/tensorflow_end2end_speech_recognition

def plot(session, posteriors_op, model, dataset, label_type,
         num_stack=1, save_path=None, show=False):
    """Visualize label posteriors of CTC model.
    Args:
        session: session of training model
        posteriors_op: operation for computing posteriors
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        label_type (string): phone39 or phone48 or phone61 or character or
            character_capital_divide (not used inside this function)
        num_stack (int): the number of frames to stack
        save_path (string, optional): directory to save ctc outputs.
            If None, figures are not saved
        show (bool, optional): if True, show each figure
    """
    # Clean directory
    # NOTE: `isdir(None)` raises, so the default `save_path=None` must be
    # checked first. The directory is (re)created whenever figures are to
    # be saved, not only when it already existed.
    if save_path is not None:
        if isdir(save_path):
            shutil.rmtree(save_path)
        mkdir(save_path)

    for data, is_new_epoch in dataset:

        # Create feed dictionary for next mini batch
        inputs, _, inputs_seq_len, input_names = data

        feed_dict = {
            model.inputs_pl_list[0]: inputs,
            model.inputs_seq_len_pl_list[0]: inputs_seq_len,
            model.keep_prob_pl_list[0]: 1.0
        }

        # Compute posteriors and arrange them as one matrix per utterance
        batch_size, max_frame_num = inputs.shape[:2]
        probs = session.run(posteriors_op, feed_dict=feed_dict)
        probs = probs.reshape(-1, max_frame_num, model.num_classes)

        # Visualize
        for i_batch in range(batch_size):
            # Use this utterance's own length (not the first one's) so the
            # posteriors and the time axis always have the same length
            frame_num = int(inputs_seq_len[i_batch])
            prob = probs[i_batch][:frame_num, :]
            times_probs = np.arange(frame_num) * num_stack / 100

            fig = plt.figure(figsize=(10, 4))

            # NOTE: Blank class is set to the last class in TensorFlow
            for i in range(0, prob.shape[-1] - 1, 1):
                plt.plot(times_probs, prob[:, i])
            plt.plot(times_probs, prob[:, -1],
                     ':', label='blank', color='grey')
            plt.xlabel('Time [sec]', fontsize=12)
            plt.ylabel('Posteriors', fontsize=12)
            plt.xlim([0, frame_num * num_stack / 100])
            plt.ylim([0.05, 1.05])
            plt.xticks(list(range(0, int(frame_num * num_stack / 100) + 1, 1)))
            plt.yticks(list(range(0, 2, 1)))
            plt.legend(loc="upper right", fontsize=12)

            # Save as a png file, one file per utterance.
            # Saving comes before showing: once an interactive window has
            # been closed the figure may no longer hold the drawing.
            if save_path is not None:
                plt.savefig(join(save_path, input_names[i_batch] + '.png'),
                            dpi=500)

            if show:
                plt.show()

            # Release the figure so figures do not pile up over the dataset
            plt.close(fig)

        if is_new_epoch:
            break

예제 #17

0

파일 보기

파일: train_multitask_ctc.py 프로젝트: zz12375/tensorflow_end2end_speech_recognition

def main(config_path, model_save_path):
    """Build a multitask CTC model from a config file and start training.
    Args:
        config_path (string): path to the config file (.yml); the
            hyperparameters are read from its `param` section
        model_save_path (string): root directory under which the model
            directory is created
    Raises:
        TypeError: if `label_type_main` or `label_type_sub` is not supported
        ValueError: if the model directory already holds `complete.txt`
    """
    # Read a config file (.yml)
    # NOTE: `yaml.load` without an explicit Loader is deprecated and is an
    # error on recent PyYAML; `safe_load` is enough for a plain config.
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    params = config['param']

    # Except for a blank label
    if params['label_type_main'] == 'kanji':
        params['num_classes_main'] = 3386
    elif params['label_type_main'] == 'kana':
        params['num_classes_main'] = 147
    else:
        raise TypeError

    if params['label_type_sub'] == 'kana':
        params['num_classes_sub'] = 147
    elif params['label_type_sub'] == 'phone':
        params['num_classes_sub'] = 38
    else:
        # The original had a bare `TypeError` expression here, which did
        # nothing; raise like the main-task branch above does.
        raise TypeError

    # Model setting
    model = load(model_type=params['model'])
    model = model(batch_size=params['batch_size'],
                  input_size=params['input_size'],
                  splice=params['splice'],
                  num_stack=params['num_stack'],
                  num_units=params['num_units'],
                  num_layer_main=params['num_layer_main'],
                  num_layer_sub=params['num_layer_sub'],
                  num_classes_main=params['num_classes_main'],
                  num_classes_sub=params['num_classes_sub'],
                  main_task_weight=params['main_task_weight'],
                  parameter_init=params['weight_init'],
                  clip_grad_norm=params['clip_grad_norm'],
                  clip_activation=params['clip_activation'],
                  num_proj=params['num_proj'],
                  weight_decay=params['weight_decay'])

    # Encode the main hyperparameters into the model name
    model.model_name = params['model']
    model.model_name += '_' + str(params['num_units'])
    model.model_name += '_main' + str(params['num_layer_main'])
    model.model_name += '_sub' + str(params['num_layer_sub'])
    model.model_name += '_' + params['optimizer']
    model.model_name += '_lr' + str(params['learning_rate'])
    if params['bottleneck_dim'] != 0:
        model.model_name += '_bottoleneck' + str(params['bottleneck_dim'])
    if params['num_proj'] != 0:
        model.model_name += '_proj' + str(params['num_proj'])
    if params['num_stack'] != 1:
        model.model_name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.model_name += '_weightdecay' + str(params['weight_decay'])
    model.model_name += '_taskweight' + str(params['main_task_weight'])
    if params['train_data_size'] == 'large':
        model.model_name += '_large'

    # Set save path
    model.save_path = mkdir(model_save_path)
    model.save_path = mkdir_join(model.save_path, 'ctc')
    model.save_path = mkdir_join(
        model.save_path,
        params['label_type_main'] + '_' + params['label_type_sub'])
    model.save_path = mkdir_join(model.save_path, model.model_name)

    # Reset model directory
    # A finished run (marked by `complete.txt`) is never overwritten; any
    # other content of the directory is discarded.
    if isfile(join(model.save_path, 'complete.txt')):
        raise ValueError('File exists.')
    tf.gfile.DeleteRecursively(model.save_path)
    tf.gfile.MakeDirs(model.save_path)

    # Set process name
    setproctitle('csj_multictc_' + params['label_type_main'] + '_' +
                 params['label_type_sub'] + '_' + params['train_data_size'])

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    # Redirect stdout to the training log while training runs; the log is
    # closed and stdout restored even if training raises.
    stdout_orig = sys.stdout
    with open(join(model.save_path, 'train.log'), 'w') as log_file:
        sys.stdout = log_file
        try:
            do_train(model=model, params=params)
        finally:
            sys.stdout = stdout_orig

예제 #18

0

파일 보기

파일: train_joint_ctc_attention.py 프로젝트: seasky100/tensorflow_end2end_speech_recognition

def main(config_path, model_save_path):
    """Build a joint CTC-attention model from a config file and train it.
    Args:
        config_path (string): path to the config file (.yml); the
            hyperparameters are read from its `param` section
        model_save_path (string): root directory under which the model
            directory is created
    Raises:
        TypeError: if `label_type` is not supported
    """
    # Load a config file (.yml)
    # NOTE: `yaml.load` without an explicit Loader is deprecated and is an
    # error on recent PyYAML; `safe_load` is enough for a plain config.
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    params = config['param']

    # Except for a <SOS> and <EOS> class
    if params['label_type'] == 'phone61':
        params['num_classes'] = 61
    elif params['label_type'] == 'phone48':
        params['num_classes'] = 48
    elif params['label_type'] == 'phone39':
        params['num_classes'] = 39
    elif params['label_type'] == 'character':
        params['num_classes'] = 28
    elif params['label_type'] == 'character_capital_divide':
        params['num_classes'] = 72
    else:
        raise TypeError

    # Model setting
    # <SOS> and <EOS> take the two indices right after the label classes
    model = JointCTCAttention(
        input_size=params['input_size'] * params['num_stack'],
        encoder_type=params['encoder_type'],
        encoder_num_units=params['encoder_num_units'],
        encoder_num_layers=params['encoder_num_layers'],
        encoder_num_proj=params['encoder_num_proj'],
        attention_type=params['attention_type'],
        attention_dim=params['attention_dim'],
        decoder_type=params['decoder_type'],
        decoder_num_units=params['decoder_num_units'],
        decoder_num_layers=params['decoder_num_layers'],
        embedding_dim=params['embedding_dim'],
        lambda_weight=params['lambda_weight'],
        num_classes=params['num_classes'],
        sos_index=params['num_classes'],
        eos_index=params['num_classes'] + 1,
        max_decode_length=params['max_decode_length'],
        lstm_impl='LSTMBlockCell',
        use_peephole=params['use_peephole'],
        parameter_init=params['weight_init'],
        clip_grad_norm=params['clip_grad_norm'],
        clip_activation_encoder=params['clip_activation_encoder'],
        clip_activation_decoder=params['clip_activation_decoder'],
        weight_decay=params['weight_decay'],
        time_major=True,
        sharpening_factor=params['sharpening_factor'],
        logits_temperature=params['logits_temperature'])

    # Set process name (uses the model name before the suffixes below)
    setproctitle('tf_timit_' + model.name + '_' +
                 params['label_type'] + '_' + params['attention_type'])

    # Encode the main hyperparameters into the model name
    model.name += '_en' + str(params['encoder_num_units'])
    model.name += '_' + str(params['encoder_num_layers'])
    model.name += '_att' + str(params['attention_dim'])
    model.name += '_de' + str(params['decoder_num_units'])
    model.name += '_' + str(params['decoder_num_layers'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    model.name += '_' + params['attention_type']
    if params['dropout_encoder'] != 0:
        model.name += '_dropen' + str(params['dropout_encoder'])
    if params['dropout_decoder'] != 0:
        model.name += '_dropde' + str(params['dropout_decoder'])
    if params['dropout_embedding'] != 0:
        model.name += '_dropem' + str(params['dropout_embedding'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        # The '_' separator was missing here, which glued this segment to
        # the previous one. NOTE: this changes the directory name of runs
        # that use weight decay.
        model.name += '_wd' + str(params['weight_decay'])
    if params['sharpening_factor'] != 1:
        model.name += '_sharp' + str(params['sharpening_factor'])
    if params['logits_temperature'] != 1:
        model.name += '_temp' + str(params['logits_temperature'])
    model.name += '_lambda' + str(params['lambda_weight'])

    # Set save path
    model.save_path = mkdir_join(
        model_save_path, 'joint_ctc_attention', params['label_type'], model.name)

    # Reset model directory
    # A directory that already holds `complete.txt` (finished run) or
    # `config.yml` (run started earlier) is taken: append an increasing
    # index until an unused directory name is found.
    model_index = 0
    new_model_path = model.save_path
    while (isfile(join(new_model_path, 'complete.txt')) or
           isfile(join(new_model_path, 'config.yml'))):
        model_index += 1
        new_model_path = model.save_path + '_' + str(model_index)
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    # Redirect stdout to the training log while training runs; the log is
    # closed and stdout restored even if training raises.
    # TODO(hirofumi): change to logger
    stdout_orig = sys.stdout
    with open(join(model.save_path, 'train.log'), 'w') as log_file:
        sys.stdout = log_file
        try:
            do_train(model=model, params=params)
        finally:
            sys.stdout = stdout_orig

예제 #19

0

파일 보기

def plot(session,
         decode_op,
         attention_weights_op,
         model,
         dataset,
         label_type,
         is_test=False,
         save_path=None,
         show=False):
    """Visualize attention weights of attention-based model.
    Args:
        session: session of training model
        decode_op: operation for decoding
        attention_weights_op: operation for computing attention weights
        model: model to evaluate
        dataset: An instance of a `Dataset` class
        label_type (string): phone39 or phone48 or phone61 or character or
            character_capital_divide. Only label types containing 'phone'
            can be plotted at the moment
        is_test (bool, optional): not used inside this function
        save_path (string, optional): directory to save attention weights
            plotting. If None, figures are not saved
        show (bool, optional): if True, show each figure
    Raises:
        NotImplementedError: if `label_type` does not contain 'phone'
    """
    # Clean directory
    # The directory is (re)created whenever figures are to be saved, not
    # only when it already existed.
    if save_path is not None:
        if isdir(save_path):
            shutil.rmtree(save_path)
        mkdir(save_path)

    if label_type == 'character':
        map_fn = Idx2char(
            map_file_path='../metrics/mapping_files/character.txt')
    elif label_type == 'character_capital_divide':
        map_fn = Idx2char(
            map_file_path=
            '../metrics/mapping_files/character_capital_divide.txt',
            capital_divide=True)
    else:
        map_fn = Idx2phone(map_file_path='../metrics/mapping_files/' +
                           label_type + '.txt')

    for data, is_new_epoch in dataset:

        # Create feed dictionary for next mini batch
        inputs, _, inputs_seq_len, _, input_names = data

        # NOTE(review): only element 0 of `inputs` / `inputs_seq_len` is
        # fed, while the batch size below is read from `inputs` itself.
        # Presumably the data carries a leading per-device axis; confirm
        # against the dataset class.
        feed_dict = {
            model.inputs_pl_list[0]: inputs[0],
            model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
            model.keep_prob_encoder_pl_list[0]: 1.0,
            model.keep_prob_decoder_pl_list[0]: 1.0,
            model.keep_prob_embedding_pl_list[0]: 1.0
        }

        # Visualize
        batch_size = inputs.shape[0]
        attention_weights, labels_pred = session.run(
            [attention_weights_op, decode_op], feed_dict=feed_dict)

        for i_batch in range(batch_size):

            # Convert from index to label
            str_pred = map_fn(labels_pred[i_batch])
            if 'phone' in label_type:
                label_list = str_pred.split(' ')
            else:
                raise NotImplementedError

            fig = plt.figure(figsize=(10, 4))
            sns.heatmap(attention_weights[i_batch],
                        cmap='Blues',
                        xticklabels=False,
                        yticklabels=label_list)

            plt.xlabel('Input frames', fontsize=12)
            plt.ylabel('Output labels (top to bottom)', fontsize=12)

            # Save as a png file.
            # Saving comes before showing: once an interactive window has
            # been closed the figure may no longer hold the drawing.
            # NOTE(review): the file name always uses `input_names[0]`, so
            # with more than one utterance per batch the files overwrite
            # each other; confirm the layout of `input_names`.
            if save_path is not None:
                plt.savefig(join(save_path, input_names[0] + '.png'), dpi=500)

            if show:
                plt.show()

            # Release the figure so figures do not pile up over the dataset
            plt.close(fig)

        if is_new_epoch:
            break

예제 #20

0

파일 보기

파일: plot_ctc_prob.py 프로젝트: zz12375/tensorflow_end2end_speech_recognition

def plot(session,
         posteriors_op,
         model,
         dataset,
         label_type,
         num_stack=1,
         save_path=None,
         show=False):
    """Visualize label posteriors of CTC model.
    Args:
        session: session of training model
        posteriors_op: operation for computing posteriors
        model: the model to evaluate
        dataset: An instance of a `Dataset` class
        label_type (string): phone39 or phone48 or phone61 or character or
            character_capital_divide (not used inside this function)
        num_stack (int): the number of frames to stack
        save_path (string, optional): directory to save ctc outputs.
            If None, figures are not saved
        show (bool, optional): if True, show each figure
    """
    # Clean directory
    # NOTE: `isdir(None)` raises, so the default `save_path=None` must be
    # checked first. The directory is (re)created whenever figures are to
    # be saved, not only when it already existed.
    if save_path is not None:
        if isdir(save_path):
            shutil.rmtree(save_path)
        mkdir(save_path)

    for data, is_new_epoch in dataset:

        # Create feed dictionary for next mini batch
        inputs, _, inputs_seq_len, input_names = data

        feed_dict = {
            model.inputs_pl_list[0]: inputs,
            model.inputs_seq_len_pl_list[0]: inputs_seq_len,
            model.keep_prob_pl_list[0]: 1.0
        }

        # Compute posteriors and arrange them as one matrix per utterance
        batch_size, max_frame_num = inputs.shape[:2]
        probs = session.run(posteriors_op, feed_dict=feed_dict)
        probs = probs.reshape(-1, max_frame_num, model.num_classes)

        # Visualize
        for i_batch in range(batch_size):
            # Use this utterance's own length (not the first one's) so the
            # posteriors and the time axis always have the same length
            frame_num = int(inputs_seq_len[i_batch])
            prob = probs[i_batch][:frame_num, :]
            times_probs = np.arange(frame_num) * num_stack / 100

            fig = plt.figure(figsize=(10, 4))

            # NOTE: Blank class is set to the last class in TensorFlow
            for i in range(0, prob.shape[-1] - 1, 1):
                plt.plot(times_probs, prob[:, i])
            plt.plot(times_probs,
                     prob[:, -1],
                     ':',
                     label='blank',
                     color='grey')
            plt.xlabel('Time [sec]', fontsize=12)
            plt.ylabel('Posteriors', fontsize=12)
            plt.xlim([0, frame_num * num_stack / 100])
            plt.ylim([0.05, 1.05])
            plt.xticks(list(range(0, int(frame_num * num_stack / 100) + 1, 1)))
            plt.yticks(list(range(0, 2, 1)))
            plt.legend(loc="upper right", fontsize=12)

            # Save as a png file, one file per utterance.
            # Saving comes before showing: once an interactive window has
            # been closed the figure may no longer hold the drawing.
            if save_path is not None:
                plt.savefig(join(save_path, input_names[i_batch] + '.png'),
                            dpi=500)

            if show:
                plt.show()

            # Release the figure so figures do not pile up over the dataset
            plt.close(fig)

        if is_new_epoch:
            break

예제 #21

0

파일 보기

파일: train_ctc.py 프로젝트: seasky100/tensorflow_end2end_speech_recognition

def main(config_path, model_save_path, gpu_indices):
    """Build a CTC model from a config file and start training.
    Args:
        config_path (string): path to the config file (.yml); the
            hyperparameters are read from its `param` section
        model_save_path (string): root directory under which the model
            directory is created
        gpu_indices (list): GPU indices, passed on to training; when two
            or more are given their count is added to the model name
    Raises:
        TypeError: if `label_type` is not supported
        ValueError: if `num_classes` cannot be determined for a kanji
            label type because `train_data_size` is not supported
    """
    # Load a config file (.yml)
    # NOTE: `yaml.load` without an explicit Loader is deprecated and is an
    # error on recent PyYAML; `safe_load` is enough for a plain config.
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    params = config['param']

    # Except for a blank label
    if params['label_type'] == 'kana':
        params['num_classes'] = 146
    elif params['label_type'] == 'kana_divide':
        params['num_classes'] = 147
    elif params['label_type'] == 'kanji':
        if params['train_data_size'] == 'train_subset':
            params['num_classes'] = 2981
        elif params['train_data_size'] == 'train_fullset':
            params['num_classes'] = 3385
    elif params['label_type'] == 'kanji_divide':
        if params['train_data_size'] == 'train_subset':
            params['num_classes'] = 2982
        elif params['train_data_size'] == 'train_fullset':
            params['num_classes'] = 3386
    else:
        raise TypeError
    if 'num_classes' not in params:
        # Kanji label type with a data size that is not handled above
        raise ValueError(
            'Unsupported train_data_size: %s' % params['train_data_size'])

    # Model setting
    model = CTC(encoder_type=params['encoder_type'],
                input_size=params['input_size'],
                splice=params['splice'],
                num_stack=params['num_stack'],
                num_units=params['num_units'],
                num_layers=params['num_layers'],
                num_classes=params['num_classes'],
                lstm_impl=params['lstm_impl'],
                use_peephole=params['use_peephole'],
                parameter_init=params['weight_init'],
                clip_grad_norm=params['clip_grad_norm'],
                clip_activation=params['clip_activation'],
                num_proj=params['num_proj'],
                weight_decay=params['weight_decay'])

    # Set process name (uses the model name before the suffixes below)
    setproctitle(
        'tf_csj_' + model.name + '_' + params['train_data_size'] + '_' + params['label_type'])

    # Encode the main hyperparameters into the model name
    model.name += '_' + str(params['num_units'])
    model.name += '_' + str(params['num_layers'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    if params['num_proj'] != 0:
        model.name += '_proj' + str(params['num_proj'])
    if params['dropout'] != 0:
        model.name += '_drop' + str(params['dropout'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += '_wd' + str(params['weight_decay'])
    if params['bottleneck_dim'] != 0:
        model.name += '_bottle' + str(params['bottleneck_dim'])
    if len(gpu_indices) >= 2:
        model.name += '_gpu' + str(len(gpu_indices))

    # Set save path
    model.save_path = mkdir_join(
        model_save_path, 'ctc', params['label_type'],
        params['train_data_size'], model.name)

    # Reset model directory
    # A directory that already holds `complete.txt` (finished run) or
    # `config.yml` (run started earlier) is taken: append an increasing
    # index until an unused directory name is found.
    model_index = 0
    new_model_path = model.save_path
    while (isfile(join(new_model_path, 'complete.txt')) or
           isfile(join(new_model_path, 'config.yml'))):
        model_index += 1
        new_model_path = model.save_path + '_' + str(model_index)
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    # Redirect stdout to the training log while training runs; the log is
    # closed and stdout restored even if training raises.
    # TODO(hirofumi): change to logger
    stdout_orig = sys.stdout
    with open(join(model.save_path, 'train.log'), 'w') as log_file:
        sys.stdout = log_file
        try:
            do_train(model=model, params=params, gpu_indices=gpu_indices)
        finally:
            sys.stdout = stdout_orig

예제 #22

0

파일 보기

def main(config_path):
    """Build a BLSTM attention model from a config file and train it.
    Args:
        config_path (string): path to the config file (.yml); its
            `corpus`, `feature` and `param` sections are read
    Raises:
        ValueError: if `label_type` is not supported, or if the model
            directory already holds `complete.txt`
    """
    # Load a config file (.yml)
    # NOTE: `yaml.load` without an explicit Loader is deprecated and is an
    # error on recent PyYAML; `safe_load` is enough for a plain config.
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    corpus = config['corpus']
    feature = config['feature']
    param = config['param']

    if corpus['label_type'] == 'phone61':
        output_size = 63
    elif corpus['label_type'] == 'phone48':
        output_size = 50
    elif corpus['label_type'] == 'phone39':
        output_size = 41
    elif corpus['label_type'] == 'character':
        output_size = 33
    else:
        # Fail here instead of with a NameError when the model is built
        raise ValueError('Unsupported label_type: %s' % corpus['label_type'])

    # Model setting
    # The last two output indices are used as <SOS> and <EOS>
    network = blstm_attention_seq2seq.BLSTMAttetion(
        batch_size=param['batch_size'],
        input_size=feature['input_size'],
        encoder_num_unit=param['encoder_num_unit'],
        encoder_num_layer=param['encoder_num_layer'],
        attention_dim=param['attention_dim'],
        decoder_num_unit=param['decoder_num_unit'],
        decoder_num_layer=param['decoder_num_layer'],
        embedding_dim=param['embedding_dim'],
        output_size=output_size,
        sos_index=output_size - 2,
        eos_index=output_size - 1,
        max_decode_length=param['max_decode_length'],
        attention_weights_tempareture=param['attention_weights_tempareture'],
        logits_tempareture=param['logits_tempareture'],
        parameter_init=param['weight_init'],
        clip_grad=param['clip_grad'],
        clip_activation_encoder=param['clip_activation_encoder'],
        clip_activation_decoder=param['clip_activation_decoder'],
        dropout_ratio_input=param['dropout_input'],
        dropout_ratio_hidden=param['dropout_hidden'],
        weight_decay=param['weight_decay'])

    # Encode the main hyperparameters into the model name
    network.model_name = config['model_name'].upper()
    network.model_name += '_encoder' + str(param['encoder_num_unit'])
    network.model_name += '_' + str(param['encoder_num_layer'])
    network.model_name += '_attdim' + str(param['attention_dim'])
    network.model_name += '_decoder' + str(param['decoder_num_unit'])
    network.model_name += '_' + str(param['decoder_num_layer'])
    network.model_name += '_' + param['optimizer']
    network.model_name += '_lr' + str(param['learning_rate'])
    if param['weight_decay'] != 0:
        network.model_name += '_weightdecay' + str(param['weight_decay'])

    # Set save path
    network.model_dir = mkdir('/n/sd8/inaguma/result/timit/attention/')
    network.model_dir = mkdir_join(network.model_dir, corpus['label_type'])
    network.model_dir = mkdir_join(network.model_dir, network.model_name)

    # Reset model directory
    # A finished run (marked by `complete.txt`) is never overwritten; any
    # other content of the directory is discarded.
    if isfile(join(network.model_dir, 'complete.txt')):
        raise ValueError('File exists.')
    tf.gfile.DeleteRecursively(network.model_dir)
    tf.gfile.MakeDirs(network.model_dir)

    # Set process name
    setproctitle('attention_timit_' + corpus['label_type'])

    # Save config file
    shutil.copyfile(config_path, join(network.model_dir, 'config.yml'))

    # Redirect stdout to the training log while training runs; the log is
    # closed and stdout restored even if training raises.
    with open(join(network.model_dir, 'train.log'), 'w') as log_file:
        sys.stdout = log_file
        try:
            print(network.model_name)
            do_train(network=network,
                     optimizer=param['optimizer'],
                     learning_rate=param['learning_rate'],
                     batch_size=param['batch_size'],
                     epoch_num=param['num_epoch'],
                     label_type=corpus['label_type'],
                     eos_index=output_size - 1)
        finally:
            sys.stdout = sys.__stdout__

예제 #23

0

파일 보기

def main(config_path, model_save_path, gpu_indices):
    """Build a student CTC model from a config file and start training.
    Args:
        config_path (string): path to the config file (.yml); the
            hyperparameters are read from its `param` section
        model_save_path (string): root directory under which the model
            directory is created
        gpu_indices (list): GPU indices, passed on to training; when two
            or more are given their count is added to the model name
    """
    # Load a config file (.yml)
    # NOTE: `yaml.load` without an explicit Loader is deprecated and is an
    # error on recent PyYAML; `safe_load` is enough for a plain config.
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    params = config['param']

    # Except for a blank class
    params['num_classes'] = 28

    # Model setting
    model = StudentCTC(
        encoder_type=params['encoder_type'],
        input_size=params['input_size'] *
        params['num_stack'] * params['splice'],
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_classes=params['num_classes'],
        parameter_init=params['weight_init'],
        clip_grad_norm=params['clip_grad_norm'],
        weight_decay=params['weight_decay'])

    # Set process name (uses the model name before the suffixes below)
    setproctitle(
        'tf_libri_' + model.name + '_' + params['train_data_size'] + '_' + params['label_type'])

    # Encode the main hyperparameters into the model name
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    if params['dropout'] != 0:
        model.name += '_drop' + str(params['dropout'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += '_wd' + str(params['weight_decay'])
    if len(gpu_indices) >= 2:
        model.name += '_gpu' + str(len(gpu_indices))

    # Set save path
    model.save_path = mkdir_join(
        model_save_path, 'student_ctc', params['label_type'],
        params['train_data_size'], model.name)

    # Reset model directory
    # A directory that already holds `complete.txt` (finished run) or
    # `config.yml` (run started earlier) is taken: append an increasing
    # index until an unused directory name is found.
    model_index = 0
    new_model_path = model.save_path
    while (isfile(join(new_model_path, 'complete.txt')) or
           isfile(join(new_model_path, 'config.yml'))):
        model_index += 1
        new_model_path = model.save_path + '_' + str(model_index)
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    # Redirect stdout to the training log while training runs; the log is
    # closed and stdout restored even if training raises.
    # TODO(hirofumi): change to logger
    stdout_orig = sys.stdout
    with open(join(model.save_path, 'train.log'), 'w') as log_file:
        sys.stdout = log_file
        try:
            do_train(model=model, params=params, gpu_indices=gpu_indices)
        finally:
            sys.stdout = stdout_orig

예제 #24

0

파일 보기

def plot(model,
         dataset,
         beam_width,
         beam_width_sub,
         eval_batch_size=None,
         save_path=None):
    """Plot the attention weights of a hierarchical attention-based model.

    For every utterance, the word-level and character-level attention over
    the input frames and the word-to-character attention are drawn, and
    the reference `batch['ys'][b][0]` is written to a text file next to
    the figures (grouped per speaker).
    Args:
        model: model to evaluate
        dataset: An instance of a `Dataset` class
        beam_width: (int): the size of beam in the main task
        beam_width_sub: (int): the size of beam in the sub task
        eval_batch_size (int, optional): the batch size when evaluating the
            model (not referenced in this function)
        save_path (string, optional): path to save attention weights plotting
    """
    # Start from an empty output directory when one already exists
    if save_path is not None:
        if isdir(save_path):
            shutil.rmtree(save_path)
            mkdir(save_path)

    to_words = Idx2word(dataset.vocab_file_path, return_list=True)
    to_chars = Idx2char(dataset.vocab_file_path_sub, return_list=True)

    for batch, is_new_epoch in dataset:
        xs = batch['xs']
        x_lens = batch['x_lens']
        input_names = batch['input_names']

        # Decode both tasks and fetch the three kinds of attention weights
        decoded = model.attention_weights(
            xs,
            x_lens,
            beam_width=beam_width,
            beam_width_sub=beam_width_sub,
            max_decode_len=MAX_DECODE_LEN_WORD,
            max_decode_len_sub=MAX_DECODE_LEN_CHAR)
        hyps_word, hyps_char, att_word, att_char, att_word2char = decoded

        for utt in range(len(xs)):
            words = to_words(hyps_word[utt])
            chars = to_chars(hyps_char[utt])
            num_words = len(words)
            num_chars = len(chars)

            utt_name = input_names[utt]
            # The speaker ID is the first two '_'-separated fields
            speaker = '_'.join(utt_name.split('_')[:2])

            # word to acoustic & character to acoustic
            plot_hierarchical_attention_weights(
                att_word[utt][:num_words, :x_lens[utt]],
                att_char[utt][:num_chars, :x_lens[utt]],
                label_list=words,
                label_list_sub=chars,
                spectrogram=xs[utt, :, :dataset.input_freq],
                save_path=mkdir_join(save_path, speaker, utt_name + '.png'),
                figsize=(50, 10))

            # word to character attention
            plot_word2char_attention_weights(
                att_word2char[utt][:num_words, :num_chars],
                label_list=words,
                label_list_sub=chars,
                save_path=mkdir_join(save_path, speaker,
                                     utt_name + '_word2char.png'),
                figsize=(50, 10))

            # Keep the reference next to the figures
            ref_path = join(save_path, speaker, utt_name + '.txt')
            with open(ref_path, 'w') as f:
                f.write(batch['ys'][utt][0])

        if is_new_epoch:
            break

예제 #25

0

파일 보기

def main(config_path, model_save_path):
    """Build an attention-based model from a config file and train it.

    The model name is assembled from the hyperparameters, a fresh model
    directory is created under `model_save_path`, the config file is
    copied into it and stdout is redirected to 'train.log' in that
    directory while `do_train` runs.
    Args:
        config_path (string): path to a config file (.yml) whose 'param'
            section holds the hyperparameters
        model_save_path (string): root directory under which the directory
            of this model is created
    Raises:
        TypeError: if params['label_type'] is not a supported label type
    """
    # Load a config file (.yml)
    # NOTE: yaml.load() without an explicit Loader is rejected by recent
    # PyYAML releases and can construct arbitrary objects; a plain config
    # only needs safe_load()
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    params = config['param']

    # Except for a <SOS> and <EOS> class
    num_classes_map = {
        'phone61': 61,
        'phone48': 48,
        'phone39': 39,
        'character': 28,
        'character_capital_divide': 72,
    }
    label_type = params['label_type']
    if label_type not in num_classes_map:
        raise TypeError('Unsupported label_type: ' + str(label_type))
    params['num_classes'] = num_classes_map[label_type]

    # Model setting
    model = AttentionSeq2Seq(
        input_size=params['input_size'] * params['num_stack'],
        encoder_type=params['encoder_type'],
        encoder_num_units=params['encoder_num_units'],
        encoder_num_layers=params['encoder_num_layers'],
        encoder_num_proj=params['encoder_num_proj'],
        attention_type=params['attention_type'],
        attention_dim=params['attention_dim'],
        decoder_type=params['decoder_type'],
        decoder_num_units=params['decoder_num_units'],
        decoder_num_layers=params['decoder_num_layers'],
        embedding_dim=params['embedding_dim'],
        num_classes=params['num_classes'],
        sos_index=params['num_classes'],
        eos_index=params['num_classes'] + 1,
        max_decode_length=params['max_decode_length'],
        lstm_impl='LSTMBlockCell',
        use_peephole=params['use_peephole'],
        parameter_init=params['weight_init'],
        clip_grad_norm=params['clip_grad_norm'],
        clip_activation_encoder=params['clip_activation_encoder'],
        clip_activation_decoder=params['clip_activation_decoder'],
        weight_decay=params['weight_decay'],
        time_major=True,
        sharpening_factor=params['sharpening_factor'],
        logits_temperature=params['logits_temperature'],
        sigmoid_smoothing=params['sigmoid_smoothing'])

    # Set process name (uses the name given by the model class, before it
    # is replaced by the hyperparameter-based name below)
    setproctitle('tf_timit_' + model.name + '_' + params['label_type'] + '_' +
                 params['attention_type'])

    model.name = 'en' + str(params['encoder_num_units'])
    model.name += '_' + str(params['encoder_num_layers'])
    model.name += '_att' + str(params['attention_dim'])
    model.name += '_de' + str(params['decoder_num_units'])
    model.name += '_' + str(params['decoder_num_layers'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    model.name += '_' + params['attention_type']
    if params['dropout_encoder'] != 0:
        model.name += '_dropen' + str(params['dropout_encoder'])
    if params['dropout_decoder'] != 0:
        model.name += '_dropde' + str(params['dropout_decoder'])
    if params['dropout_embedding'] != 0:
        model.name += '_dropem' + str(params['dropout_embedding'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        # Separated by '_' like every other suffix
        model.name += '_wd' + str(params['weight_decay'])
    if params['sharpening_factor'] != 1:
        model.name += '_sharp' + str(params['sharpening_factor'])
    if params['logits_temperature'] != 1:
        model.name += '_temp' + str(params['logits_temperature'])
    if bool(params['sigmoid_smoothing']):
        model.name += '_smoothing'

    # Set save path
    model.save_path = mkdir_join(model_save_path, 'attention',
                                 params['label_type'], model.name)

    # Reset model directory
    # A directory holding 'complete.txt' (training finished) or
    # 'config.yml' (training started but not finished) is already taken:
    # append '_1', '_2', ... until an unused directory is found
    model_index = 0
    new_model_path = model.save_path
    while (isfile(join(new_model_path, 'complete.txt')) or
           isfile(join(new_model_path, 'config.yml'))):
        model_index += 1
        new_model_path = model.save_path + '_' + str(model_index)
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    # TODO(hirofumi): change to logger
    # Redirect stdout to the training log only while training runs, and
    # always close the log file and restore stdout afterwards
    original_stdout = sys.stdout
    with open(join(model.save_path, 'train.log'), 'w') as log_file:
        sys.stdout = log_file
        try:
            do_train(model=model, params=params)
        finally:
            sys.stdout = original_stdout

예제 #26

0

파일 보기

파일: train_multitask_ctc.py 프로젝트: seasky100/tensorflow_end2end_speech_recognition

def main(config_path, model_save_path):
    """Build a multitask CTC model from a config file and train it.

    The model name is assembled from the hyperparameters, a fresh model
    directory is created under `model_save_path`, the config file is
    copied into it and stdout is redirected to 'train.log' in that
    directory while `do_train` runs.
    Args:
        config_path (string): path to a config file (.yml) whose 'param'
            section holds the hyperparameters
        model_save_path (string): root directory under which the directory
            of this model is created
    Raises:
        TypeError: if params['label_type_main'] or params['label_type_sub']
            is not a supported label type
    """
    # Load a config file (.yml)
    # NOTE: yaml.load() without an explicit Loader is rejected by recent
    # PyYAML releases and can construct arbitrary objects; a plain config
    # only needs safe_load()
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    params = config['param']

    # Except for a blank class
    num_classes_main_map = {
        'character': 28,
        'character_capital_divide': 72,
    }
    num_classes_sub_map = {
        'phone61': 61,
        'phone48': 48,
        'phone39': 39,
    }
    label_type_main = params['label_type_main']
    if label_type_main not in num_classes_main_map:
        raise TypeError(
            'Unsupported label_type_main: ' + str(label_type_main))
    params['num_classes_main'] = num_classes_main_map[label_type_main]

    label_type_sub = params['label_type_sub']
    if label_type_sub not in num_classes_sub_map:
        raise TypeError(
            'Unsupported label_type_sub: ' + str(label_type_sub))
    params['num_classes_sub'] = num_classes_sub_map[label_type_sub]

    # Model setting
    model = MultitaskCTC(encoder_type=params['encoder_type'],
                         input_size=params['input_size'],
                         splice=params['splice'],
                         num_stack=params['num_stack'],
                         num_units=params['num_units'],
                         num_layers_main=params['num_layers_main'],
                         num_layers_sub=params['num_layers_sub'],
                         num_classes_main=params['num_classes_main'],
                         num_classes_sub=params['num_classes_sub'],
                         main_task_weight=params['main_task_weight'],
                         lstm_impl=params['lstm_impl'],
                         use_peephole=params['use_peephole'],
                         parameter_init=params['weight_init'],
                         clip_grad_norm=params['clip_grad_norm'],
                         clip_activation=params['clip_activation'],
                         num_proj=params['num_proj'],
                         weight_decay=params['weight_decay'])

    # Set process name (before the hyperparameter suffixes are appended)
    setproctitle('tf_timit_' + model.name + '_' +
                 params['label_type_main'] + '_' + params['label_type_sub'])

    model.name += '_' + str(params['num_units'])
    model.name += '_main' + str(params['num_layers_main'])
    model.name += '_sub' + str(params['num_layers_sub'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    if params['num_proj'] != 0:
        model.name += '_proj' + str(params['num_proj'])
    if params['dropout'] != 0:
        model.name += '_drop' + str(params['dropout'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += '_wd' + str(params['weight_decay'])
    model.name += '_main' + str(params['main_task_weight'])

    # Set save path
    model.save_path = mkdir_join(
        model_save_path, 'ctc', 'char_' + params['label_type_sub'], model.name)

    # Reset model directory
    # A directory holding 'complete.txt' (training finished) or
    # 'config.yml' (training started but not finished) is already taken:
    # append '_1', '_2', ... until an unused directory is found
    model_index = 0
    new_model_path = model.save_path
    while (isfile(join(new_model_path, 'complete.txt')) or
           isfile(join(new_model_path, 'config.yml'))):
        model_index += 1
        new_model_path = model.save_path + '_' + str(model_index)
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    # TODO(hirofumi): change to logger
    # Redirect stdout to the training log only while training runs, and
    # always close the log file and restore stdout afterwards
    original_stdout = sys.stdout
    with open(join(model.save_path, 'train.log'), 'w') as log_file:
        sys.stdout = log_file
        try:
            do_train(model=model, params=params)
        finally:
            sys.stdout = original_stdout

예제 #27

0

파일 보기

파일: plot_ctc_probs.py 프로젝트: carolinebear/pytorch_end2end_speech_recognition

def main():
    """Plot the CTC posteriors of a saved model on the test set.

    The model described by 'config.yml' in `args.model_path` is restored
    from the checkpoint of `args.epoch`, and one figure per utterance is
    written to the 'ctc_probs' directory inside `args.model_path`.
    """
    args = parser.parse_args()

    # Load a config file (.yml)
    config_file = join(args.model_path, 'config.yml')
    params = load_config(config_file, is_eval=True)

    # Load dataset
    dataset = Dataset(
        data_save_path=args.data_save_path,
        backend=params['backend'],
        input_freq=params['input_freq'],
        use_delta=params['use_delta'],
        use_double_delta=params['use_double_delta'],
        data_type='test',
        label_type=params['label_type'],
        batch_size=args.eval_batch_size,
        splice=params['splice'],
        num_stack=params['num_stack'],
        num_skip=params['num_skip'],
        sort_utt=True,
        reverse=True,
        tool=params['tool'])

    # The number of classes is taken from the dataset
    params['num_classes'] = dataset.num_classes

    # Load model
    model = load(
        model_type=params['model_type'],
        params=params,
        backend=params['backend'])

    # Restore the saved parameters
    model.load_checkpoint(save_path=args.model_path, epoch=args.epoch)

    # GPU setting
    model.set_cuda(deterministic=False, benchmark=True)

    plot_dir = mkdir_join(args.model_path, 'ctc_probs')

    ######################################################################

    # Start from an empty output directory when one already exists
    if plot_dir is not None and isdir(plot_dir):
        shutil.rmtree(plot_dir)
        mkdir(plot_dir)

    for batch, is_new_epoch in dataset:
        xs = batch['xs']

        # Get CTC probs
        # NOTE: posteriors: '[B, T, num_classes]'
        posteriors, frame_lens, _ = model.posteriors(
            xs, batch['x_lens'], temperature=1)

        # Visualize each utterance of the mini-batch
        for utt in range(len(xs)):
            figure_path = join(plot_dir, batch['input_names'][utt] + '.png')
            plot_ctc_probs(
                posteriors[utt, :frame_lens[utt], :],
                frame_num=frame_lens[utt],
                num_stack=dataset.num_stack,
                spectrogram=xs[utt, :, :40],
                save_path=figure_path,
                figsize=(14, 7))

        if is_new_epoch:
            break

예제 #28

0

파일 보기

파일: train_ctc_multigpu.py 프로젝트: sky1170447398/tensorflow_end2end_speech_recognition

def main(config_path, model_save_path, gpu_indices):
    """Build a CTC model from a config file and train it on the given GPUs.

    The model name is assembled from the hyperparameters, a fresh model
    directory is created under `model_save_path`, the config file is
    copied into it and stdout is redirected to 'train.log' in that
    directory while `do_train` runs.
    Args:
        config_path (string): path to a config file (.yml) whose 'param'
            section holds the hyperparameters
        model_save_path (string): root directory under which the directory
            of this model is created
        gpu_indices (list): indices of the GPUs; forwarded to `do_train`,
            and their number is appended to the model name when >= 2
    """
    # Load a config file (.yml)
    # NOTE: yaml.load() without an explicit Loader is rejected by recent
    # PyYAML releases and can construct arbitrary objects; a plain config
    # only needs safe_load()
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    params = config['param']

    # Except for a blank class
    # NOTE(review): no value is assigned for any other label type or
    # training-set name, so params['num_classes'] must then already be
    # present in the config -- confirm whether that is intended
    if params['label_type'] == 'character':
        params['num_classes'] = 28
    elif params['label_type'] == 'character_capital_divide':
        params['num_classes'] = 77
    elif params['label_type'] == 'word':
        if params['train_data_size'] == 'train_clean100':
            params['num_classes'] = 7213
        elif params['train_data_size'] == 'train_clean360':
            params['num_classes'] = 16287
        elif params['train_data_size'] == 'train_other500':
            params['num_classes'] = 18669
        elif params['train_data_size'] == 'train_all':
            params['num_classes'] = 26642

    # Model setting
    model = CTC(encoder_type=params['encoder_type'],
                input_size=params['input_size'] * params['num_stack'],
                splice=params['splice'],
                num_units=params['num_units'],
                num_layers=params['num_layers'],
                num_classes=params['num_classes'],
                lstm_impl=params['lstm_impl'],
                use_peephole=params['use_peephole'],
                parameter_init=params['weight_init'],
                clip_grad=params['clip_grad'],
                clip_activation=params['clip_activation'],
                num_proj=params['num_proj'],
                weight_decay=params['weight_decay'])

    # Set process name (before the hyperparameter suffixes are appended)
    setproctitle('libri_' + model.name + '_' + params['train_data_size'] +
                 '_' + params['label_type'])

    model.name += '_' + str(params['num_units'])
    model.name += '_' + str(params['num_layers'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    if params['num_proj'] != 0:
        model.name += '_proj' + str(params['num_proj'])
    if params['dropout_hidden'] != 1:
        model.name += '_drop' + str(params['dropout_hidden'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += '_wd' + str(params['weight_decay'])
    if params['bottleneck_dim'] != 0:
        model.name += '_bottle' + str(params['bottleneck_dim'])
    if len(gpu_indices) >= 2:
        model.name += '_gpu' + str(len(gpu_indices))

    # Set save path
    model.save_path = mkdir_join(model_save_path, 'ctc', params['label_type'],
                                 params['train_data_size'], model.name)

    # Reset model directory
    # A directory holding 'complete.txt' (training finished) or
    # 'config.yml' (training started but not finished) is already taken:
    # append '_1', '_2', ... until an unused directory is found
    model_index = 0
    new_model_path = model.save_path
    while (isfile(join(new_model_path, 'complete.txt')) or
           isfile(join(new_model_path, 'config.yml'))):
        model_index += 1
        new_model_path = model.save_path + '_' + str(model_index)
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    # TODO(hirofumi): change to logger
    # Redirect stdout to the training log only while training runs, and
    # always close the log file and restore stdout afterwards
    original_stdout = sys.stdout
    with open(join(model.save_path, 'train.log'), 'w') as log_file:
        sys.stdout = log_file
        try:
            do_train(model=model, params=params, gpu_indices=gpu_indices)
        finally:
            sys.stdout = original_stdout

예제 #29

0

파일 보기

파일: train_attention.py 프로젝트: seasky100/tensorflow_end2end_speech_recognition

def main(config_path, model_save_path, gpu_indices):
    """Build an attention-based model from a config file and train it.

    The model name is assembled from the hyperparameters, a fresh model
    directory is created under `model_save_path`, the config file is
    copied into it and stdout is redirected to 'train.log' in that
    directory while `do_train` runs.
    Args:
        config_path (string): path to a config file (.yml) whose 'param'
            section holds the hyperparameters
        model_save_path (string): root directory under which the directory
            of this model is created
        gpu_indices (list): indices of the GPUs; forwarded to `do_train`,
            and their number is appended to the model name when >= 2
    Raises:
        TypeError: if params['label_type'] is not a supported label type
    """
    # Load a config file (.yml)
    # NOTE: yaml.load() without an explicit Loader is rejected by recent
    # PyYAML releases and can construct arbitrary objects; a plain config
    # only needs safe_load()
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    params = config['param']

    # Except for a <SOS> and <EOS> class
    # NOTE(review): for 'kanji' / 'kanji_divide' with any other
    # train_data_size no value is assigned, so params['num_classes'] must
    # then already be present in the config -- confirm
    if params['label_type'] == 'kana':
        params['num_classes'] = 146
    elif params['label_type'] == 'kana_divide':
        params['num_classes'] = 147
    elif params['label_type'] == 'kanji':
        if params['train_data_size'] == 'train_subset':
            params['num_classes'] = 2981
        elif params['train_data_size'] == 'train_fullset':
            params['num_classes'] = 3385
    elif params['label_type'] == 'kanji_divide':
        if params['train_data_size'] == 'train_subset':
            params['num_classes'] = 2982
        elif params['train_data_size'] == 'train_fullset':
            params['num_classes'] = 3386
    else:
        raise TypeError(
            'Unsupported label_type: ' + str(params['label_type']))

    # Model setting
    model = AttentionSeq2Seq(
        input_size=params['input_size'] * params['num_stack'],
        encoder_type=params['encoder_type'],
        encoder_num_units=params['encoder_num_units'],
        encoder_num_layers=params['encoder_num_layers'],
        encoder_num_proj=params['encoder_num_proj'],
        attention_type=params['attention_type'],
        attention_dim=params['attention_dim'],
        decoder_type=params['decoder_type'],
        decoder_num_units=params['decoder_num_units'],
        decoder_num_layers=params['decoder_num_layers'],
        embedding_dim=params['embedding_dim'],
        num_classes=params['num_classes'],
        sos_index=params['num_classes'],
        eos_index=params['num_classes'] + 1,
        max_decode_length=params['max_decode_length'],
        lstm_impl='LSTMBlockCell',
        use_peephole=params['use_peephole'],
        parameter_init=params['weight_init'],
        clip_grad_norm=params['clip_grad_norm'],
        clip_activation_encoder=params['clip_activation_encoder'],
        clip_activation_decoder=params['clip_activation_decoder'],
        weight_decay=params['weight_decay'],
        time_major=True,
        sharpening_factor=params['sharpening_factor'],
        logits_temperature=params['logits_temperature'],
        sigmoid_smoothing=params['sigmoid_smoothing'])

    # Set process name (uses the name given by the model class, before it
    # is replaced by the hyperparameter-based name below)
    setproctitle('tf_csj_' + model.name + '_' +
                 params['train_data_size'] + '_' + params['label_type'] + '_' +
                 params['attention_type'])

    model.name = 'en' + str(params['encoder_num_units'])
    model.name += '_' + str(params['encoder_num_layers'])
    model.name += '_att' + str(params['attention_dim'])
    model.name += '_de' + str(params['decoder_num_units'])
    model.name += '_' + str(params['decoder_num_layers'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    model.name += '_' + params['attention_type']
    if params['dropout_encoder'] != 0:
        model.name += '_dropen' + str(params['dropout_encoder'])
    if params['dropout_decoder'] != 0:
        model.name += '_dropde' + str(params['dropout_decoder'])
    if params['dropout_embedding'] != 0:
        model.name += '_dropem' + str(params['dropout_embedding'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        # Separated by '_' like every other suffix
        model.name += '_wd' + str(params['weight_decay'])
    if params['sharpening_factor'] != 1:
        model.name += '_sharp' + str(params['sharpening_factor'])
    if params['logits_temperature'] != 1:
        model.name += '_temp' + str(params['logits_temperature'])
    if bool(params['sigmoid_smoothing']):
        model.name += '_smoothing'
    if len(gpu_indices) >= 2:
        model.name += '_gpu' + str(len(gpu_indices))

    # Set save path
    model.save_path = mkdir_join(
        model_save_path, 'attention', params['label_type'],
        params['train_data_size'], model.name)

    # Reset model directory
    # A directory holding 'complete.txt' (training finished) or
    # 'config.yml' (training started but not finished) is already taken:
    # append '_1', '_2', ... until an unused directory is found
    model_index = 0
    new_model_path = model.save_path
    while (isfile(join(new_model_path, 'complete.txt')) or
           isfile(join(new_model_path, 'config.yml'))):
        model_index += 1
        new_model_path = model.save_path + '_' + str(model_index)
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    # TODO(hirofumi): change to logger
    # Redirect stdout to the training log only while training runs, and
    # always close the log file and restore stdout afterwards
    original_stdout = sys.stdout
    with open(join(model.save_path, 'train.log'), 'w') as log_file:
        sys.stdout = log_file
        try:
            do_train(model=model, params=params, gpu_indices=gpu_indices)
        finally:
            sys.stdout = original_stdout

예제 #30

0

파일 보기

파일: plot_attention_weights.py 프로젝트: seasky100/tensorflow_end2end_speech_recognition

def plot(session, decode_op, attention_weights_op, model, dataset,
         label_type, is_test=False, save_path=None, show=False):
    """Visualize attention weights of Attention-based model.
    Args:
        session: session of training model
        decode_op: operation for decoding
        attention_weights_op: operation for computing attention weights
        model: model to evaluate
        dataset: An instance of a `Dataset` class
        label_type (string): phone39 or phone48 or phone61 or character or
            character_capital_divide
        is_test (bool, optional): not referenced in this function
        save_path (string, optional): path to save attention weights plotting
        show (bool, optional): if True, show each figure
    Raises:
        NotImplementedError: if `label_type` is not a phone label type
    """
    # Clean directory
    if save_path is not None and isdir(save_path):
        shutil.rmtree(save_path)
        mkdir(save_path)

    if label_type == 'character':
        map_fn = Idx2char(
            map_file_path='../metrics/mapping_files/character.txt')
    elif label_type == 'character_capital_divide':
        map_fn = Idx2char(
            map_file_path='../metrics/mapping_files/character_capital_divide.txt',
            capital_divide=True)
    else:
        map_fn = Idx2phone(
            map_file_path='../metrics/mapping_files/' + label_type + '.txt')

    for data, is_new_epoch in dataset:

        # Create feed dictionary for next mini batch
        inputs, _, inputs_seq_len, _, input_names = data

        # Dropout is disabled (keep probability 1.0) for evaluation
        feed_dict = {
            model.inputs_pl_list[0]: inputs[0],
            model.inputs_seq_len_pl_list[0]: inputs_seq_len[0],
            model.keep_prob_encoder_pl_list[0]: 1.0,
            model.keep_prob_decoder_pl_list[0]: 1.0,
            model.keep_prob_embedding_pl_list[0]: 1.0
        }

        # Visualize
        batch_size = inputs.shape[0]
        attention_weights, labels_pred = session.run(
            [attention_weights_op, decode_op], feed_dict=feed_dict)

        for i_batch in range(batch_size):

            # Convert from index to label
            str_pred = map_fn(labels_pred[i_batch])
            if 'phone' in label_type:
                label_list = str_pred.split(' ')
            else:
                raise NotImplementedError

            # One figure per utterance; it is closed at the end of the
            # iteration so that figures do not pile up over the dataset
            fig = plt.figure(figsize=(10, 4))
            sns.heatmap(attention_weights[i_batch],
                        cmap='Blues',
                        xticklabels=False,
                        yticklabels=label_list)

            plt.xlabel('Input frames', fontsize=12)
            plt.ylabel('Output labels (top to bottom)', fontsize=12)

            if show:
                plt.show()

            # Save as a png file
            # NOTE(review): the file name always uses input_names[0], so
            # figures of the same mini-batch share one path -- confirm
            # whether input_names[i_batch] was intended
            if save_path is not None:
                # `dpi` is the resolution keyword of savefig
                plt.savefig(join(save_path, input_names[0] + '.png'), dpi=500)

            plt.close(fig)

        if is_new_epoch:
            break

예제 #31

0

파일 보기

def main(config_path, gpu_indices):
    """Build a CTC model from a config file and train it on the given GPUs.

    The model directory is created under a fixed result directory, the
    config file is copied into it and stdout is redirected to 'train.log'
    in that directory while `do_train` runs.
    Args:
        config_path (string): path to a config file (.yml) with the
            sections 'corpus', 'feature' and 'param' and the key
            'model_name'
        gpu_indices (list): indices of the GPUs; forwarded to `do_train`,
            and their number is appended to the model name
    Raises:
        ValueError: if corpus['label_type'] is not supported, or if
            'complete.txt' already exists in the model directory
    """
    # Load a config file (.yml)
    # NOTE: yaml.load() without an explicit Loader is rejected by recent
    # PyYAML releases and can construct arbitrary objects; a plain config
    # only needs safe_load()
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
    corpus = config['corpus']
    feature = config['feature']
    param = config['param']

    output_size_map = {
        'phone61': 61,
        'phone48': 48,
        'phone39': 39,
        'character': 30,
    }
    # Fail with a clear message instead of an unbound `output_size` below
    if corpus['label_type'] not in output_size_map:
        raise ValueError(
            'Unsupported label_type: ' + str(corpus['label_type']))
    output_size = output_size_map[corpus['label_type']]

    # Model setting
    CTCModel = load(model_type=config['model_name'])
    network = CTCModel(batch_size=param['batch_size'],
                       input_size=feature['input_size'] * feature['num_stack'],
                       num_unit=param['num_unit'],
                       num_layer=param['num_layer'],
                       output_size=output_size,
                       parameter_init=param['weight_init'],
                       clip_grad=param['clip_grad'],
                       clip_activation=param['clip_activation'],
                       dropout_ratio_input=param['dropout_input'],
                       dropout_ratio_hidden=param['dropout_hidden'],
                       num_proj=param['num_proj'],
                       weight_decay=param['weight_decay'])

    network.model_name = config['model_name'].upper()
    network.model_name += '_' + str(param['num_unit'])
    network.model_name += '_' + str(param['num_layer'])
    network.model_name += '_' + param['optimizer']
    network.model_name += '_lr' + str(param['learning_rate'])
    if param['num_proj'] != 0:
        network.model_name += '_proj' + str(param['num_proj'])
    if feature['num_stack'] != 1:
        network.model_name += '_stack' + str(feature['num_stack'])
    if param['weight_decay'] != 0:
        network.model_name += '_weightdecay' + str(param['weight_decay'])
    network.model_name += '_' + str(len(gpu_indices)) + 'gpu'

    # Set save path
    network.model_dir = mkdir('/n/sd8/inaguma/result/timit/ctc/')
    network.model_dir = mkdir_join(network.model_dir, corpus['label_type'])
    network.model_dir = mkdir_join(network.model_dir, network.model_name)

    # Reset model directory
    # A directory containing 'complete.txt' is never overwritten; anything
    # else in the model directory is deleted
    if isfile(join(network.model_dir, 'complete.txt')):
        raise ValueError('File exists.')
    tf.gfile.DeleteRecursively(network.model_dir)
    tf.gfile.MakeDirs(network.model_dir)

    # Set process name
    setproctitle('multigpu_ctc_timit_' + corpus['label_type'])

    # Save config file
    shutil.copyfile(config_path, join(network.model_dir, 'config.yml'))

    # Redirect stdout to the training log while training runs; the log
    # file is always closed and stdout restored, even if training fails
    with open(join(network.model_dir, 'train.log'), 'w') as log_file:
        sys.stdout = log_file
        try:
            print(network.model_name)
            do_train(network=network,
                     optimizer=param['optimizer'],
                     learning_rate=param['learning_rate'],
                     batch_size=param['batch_size'],
                     epoch_num=param['num_epoch'],
                     label_type=corpus['label_type'],
                     num_stack=feature['num_stack'],
                     num_skip=feature['num_skip'],
                     gpu_indices=gpu_indices)
        finally:
            sys.stdout = sys.__stdout__

예제 #32

0

파일 보기

파일: plot_nested_attention_weights.py 프로젝트: carolinebear/pytorch_end2end_speech_recognition

def main():
    """Decode the evaluation set once and plot the attention weights.

    Steps, in order:
        1. Parse the command-line arguments and load ``config.yml`` from
           ``args.model_path``.
        2. Build the 'eval1' dataset and load the model described by the
           config, restoring the checkpoint of ``args.epoch``.
        3. Decode every batch and, per utterance, save two figures under
           ``<model_path>/att_weights/<speaker>/``:
           ``<input_name>.png`` (main and sub attention over the input
           frames) and ``<input_name>_word2char.png`` (``aw_dec``).

    ``<speaker>`` is the part of the input name before the first ``'_'``.
    Any existing ``att_weights`` directory is removed and recreated first.
    """
    args = parser.parse_args()
    model_dir = args.model_path

    # Load a config file (.yml)
    params = load_config(join(model_dir, 'config.yml'), is_eval=True)

    # Load dataset
    dataset_config = {
        'data_save_path': args.data_save_path,
        'backend': params['backend'],
        'input_freq': params['input_freq'],
        'use_delta': params['use_delta'],
        'use_double_delta': params['use_double_delta'],
        # Other evaluation sets that can be selected here: 'eval2', 'eval3'
        'data_type': 'eval1',
        'data_size': params['data_size'],
        'label_type': params['label_type'],
        'label_type_sub': params['label_type_sub'],
        'batch_size': args.eval_batch_size,
        'splice': params['splice'],
        'num_stack': params['num_stack'],
        'num_skip': params['num_skip'],
        'sort_utt': False,
        'reverse': False,
        'tool': params['tool'],
    }
    dataset = Dataset(**dataset_config)

    # The class counts are taken from the dataset, not from the config file
    params['num_classes'] = dataset.num_classes
    params['num_classes_sub'] = dataset.num_classes_sub

    # Load model
    model = load(model_type=params['model_type'],
                 params=params,
                 backend=params['backend'])

    # Restore the saved parameters
    model.load_checkpoint(save_path=model_dir, epoch=args.epoch)

    # GPU setting
    model.set_cuda(deterministic=False, benchmark=True)

    # Passed to the decoder as `teacher_forcing`; when enabled, the reference
    # sub-label sequences are handed to `model.decode` as well.
    a2c_oracle = False

    plot_root = mkdir_join(model_dir, 'att_weights')

    ######################################################################

    # Start from an empty output directory
    if plot_root is not None and isdir(plot_root):
        shutil.rmtree(plot_root)
        mkdir(plot_root)

    for batch, is_new_epoch in dataset:
        batch_size = len(batch['xs'])

        if not a2c_oracle:
            ys_sub = None
            y_lens_sub = None
        elif dataset.is_test:
            # Build a -1 padded index matrix from the raw transcripts
            longest = max(
                [0] + [len(list(batch['ys_sub'][i][0]))
                       for i in range(batch_size)])
            ys_sub = np.full((batch_size, longest), -1, dtype=np.int32)
            y_lens_sub = np.zeros((batch_size, ), dtype=np.int32)
            for i in range(batch_size):
                char_ids = dataset.char2idx(batch['ys_sub'][i][0])
                ys_sub[i, :len(char_ids)] = char_ids
                y_lens_sub[i] = len(char_ids)
                # NOTE: transcript is separated by space('_')
        else:
            ys_sub = batch['ys_sub']
            y_lens_sub = batch['y_lens_sub']

        hyps, att, hyps_sub, att_sub, att_dec, _ = model.decode(
            batch['xs'],
            batch['x_lens'],
            beam_width=args.beam_width,
            max_decode_len=MAX_DECODE_LEN_WORD,
            min_decode_len=MIN_DECODE_LEN_WORD,
            beam_width_sub=args.beam_width_sub,
            max_decode_len_sub=MAX_DECODE_LEN_CHAR,
            min_decode_len_sub=MIN_DECODE_LEN_CHAR,
            length_penalty=args.length_penalty,
            coverage_penalty=args.coverage_penalty,
            teacher_forcing=a2c_oracle,
            ys_sub=ys_sub,
            y_lens_sub=y_lens_sub)

        for utt in range(batch_size):
            words = dataset.idx2word(hyps[utt], return_list=True)

            # The sub-level labels are words or characters depending on the
            # dataset's sub label type
            if dataset.label_type_sub == 'word':
                to_sub_labels = dataset.idx2word
            else:
                to_sub_labels = dataset.idx2char
            sub_labels = to_sub_labels(hyps_sub[utt], return_list=True)

            utt_name = batch['input_names'][utt]
            speaker = utt_name.split('_')[0]
            num_frames = batch['x_lens'][utt]

            # word to acoustic & character to acoustic
            # (weights are cropped to the decoded length and the input length)
            att_main = att[utt][:len(words), :num_frames]
            att_aux = att_sub[utt][:len(sub_labels), :num_frames]
            plot_hierarchical_attention_weights(
                att_main,
                att_aux,
                label_list=words,
                label_list_sub=sub_labels,
                spectrogram=batch['xs'][utt, :, :dataset.input_freq],
                save_path=mkdir_join(plot_root, speaker, utt_name + '.png'),
                figsize=(40, 8))

            # word to character
            att_nested = att_dec[utt][:len(words), :len(sub_labels)]
            plot_nested_attention_weights(
                att_nested,
                label_list=words,
                label_list_sub=sub_labels,
                save_path=mkdir_join(plot_root, speaker,
                                     utt_name + '_word2char.png'),
                figsize=(40, 8))

            # NOTE: the reference transcript (batch['ys'][utt][0]) is not
            # written out; only the two figures are saved per utterance.

        # Stop after a single pass over the dataset
        if is_new_epoch:
            break