Exemplos de CTC em Python, exemplos de models.ctc.ctc.CTC em Python

Exemplo n.º 1

0

Exibir arquivo

Arquivo: eval_ctc_temp.py Projeto: seasky100/tensorflow_end2end_speech_recognition

def main():

    args = parser.parse_args()

    # Load config file
    with open(os.path.join(args.model_path, 'config.yml'), "r") as f:
        config = yaml.load(f)
        params = config['param']

    # Except for a blank class
    if params['label_type'] == 'character':
        params['num_classes'] = 28

    # Model setting
    model = CTC(
        encoder_type=params['encoder_type'],
        input_size=params['input_size'] * params['num_stack'],
        splice=params['splice'],
        num_units=params['num_units'],
        num_layers=params['num_layers'],
        num_classes=params['num_classes'],
        lstm_impl=params['lstm_impl'],
        use_peephole=params['use_peephole'],
        parameter_init=params['weight_init'],
        clip_grad_norm=params['clip_grad_norm'],
        clip_activation=params['clip_activation'],
        num_proj=params['num_proj'],
        weight_decay=params['weight_decay'])

    model.save_path = args.model_path
    do_eval(model=model, params=params,
            epoch=args.epoch, beam_width=args.beam_width,
            eval_batch_size=args.eval_batch_size,
            temperature=args.temperature)

Exemplo n.º 2

0

Exibir arquivo

def main():

    args = parser.parse_args()

    # Load config file
    with open(os.path.join(args.model_path, 'config.yml'), "r") as f:
        config = yaml.load(f)
        params = config['param']

    # Except for a blank class
    if params['label_type'] == 'character':
        params['num_classes'] = 28

    # Model setting
    model = CTC(encoder_type=params['encoder_type'],
                input_size=params['input_size'] * params['num_stack'],
                splice=params['splice'],
                num_units=params['num_units'],
                num_layers=params['num_layers'],
                num_classes=params['num_classes'],
                lstm_impl=params['lstm_impl'],
                use_peephole=params['use_peephole'],
                parameter_init=params['weight_init'],
                clip_grad_norm=params['clip_grad_norm'],
                clip_activation=params['clip_activation'],
                num_proj=params['num_proj'],
                weight_decay=params['weight_decay'])

    model.save_path = args.model_path
    do_eval(model=model,
            params=params,
            epoch=args.epoch,
            beam_width=args.beam_width,
            eval_batch_size=args.eval_batch_size,
            temperature=args.temperature)

Exemplo n.º 3

0

Exibir arquivo

def main():

    args = parser.parse_args()

    # Load config file
    with open(join(args.model_path, 'config.yml'), "r") as f:
        config = yaml.load(f)
        params = config['param']

    # Except for a blank class
    if params['label_type'] == 'character':
        params['num_classes'] = 28
    elif params['label_type'] == 'character_capital_divide':
        if params['train_data_size'] == 'train100h':
            params['num_classes'] = 72
        elif params['train_data_size'] == 'train460h':
            params['num_classes'] = 77
        elif params['train_data_size'] == 'train960h':
            params['num_classes'] = 77
    elif params['label_type'] == 'word_freq10':
        if params['train_data_size'] == 'train100h':
            params['num_classes'] = 7213
        elif params['train_data_size'] == 'train460h':
            params['num_classes'] = 18641
        elif params['train_data_size'] == 'train960h':
            params['num_classes'] = 26642
    else:
        raise TypeError

    # Model setting
    model = CTC(encoder_type=params['encoder_type'],
                input_size=params['input_size'],
                splice=params['splice'],
                num_stack=params['num_stack'],
                num_units=params['num_units'],
                num_layers=params['num_layers'],
                num_classes=params['num_classes'],
                lstm_impl=params['lstm_impl'],
                use_peephole=params['use_peephole'],
                parameter_init=params['weight_init'],
                clip_grad_norm=params['clip_grad_norm'],
                clip_activation=params['clip_activation'],
                num_proj=params['num_proj'],
                weight_decay=params['weight_decay'])

    model.save_path = args.model_path
    do_eval(model=model,
            params=params,
            epoch=args.epoch,
            eval_batch_size=args.eval_batch_size,
            beam_width=args.beam_width)

Exemplo n.º 4

0

Exibir arquivo

Arquivo: decode_ctc.py Projeto: seasky100/tensorflow_end2end_speech_recognition

def main():

    args = parser.parse_args()

    # Load config file
    with open(join(args.model_path, 'config.yml'), "r") as f:
        config = yaml.load(f)
        params = config['param']

    # Except for a blank class
    if params['label_type'] == 'character':
        params['num_classes'] = 28
    elif params['label_type'] == 'character_capital_divide':
        if params['train_data_size'] == 'train100h':
            params['num_classes'] = 72
        elif params['train_data_size'] == 'train460h':
            params['num_classes'] = 77
        elif params['train_data_size'] == 'train960h':
            params['num_classes'] = 77
    elif params['label_type'] == 'word_freq10':
        if params['train_data_size'] == 'train100h':
            params['num_classes'] = 7213
        elif params['train_data_size'] == 'train460h':
            params['num_classes'] = 18641
        elif params['train_data_size'] == 'train960h':
            params['num_classes'] = 26642
    else:
        raise TypeError

    # Model setting
    model = CTC(encoder_type=params['encoder_type'],
                input_size=params['input_size']
                splice=params['splice'],
                num_stack=params['num_stack'],
                num_units=params['num_units'],
                num_layers=params['num_layers'],
                num_classes=params['num_classes'],
                lstm_impl=params['lstm_impl'],
                use_peephole=params['use_peephole'],
                parameter_init=params['weight_init'],
                clip_grad_norm=params['clip_grad_norm'],
                clip_activation=params['clip_activation'],
                num_proj=params['num_proj'],
                weight_decay=params['weight_decay'])

    model.save_path = args.model_path
    do_decode(model=model, params=params,
              epoch=args.epoch, beam_width=args.beam_width,
              eval_batch_size=args.eval_batch_size)

Exemplo n.º 5

0

Exibir arquivo

def main():

    args = parser.parse_args()

    # Load config file
    with open(join(args.model_path, 'config.yml'), "r") as f:
        config = yaml.load(f)
        params = config['param']

    # Except for a blank label
    if params['label_type'] == 'kana':
        params['num_classes'] = 146
    elif params['label_type'] == 'kana_divide':
        params['num_classes'] = 147
    elif params['label_type'] == 'kanji':
        if params['train_data_size'] == 'train_subset':
            params['num_classes'] = 2981
        elif params['train_data_size'] == 'train_fullset':
            params['num_classes'] = 3385
    elif params['label_type'] == 'kanji_divide':
        if params['train_data_size'] == 'train_subset':
            params['num_classes'] = 2982
        elif params['train_data_size'] == 'train_fullset':
            params['num_classes'] = 3386
    else:
        raise TypeError

    # Modle setting
    model = CTC(encoder_type=params['encoder_type'],
                input_size=params['input_size'],
                splice=params['splice'],
                num_stack=params['num_stack'],
                num_units=params['num_units'],
                num_layers=params['num_layers'],
                num_classes=params['num_classes'],
                lstm_impl=params['lstm_impl'],
                use_peephole=params['use_peephole'],
                parameter_init=params['weight_init'],
                clip_grad_norm=params['clip_grad_norm'],
                clip_activation=params['clip_activation'],
                num_proj=params['num_proj'],
                weight_decay=params['weight_decay'])

    model.save_path = args.model_path
    do_decode(model=model,
              params=params,
              epoch=args.epoch,
              beam_width=args.beam_width,
              eval_batch_size=args.eval_batch_size)

Exemplo n.º 6

0

Exibir arquivo

Arquivo: decode_ctc.py Projeto: seasky100/tensorflow_end2end_speech_recognition

def main():

    args = parser.parse_args()

    # Load config file
    with open(join(args.model_path, 'config.yml'), "r") as f:
        config = yaml.load(f)
        params = config['param']

    # Except for a blank label
    if params['label_type'] == 'kana':
        params['num_classes'] = 146
    elif params['label_type'] == 'kana_divide':
        params['num_classes'] = 147
    elif params['label_type'] == 'kanji':
        if params['train_data_size'] == 'train_subset':
            params['num_classes'] = 2981
        elif params['train_data_size'] == 'train_fullset':
            params['num_classes'] = 3385
    elif params['label_type'] == 'kanji_divide':
        if params['train_data_size'] == 'train_subset':
            params['num_classes'] = 2982
        elif params['train_data_size'] == 'train_fullset':
            params['num_classes'] = 3386
    else:
        raise TypeError

    # Modle setting
    model = CTC(encoder_type=params['encoder_type'],
                input_size=params['input_size'],
                splice=params['splice'],
                num_stack=params['num_stack'],
                num_units=params['num_units'],
                num_layers=params['num_layers'],
                num_classes=params['num_classes'],
                lstm_impl=params['lstm_impl'],
                use_peephole=params['use_peephole'],
                parameter_init=params['weight_init'],
                clip_grad_norm=params['clip_grad_norm'],
                clip_activation=params['clip_activation'],
                num_proj=params['num_proj'],
                weight_decay=params['weight_decay'])

    model.save_path = args.model_path
    do_decode(model=model, params=params,
              epoch=args.epoch, beam_width=args.beam_width,
              eval_batch_size=args.eval_batch_size)

Exemplo n.º 7

0

Exibir arquivo

Arquivo: plot_ctc_prob.py Projeto: zz12375/tensorflow_end2end_speech_recognition

def main():

    args = parser.parse_args()

    # Load config file
    with open(join(args.model_path, 'config.yml'), "r") as f:
        config = yaml.load(f)
        params = config['param']

    # Except for a blank label
    if params['label_type'] == 'phone61':
        params['num_classes'] = 61
    elif params['label_type'] == 'phone48':
        params['num_classes'] = 48
    elif params['label_type'] == 'phone39':
        params['num_classes'] = 39
    elif params['label_type'] == 'character':
        params['num_classes'] = 28
    elif params['label_type'] == 'character_capital_divide':
        params['num_classes'] = 72
    else:
        raise ValueError

    # Model setting
    model = CTC(encoder_type=params['encoder_type'],
                input_size=params['input_size'],
                splice=params['splice'],
                num_stack=params['num_stack'],
                num_units=params['num_units'],
                num_layers=params['num_layers'],
                num_classes=params['num_classes'],
                lstm_impl=params['lstm_impl'],
                use_peephole=params['use_peephole'],
                parameter_init=params['weight_init'],
                clip_grad_norm=params['clip_grad_norm'],
                clip_activation=params['clip_activation'],
                num_proj=params['num_proj'],
                weight_decay=params['weight_decay'])

    model.save_path = args.model_path
    do_plot(model=model,
            params=params,
            epoch=args.epoch,
            eval_batch_size=args.eval_batch_size)

Exemplo n.º 8

0

Exibir arquivo

Arquivo: plot_ctc_prob.py Projeto: seasky100/tensorflow_end2end_speech_recognition

def main():

    args = parser.parse_args()

    # Load config file
    with open(join(args.model_path, 'config.yml'), "r") as f:
        config = yaml.load(f)
        params = config['param']

    # Except for a blank label
    if params['label_type'] == 'phone61':
        params['num_classes'] = 61
    elif params['label_type'] == 'phone48':
        params['num_classes'] = 48
    elif params['label_type'] == 'phone39':
        params['num_classes'] = 39
    elif params['label_type'] == 'character':
        params['num_classes'] = 28
    elif params['label_type'] == 'character_capital_divide':
        params['num_classes'] = 72
    else:
        raise ValueError

    # Model setting
    model = CTC(encoder_type=params['encoder_type'],
                input_size=params['input_size'],
                splice=params['splice'],
                num_stack=params['num_stack'],
                num_units=params['num_units'],
                num_layers=params['num_layers'],
                num_classes=params['num_classes'],
                lstm_impl=params['lstm_impl'],
                use_peephole=params['use_peephole'],
                parameter_init=params['weight_init'],
                clip_grad_norm=params['clip_grad_norm'],
                clip_activation=params['clip_activation'],
                num_proj=params['num_proj'],
                weight_decay=params['weight_decay'])

    model.save_path = args.model_path
    do_plot(model=model, params=params,
            epoch=args.epoch, eval_batch_size=args.eval_batch_size)

Exemplo n.º 9

0

Exibir arquivo

Arquivo: decode_ctc.py Projeto: zz12375/tensorflow_end2end_speech_recognition

def main():

    args = parser.parse_args()

    # Load config file
    with open(join(args.model_path, 'config.yml'), "r") as f:
        config = yaml.load(f)
        params = config['param']

    # Except for a blank label
    if params['ss_type'] == 'remove':
        params['num_classes'] = 147
    elif params['ss_type'] in ['insert_left', 'insert_right']:
        params['num_classes'] = 151
    elif params['ss_type'] == 'insert_both':
        params['num_classes'] = 155
    else:
        raise TypeError

    # Model setting
    model = CTC(encoder_type=params['encoder_type'],
                input_size=params['input_size'],
                splice=params['splice'],
                num_stack=params['num_stack'],
                num_units=params['num_units'],
                num_layers=params['num_layers'],
                num_classes=params['num_classes'],
                lstm_impl=params['lstm_impl'],
                use_peephole=params['use_peephole'],
                parameter_init=params['weight_init'],
                clip_grad_norm=params['clip_grad_norm'],
                clip_activation=params['clip_activation'],
                num_proj=params['num_proj'],
                weight_decay=params['weight_decay'])

    model.save_path = args.model_path
    do_decode(model=model,
              params=params,
              epoch=args.epoch,
              beam_width=args.beam_width,
              eval_batch_size=args.eval_batch_size)

Exemplo n.º 10

0

Exibir arquivo

Arquivo: decode_ctc.py Projeto: seasky100/tensorflow_end2end_speech_recognition

def main():

    args = parser.parse_args()

    # Load config file
    with open(join(args.model_path, 'config.yml'), "r") as f:
        config = yaml.load(f)
        params = config['param']

    # Except for a blank label
    if params['ss_type'] == 'remove':
        params['num_classes'] = 147
    elif params['ss_type'] in ['insert_left', 'insert_right']:
        params['num_classes'] = 151
    elif params['ss_type'] == 'insert_both':
        params['num_classes'] = 155
    else:
        raise TypeError

    # Model setting
    model = CTC(encoder_type=params['encoder_type'],
                input_size=params['input_size'],
                splice=params['splice'],
                num_stack=params['num_stack'],
                num_units=params['num_units'],
                num_layers=params['num_layers'],
                num_classes=params['num_classes'],
                lstm_impl=params['lstm_impl'],
                use_peephole=params['use_peephole'],
                parameter_init=params['weight_init'],
                clip_grad_norm=params['clip_grad_norm'],
                clip_activation=params['clip_activation'],
                num_proj=params['num_proj'],
                weight_decay=params['weight_decay'])

    model.save_path = args.model_path
    do_decode(model=model, params=params,
              epoch=args.epoch, beam_width=args.beam_width,
              eval_batch_size=args.eval_batch_size)

Exemplo n.º 11

0

Exibir arquivo

Arquivo: train_ctc.py Projeto: zz12375/tensorflow_end2end_speech_recognition

def main(config_path, model_save_path):

    # Load a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.load(f)
        params = config['param']

    # Except for a blank class
    if params['feature'] == 'fbank':
        input_size = 123
    elif params['feature'] == 'is13':
        input_size = 141

    if params['label_type'] in ['original', 'phone3']:
        params['num_classes'] = 3
    elif params['label_type'] == 'phone4':
        params['num_classes'] = 4
    elif params['label_type'] == 'phone43':
        params['num_classes'] = 43

    # Model setting
    model = CTC(encoder_type=params['encoder_type'],
                input_size=input_size * params['num_stack'],
                splice=params['splice'],
                num_units=params['num_units'],
                num_layers=params['num_layers'],
                num_classes=params['num_classes'],
                lstm_impl=params['lstm_impl'],
                use_peephole=params['use_peephole'],
                parameter_init=params['weight_init'],
                clip_grad_norm=params['clip_grad_norm'],
                clip_activation=params['clip_activation'],
                num_proj=params['num_proj'],
                weight_decay=params['weight_decay'])

    # Set process name
    setproctitle('tf_svc_' + model.name + '_' + params['label_type'])

    model.name += '_' + str(params['num_units'])
    model.name += '_' + str(params['num_layers'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    if params['num_proj'] != 0:
        model.name += '_proj' + str(params['num_proj'])
    if params['dropout'] != 0:
        model.name += '_drop' + str(params['dropout'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += '_wd' + str(params['weight_decay'])

    # Set save path
    model.save_path = mkdir_join(
        model_save_path, 'ctc', params['label_type'], model.name)

    # Reset model directory
    model_index = 0
    new_model_path = model.save_path
    while True:
        if isfile(join(new_model_path, 'complete.txt')):
            # Training of the first model have been finished
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        elif isfile(join(new_model_path, 'config.yml')):
            # Training of the first model have not been finished yet
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        else:
            break
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    sys.stdout = open(join(model.save_path, 'train.log'), 'w')
    # TODO(hirofumi): change to logger
    do_train(model=model, params=params)

Exemplo n.º 12

0

Exibir arquivo

Arquivo: test_ctc_decoder.py Projeto: seasky100/tensorflow_end2end_speech_recognition

    def check(self, decoder_type):

        print('==================================================')
        print('  decoder_type: %s' % decoder_type)
        print('==================================================')

        tf.reset_default_graph()
        with tf.Graph().as_default():
            # Load batch data
            batch_size = 2
            num_stack = 2
            inputs, labels, inputs_seq_len = generate_data(
                label_type='character',
                model='ctc',
                batch_size=batch_size,
                num_stack=num_stack,
                splice=1)
            max_time = inputs.shape[1]

            # Define model graph
            model = CTC(encoder_type='blstm',
                        input_size=inputs[0].shape[-1],
                        splice=1,
                        num_stack=num_stack,
                        num_units=256,
                        num_layers=2,
                        num_classes=27,
                        lstm_impl='LSTMBlockCell',
                        parameter_init=0.1,
                        clip_grad_norm=5.0,
                        clip_activation=50,
                        num_proj=256,
                        weight_decay=1e-6)

            # Define placeholders
            model.create_placeholders()

            # Add to the graph each operation
            _, logits = model.compute_loss(
                model.inputs_pl_list[0],
                model.labels_pl_list[0],
                model.inputs_seq_len_pl_list[0],
                model.keep_prob_pl_list[0])
            beam_width = 20 if 'beam_search' in decoder_type else 1
            decode_op = model.decoder(logits,
                                      model.inputs_seq_len_pl_list[0],
                                      beam_width=beam_width)
            ler_op = model.compute_ler(decode_op, model.labels_pl_list[0])
            posteriors_op = model.posteriors(logits, blank_prior=1)

            if decoder_type == 'np_greedy':
                decoder = GreedyDecoder(blank_index=model.num_classes)
            elif decoder_type == 'np_beam_search':
                decoder = BeamSearchDecoder(space_index=26,
                                            blank_index=model.num_classes - 1)

            # Make feed dict
            feed_dict = {
                model.inputs_pl_list[0]: inputs,
                model.labels_pl_list[0]: list2sparsetensor(labels,
                                                           padded_value=-1),
                model.inputs_seq_len_pl_list[0]: inputs_seq_len,
                model.keep_prob_pl_list[0]: 1.0
            }

            # Create a saver for writing training checkpoints
            saver = tf.train.Saver()

            with tf.Session() as sess:
                ckpt = tf.train.get_checkpoint_state('./')

                # If check point exists
                if ckpt:
                    model_path = ckpt.model_checkpoint_path
                    saver.restore(sess, model_path)
                    print("Model restored: " + model_path)
                else:
                    raise ValueError('There are not any checkpoints.')

                if decoder_type in ['tf_greedy', 'tf_beam_search']:
                    # Decode
                    labels_pred_st = sess.run(decode_op, feed_dict=feed_dict)
                    labels_pred = sparsetensor2list(
                        labels_pred_st, batch_size=batch_size)

                    # Compute accuracy
                    cer = sess.run(ler_op, feed_dict=feed_dict)
                else:
                    # Compute CTC posteriors
                    probs = sess.run(posteriors_op, feed_dict=feed_dict)
                    probs = probs.reshape(-1, max_time, model.num_classes)

                    if decoder_type == 'np_greedy':
                        # Decode
                        labels_pred = decoder(probs=probs,
                                              seq_len=inputs_seq_len)

                    elif decoder_type == 'np_beam_search':
                        # Decode
                        labels_pred, scores = decoder(probs=probs,
                                                      seq_len=inputs_seq_len,
                                                      beam_width=beam_width)

                    # Compute accuracy
                    cer = compute_cer(str_pred=idx2alpha(labels_pred[0]),
                                      str_true=idx2alpha(labels[0]),
                                      normalize=True)

                # Visualize
                print('CER: %.3f %%' % (cer * 100))
                print('Ref: %s' % idx2alpha(labels[0]))
                print('Hyp: %s' % idx2alpha(labels_pred[0]))

Exemplo n.º 13

0

Exibir arquivo

def main(config_path, model_save_path, gpu_indices):

    # Load a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.load(f)
        params = config['param']

    # Except for a blank class
    # TODO load vocab.txt num
    if params['label_type'] == 'character':
        params['num_classes'] = 4714
    else:
        raise TypeError

    # Model setting
    model = CTC(encoder_type=params['encoder_type'],
                input_size=params['input_size'],
                splice=params['splice'],
                num_stack=params['num_stack'],
                num_units=params['num_units'],
                num_layers=params['num_layers'],
                num_classes=params['num_classes'],
                lstm_impl=params['lstm_impl'],
                use_peephole=params['use_peephole'],
                parameter_init=params['weight_init'],
                clip_grad_norm=params['clip_grad_norm'],
                clip_activation=params['clip_activation'],
                num_proj=params['num_proj'],
                weight_decay=params['weight_decay'])

    # Set process name
    setproctitle('tf_libri_' + model.name + '_' + params['train_data_size'] +
                 '_' + params['label_type'])

    model.name += '_' + str(params['num_units'])
    model.name += '_' + str(params['num_layers'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    if params['num_proj'] != 0:
        model.name += '_proj' + str(params['num_proj'])
    if params['dropout'] != 0:
        model.name += '_drop' + str(params['dropout'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += '_wd' + str(params['weight_decay'])
    if params['bottleneck_dim'] != 0:
        model.name += '_bottle' + str(params['bottleneck_dim'])
    if len(gpu_indices) >= 2:
        model.name += '_gpu' + str(len(gpu_indices))

    # Set save path
    model.save_path = mkdir_join(model_save_path, 'ctc', params['label_type'],
                                 params['train_data_size'], model.name)

    # Reset model directory
    model_index = 0
    new_model_path = model.save_path
    while True:
        if isfile(join(new_model_path, 'complete.txt')):
            # Training of the first model have been finished
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        elif isfile(join(new_model_path, 'config.yml')):
            # Training of the first model have not been finished yet
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        else:
            break
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    #sys.stdout = open(join(model.save_path, 'train.log'), 'w')
    # TODO(hirofumi): change to logger
    do_train(model=model, params=params, gpu_indices=gpu_indices)

Exemplo n.º 14

0

Exibir arquivo

Arquivo: test_ctc_decoder.py Projeto: zz12375/tensorflow_end2end_speech_recognition

    def check(self, decoder_type):

        print('==================================================')
        print('  decoder_type: %s' % decoder_type)
        print('==================================================')

        tf.reset_default_graph()
        with tf.Graph().as_default():
            # Load batch data
            batch_size = 2
            num_stack = 2
            inputs, labels, inputs_seq_len = generate_data(
                label_type='character',
                model='ctc',
                batch_size=batch_size,
                num_stack=num_stack,
                splice=1)
            max_time = inputs.shape[1]

            # Define model graph
            model = CTC(encoder_type='blstm',
                        input_size=inputs[0].shape[-1],
                        splice=1,
                        num_stack=num_stack,
                        num_units=256,
                        num_layers=2,
                        num_classes=27,
                        lstm_impl='LSTMBlockCell',
                        parameter_init=0.1,
                        clip_grad_norm=5.0,
                        clip_activation=50,
                        num_proj=256,
                        weight_decay=1e-6)

            # Define placeholders
            model.create_placeholders()

            # Add to the graph each operation
            _, logits = model.compute_loss(model.inputs_pl_list[0],
                                           model.labels_pl_list[0],
                                           model.inputs_seq_len_pl_list[0],
                                           model.keep_prob_pl_list[0])
            beam_width = 20 if 'beam_search' in decoder_type else 1
            decode_op = model.decoder(logits,
                                      model.inputs_seq_len_pl_list[0],
                                      beam_width=beam_width)
            ler_op = model.compute_ler(decode_op, model.labels_pl_list[0])
            posteriors_op = model.posteriors(logits, blank_prior=1)

            if decoder_type == 'np_greedy':
                decoder = GreedyDecoder(blank_index=model.num_classes)
            elif decoder_type == 'np_beam_search':
                decoder = BeamSearchDecoder(space_index=26,
                                            blank_index=model.num_classes - 1)

            # Make feed dict
            feed_dict = {
                model.inputs_pl_list[0]: inputs,
                model.labels_pl_list[0]: list2sparsetensor(labels,
                                                           padded_value=-1),
                model.inputs_seq_len_pl_list[0]: inputs_seq_len,
                model.keep_prob_pl_list[0]: 1.0
            }

            # Create a saver for writing training checkpoints
            saver = tf.train.Saver()

            with tf.Session() as sess:
                ckpt = tf.train.get_checkpoint_state('./')

                # If check point exists
                if ckpt:
                    model_path = ckpt.model_checkpoint_path
                    saver.restore(sess, model_path)
                    print("Model restored: " + model_path)
                else:
                    raise ValueError('There are not any checkpoints.')

                if decoder_type in ['tf_greedy', 'tf_beam_search']:
                    # Decode
                    labels_pred_st = sess.run(decode_op, feed_dict=feed_dict)
                    labels_pred = sparsetensor2list(labels_pred_st,
                                                    batch_size=batch_size)

                    # Compute accuracy
                    cer = sess.run(ler_op, feed_dict=feed_dict)
                else:
                    # Compute CTC posteriors
                    probs = sess.run(posteriors_op, feed_dict=feed_dict)
                    probs = probs.reshape(-1, max_time, model.num_classes)

                    if decoder_type == 'np_greedy':
                        # Decode
                        labels_pred = decoder(probs=probs,
                                              seq_len=inputs_seq_len)

                    elif decoder_type == 'np_beam_search':
                        # Decode
                        labels_pred, scores = decoder(probs=probs,
                                                      seq_len=inputs_seq_len,
                                                      beam_width=beam_width)

                    # Compute accuracy
                    cer = compute_cer(str_pred=idx2alpha(labels_pred[0]),
                                      str_true=idx2alpha(labels[0]),
                                      normalize=True)

                # Visualize
                print('CER: %.3f %%' % (cer * 100))
                print('Ref: %s' % idx2alpha(labels[0]))
                print('Hyp: %s' % idx2alpha(labels_pred[0]))

Exemplo n.º 15

0

Exibir arquivo

Arquivo: train_ctc.py Projeto: seasky100/tensorflow_end2end_speech_recognition

def main(config_path, model_save_path, gpu_indices):

    # Load a config file (.yml)
    with open(config_path, "r") as f:
        config = yaml.load(f)
        params = config['param']

    # Except for a blank label
    if params['label_type'] == 'kana':
        params['num_classes'] = 146
    elif params['label_type'] == 'kana_divide':
        params['num_classes'] = 147
    elif params['label_type'] == 'kanji':
        if params['train_data_size'] == 'train_subset':
            params['num_classes'] = 2981
        elif params['train_data_size'] == 'train_fullset':
            params['num_classes'] = 3385
    elif params['label_type'] == 'kanji_divide':
        if params['train_data_size'] == 'train_subset':
            params['num_classes'] = 2982
        elif params['train_data_size'] == 'train_fullset':
            params['num_classes'] = 3386
    else:
        raise TypeError

    # Model setting
    model = CTC(encoder_type=params['encoder_type'],
                input_size=params['input_size'],
                splice=params['splice'],
                num_stack=params['num_stack'],
                num_units=params['num_units'],
                num_layers=params['num_layers'],
                num_classes=params['num_classes'],
                lstm_impl=params['lstm_impl'],
                use_peephole=params['use_peephole'],
                parameter_init=params['weight_init'],
                clip_grad_norm=params['clip_grad_norm'],
                clip_activation=params['clip_activation'],
                num_proj=params['num_proj'],
                weight_decay=params['weight_decay'])

    # Set process name
    setproctitle(
        'tf_csj_' + model.name + '_' + params['train_data_size'] + '_' + params['label_type'])

    model.name += '_' + str(params['num_units'])
    model.name += '_' + str(params['num_layers'])
    model.name += '_' + params['optimizer']
    model.name += '_lr' + str(params['learning_rate'])
    if params['num_proj'] != 0:
        model.name += '_proj' + str(params['num_proj'])
    if params['dropout'] != 0:
        model.name += '_drop' + str(params['dropout'])
    if params['num_stack'] != 1:
        model.name += '_stack' + str(params['num_stack'])
    if params['weight_decay'] != 0:
        model.name += '_wd' + str(params['weight_decay'])
    if params['bottleneck_dim'] != 0:
        model.name += '_bottle' + str(params['bottleneck_dim'])
    if len(gpu_indices) >= 2:
        model.name += '_gpu' + str(len(gpu_indices))

    # Set save path
    model.save_path = mkdir_join(
        model_save_path, 'ctc', params['label_type'],
        params['train_data_size'], model.name)

    # Reset model directory
    model_index = 0
    new_model_path = model.save_path
    while True:
        if isfile(join(new_model_path, 'complete.txt')):
            # Training of the first model have been finished
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        elif isfile(join(new_model_path, 'config.yml')):
            # Training of the first model have not been finished yet
            model_index += 1
            new_model_path = model.save_path + '_' + str(model_index)
        else:
            break
    model.save_path = mkdir(new_model_path)

    # Save config file
    shutil.copyfile(config_path, join(model.save_path, 'config.yml'))

    sys.stdout = open(join(model.save_path, 'train.log'), 'w')
    # TODO(hirofumi): change to logger
    do_train(model=model, params=params, gpu_indices=gpu_indices)

Exemplo n.º 16

0

Exibir arquivo

Arquivo: test_ctc.py Projeto: seasky100/tensorflow_end2end_speech_recognition

    def check(self, encoder_type, label_type='character',
              lstm_impl=None, time_major=True, save_params=False):

        print('==================================================')
        print('  encoder_type: %s' % encoder_type)
        print('  label_type: %s' % label_type)
        print('  lstm_impl: %s' % lstm_impl)
        print('  time_major: %s' % str(time_major))
        print('  save_params: %s' % str(save_params))
        print('==================================================')

        tf.reset_default_graph()
        with tf.Graph().as_default():
            # Load batch data
            batch_size = 2
            splice = 11 if encoder_type in ['vgg_blstm', 'vgg_lstm', 'cnn_zhang',
                                            'vgg_wang', 'resnet_wang', 'cldnn_wang'] else 1
            num_stack = 2
            inputs, labels, inputs_seq_len = generate_data(
                label_type=label_type,
                model='ctc',
                batch_size=batch_size,
                num_stack=num_stack,
                splice=splice)
            # NOTE: input_size must be even number when using CudnnLSTM

            # Define model graph
            num_classes = 27 if label_type == 'character' else 61
            model = CTC(encoder_type=encoder_type,
                        input_size=inputs[0].shape[-1] // splice // num_stack,
                        splice=splice,
                        num_stack=num_stack,
                        num_units=256,
                        num_layers=2,
                        num_classes=num_classes,
                        lstm_impl=lstm_impl,
                        parameter_init=0.1,
                        clip_grad_norm=5.0,
                        clip_activation=50,
                        num_proj=256,
                        weight_decay=1e-10,
                        # bottleneck_dim=50,
                        bottleneck_dim=None,
                        time_major=time_major)

            # Define placeholders
            model.create_placeholders()
            learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate')

            # Add to the graph each operation
            loss_op, logits = model.compute_loss(
                model.inputs_pl_list[0],
                model.labels_pl_list[0],
                model.inputs_seq_len_pl_list[0],
                model.keep_prob_pl_list[0])
            train_op = model.train(loss_op,
                                   optimizer='nestrov',
                                   learning_rate=learning_rate_pl)
            # NOTE: Adam does not run on CudnnLSTM
            decode_op = model.decoder(logits,
                                      model.inputs_seq_len_pl_list[0],
                                      beam_width=20)
            ler_op = model.compute_ler(decode_op, model.labels_pl_list[0])

            # Define learning rate controller
            learning_rate = 1e-4
            lr_controller = Controller(learning_rate_init=learning_rate,
                                       decay_start_epoch=50,
                                       decay_rate=0.9,
                                       decay_patient_epoch=10,
                                       lower_better=True)

            if save_params:
                # Create a saver for writing training checkpoints
                saver = tf.train.Saver(max_to_keep=None)

            # Add the variable initializer operation
            init_op = tf.global_variables_initializer()

            # Count total parameters
            if lstm_impl != 'CudnnLSTM':
                parameters_dict, total_parameters = count_total_parameters(
                    tf.trainable_variables())
                for parameter_name in sorted(parameters_dict.keys()):
                    print("%s %d" %
                          (parameter_name, parameters_dict[parameter_name]))
                print("Total %d variables, %s M parameters" %
                      (len(parameters_dict.keys()),
                       "{:,}".format(total_parameters / 1000000)))

            # Make feed dict
            feed_dict = {
                model.inputs_pl_list[0]: inputs,
                model.labels_pl_list[0]: list2sparsetensor(labels, padded_value=-1),
                model.inputs_seq_len_pl_list[0]: inputs_seq_len,
                model.keep_prob_pl_list[0]: 1.0,
                learning_rate_pl: learning_rate
            }

            idx2phone = Idx2phone(map_file_path='./phone61.txt')

            with tf.Session() as sess:
                # Initialize parameters
                sess.run(init_op)

                # Wrapper for tfdbg
                # sess = tf_debug.LocalCLIDebugWrapperSession(sess)

                # Train model
                max_steps = 1000
                start_time_step = time.time()
                for step in range(max_steps):

                    # for debug
                    # encoder_outputs = sess.run(
                    #     model.encoder_outputs, feed_dict)
                    # print(encoder_outputs.shape)

                    # Compute loss
                    _, loss_train = sess.run(
                        [train_op, loss_op], feed_dict=feed_dict)

                    # Gradient check
                    # grads = sess.run(model.clipped_grads,
                    #                  feed_dict=feed_dict)
                    # for grad in grads:
                    #     print(np.max(grad))

                    if (step + 1) % 10 == 0:
                        # Change to evaluation mode
                        feed_dict[model.keep_prob_pl_list[0]] = 1.0

                        # Compute accuracy
                        ler_train = sess.run(ler_op, feed_dict=feed_dict)

                        duration_step = time.time() - start_time_step
                        print('Step %d: loss = %.3f / ler = %.3f (%.3f sec) / lr = %.5f' %
                              (step + 1, loss_train, ler_train, duration_step, learning_rate))
                        start_time_step = time.time()

                        # Decode
                        labels_pred_st = sess.run(
                            decode_op, feed_dict=feed_dict)

                        # Visualize
                        try:
                            labels_pred = sparsetensor2list(
                                labels_pred_st, batch_size=batch_size)
                            if label_type == 'character':
                                print('Ref: %s' % idx2alpha(labels[0]))
                                print('Hyp: %s' % idx2alpha(labels_pred[0]))
                            else:
                                print('Ref: %s' % idx2phone(labels[0]))
                                print('Hyp: %s' % idx2phone(labels_pred[0]))

                        except IndexError:
                            if label_type == 'character':
                                print('Ref: %s' % idx2alpha(labels[0]))
                                print('Hyp: %s' % '')
                            else:
                                print('Ref: %s' % idx2phone(labels[0]))
                                print('Hyp: %s' % '')
                            # NOTE: This is for no prediction

                        if ler_train < 0.1:
                            print('Modle is Converged.')
                            if save_params:
                                # Save model (check point)
                                checkpoint_file = './model.ckpt'
                                save_path = saver.save(
                                    sess, checkpoint_file, global_step=2)
                                print("Model saved in file: %s" % save_path)
                            break

                        # Update learning rate
                        learning_rate = lr_controller.decay_lr(
                            learning_rate=learning_rate,
                            epoch=step,
                            value=ler_train)
                        feed_dict[learning_rate_pl] = learning_rate