Example #1
def train_lm(clusterfile,
             job_name,
             task_index,
             ssh_command,
             expdir):

    ''' does everything for language model training

    Args:
        clusterfile: the file where all the machines in the cluster are
            specified; if None, local training will be done
        job_name: one of ps or worker in the case of distributed training
        task_index: the task index in this job
        ssh_command: the command to use for ssh, if 'None' no tunnel will be
            created
        expdir: the experiments directory
    '''
    #read the database config file
    parsed_database_cfg = configparser.ConfigParser()
    parsed_database_cfg.read(expdir + '/database.cfg')
    database_cfg = dict(parsed_database_cfg.items('database'))

    #read the lm config file
    parsed_nnet_cfg = configparser.ConfigParser()
    parsed_nnet_cfg.read(expdir + '/model/lm.cfg')
    nnet_cfg = dict(parsed_nnet_cfg.items('lm'))

    #read the trainer config file
    parsed_trainer_cfg = configparser.ConfigParser()
    parsed_trainer_cfg.read(expdir + '/trainer.cfg')
    trainer_cfg = dict(parsed_trainer_cfg.items('trainer'))

    #read the decoder config file
    parsed_decoder_cfg = configparser.ConfigParser()
    parsed_decoder_cfg.read(expdir + '/model/decoder.cfg')
    decoder_cfg = dict(parsed_decoder_cfg.items('decoder'))

    #create the cluster and server
    server = create_server.create_server(
        clusterfile=clusterfile,
        job_name=job_name,
        task_index=task_index,
        expdir=expdir,
        ssh_command=ssh_command)

    #copy the alphabet to the model
    if (job_name == 'ps' and task_index == 0) or job_name == 'local':
        shutil.copyfile(os.path.join(database_cfg['train_dir'], 'alphabet'),
                        os.path.join(expdir, 'model', 'alphabet'))

    #the ps should just wait
    if job_name == 'ps':
        server.join()

    #create the coder
    with open(os.path.join(database_cfg['train_dir'], 'alphabet')) as fid:
        alphabet = fid.read().split(' ')
    coder = target_coder.TargetCoder(alphabet)

    #read the number of utterances
    with open(os.path.join(database_cfg['train_dir'], 'numlines')) as fid:
        num_utt = int(fid.read())

    #read the maximum length
    with open(os.path.join(database_cfg['train_dir'], 'max_num_chars')) as fid:
        max_length = int(fid.read())

    #create a batch dispenser for the training data
    dispenser = batchdispenser.LmBatchDispenser(
        target_coder=coder,
        size=int(trainer_cfg['batch_size']),
        textfile=os.path.join(database_cfg['train_dir'], 'text'),
        max_length=max_length,
        num_utt=num_utt)

    #create a reader for the validation data
    if 'dev_dir' in database_cfg:

        #read the maximum length
        with open(os.path.join(database_cfg['dev_dir'],
                               'max_num_chars')) as fid:
            max_length = int(fid.read())

        #create a text reader for the validation data
        val_reader = text_reader.TextReader(
            textfile=os.path.join(database_cfg['dev_dir'], 'text'),
            max_length=max_length,
            coder=coder)

        val_targets = val_reader.as_dict()

    else:
        if int(trainer_cfg['valid_utt']) > 0:
            val_dispenser = dispenser.split(int(trainer_cfg['valid_utt']))
            val_reader = val_dispenser.textreader
            val_targets = val_reader.as_dict()
        else:
            val_reader = None
            val_targets = None

    #encode the validation targets
    if val_targets is not None:
        for utt in val_targets:
            val_targets[utt] = dispenser.textreader.coder.encode(
                val_targets[utt])

    #create the classifier
    classifier = lm_factory.factory(
        conf=nnet_cfg,
        output_dim=coder.num_labels)

    #create the callable for the decoder
    decoder = partial(
        decoder_factory.factory,
        conf=decoder_cfg,
        classifier=classifier,
        input_dim=1,
        max_input_length=val_reader.max_length,
        coder=coder,
        expdir=expdir)

    #create the trainer
    tr = trainer_factory.factory(
        conf=trainer_cfg,
        decoder=decoder,
        classifier=classifier,
        input_dim=1,
        reconstruction_dim=1,
        dispenser=dispenser,
        val_reader=val_reader,
        val_targets=val_targets,
        expdir=expdir,
        server=server,
        task_index=task_index)

    #train the classifier
    tr.train()
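
For context, a minimal sketch of how this function might be invoked for single-machine training. The entry point and the expdir value are hypothetical; only the argument semantics come from the docstring above.

if __name__ == '__main__':
    # hypothetical invocation for local (non-distributed) training:
    # passing clusterfile=None means no parameter server or workers are used
    train_lm(clusterfile=None,
             job_name='local',
             task_index=0,
             ssh_command='None',          # no ssh tunnel
             expdir='exp/lm_experiment')  # hypothetical experiments directory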
Example #2
File: test.py Project: uniq10/nabu
def main(_):
    '''does everything for testing'''

    decoder_cfg_file = None

    #read the database config file
    parsed_database_cfg = configparser.ConfigParser()
    parsed_database_cfg.read(os.path.join(FLAGS.asr_expdir, 'database.cfg'))
    database_cfg = dict(parsed_database_cfg.items('database'))

    #read the features config file
    parsed_feat_cfg = configparser.ConfigParser()
    parsed_feat_cfg.read(
        os.path.join(FLAGS.asr_expdir, 'model', 'features.cfg'))
    feat_cfg = dict(parsed_feat_cfg.items('features'))

    #read the asr config file
    parsed_asr_cfg = configparser.ConfigParser()
    parsed_asr_cfg.read(os.path.join(FLAGS.asr_expdir, 'model', 'asr.cfg'))
    asr_cfg = dict(parsed_asr_cfg.items('asr'))

    #read the lm config file
    parsed_lm_cfg = configparser.ConfigParser()
    parsed_lm_cfg.read(os.path.join(FLAGS.lm_expdir, 'model', 'lm.cfg'))
    lm_cfg = dict(parsed_lm_cfg.items('lm'))

    #read the asr-lm config file
    parsed_asr_lm_cfg = configparser.ConfigParser()
    parsed_asr_lm_cfg.read('config/asr_lm.cfg')
    asr_lm_cfg = dict(parsed_asr_lm_cfg.items('asr-lm'))

    #read the decoder config file
    if decoder_cfg_file is None:
        decoder_cfg_file = os.path.join(FLAGS.asr_expdir, 'model',
                                        'decoder.cfg')
    parsed_decoder_cfg = configparser.ConfigParser()
    parsed_decoder_cfg.read(decoder_cfg_file)
    decoder_cfg = dict(parsed_decoder_cfg.items('decoder'))

    #create a feature reader
    featdir = os.path.join(database_cfg['test_dir'], feat_cfg['name'])

    with open(os.path.join(featdir, 'maxlength'), 'r') as fid:
        max_length = int(fid.read())

    reader = feature_reader.FeatureReader(
        scpfile=os.path.join(featdir, 'feats.scp'),
        cmvnfile=os.path.join(featdir, 'cmvn.scp'),
        utt2spkfile=os.path.join(featdir, 'utt2spk'),
        max_length=max_length)

    #read the feature dimension
    with open(os.path.join(database_cfg['train_dir'], feat_cfg['name'], 'dim'),
              'r') as fid:
        input_dim = int(fid.read())

    #create the coder
    with open(os.path.join(database_cfg['train_dir'], 'alphabet')) as fid:
        alphabet = fid.read().split(' ')
    coder = target_coder.TargetCoder(alphabet)


    #create the classifier
    classifier = asr_lm_classifier.AsrLmClassifier(
        conf=asr_lm_cfg,
        asr_conf=asr_cfg,
        lm_conf=lm_cfg,
        output_dim=coder.num_labels)

    #create a decoder
    graph = tf.Graph()
    with graph.as_default():
        decoder = decoder_factory.factory(
            conf=decoder_cfg,
            classifier=classifier,
            input_dim=input_dim,
            max_input_length=reader.max_length,
            coder=coder,
            expdir=FLAGS.asr_expdir)


        #create the lm saver
        varnames = zip(*checkpoint_utils.list_variables(os.path.join(
            FLAGS.lm_expdir, 'model', 'network.ckpt')))[0]
        variables = [v for v in tf.all_variables()
                     if v.name.split(':')[0] in varnames]
        lm_saver = tf.train.Saver(variables)

        #create the asr saver
        varnames = zip(*checkpoint_utils.list_variables(os.path.join(
            FLAGS.asr_expdir, 'model', 'network.ckpt')))[0]
        variables = [v for v in tf.all_variables()
                     if v.name.split(':')[0] in varnames]
        asr_saver = tf.train.Saver(variables)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    config.allow_soft_placement = True

    with tf.Session(graph=graph, config=config) as sess:
        #load the lm model
        lm_saver.restore(
            sess, os.path.join(FLAGS.lm_expdir, 'model', 'network.ckpt'))

        #load the asr model
        asr_saver.restore(
            sess, os.path.join(FLAGS.asr_expdir, 'model', 'network.ckpt'))

        #decode with the neural net
        decoded = decoder.decode(reader, sess)

    #the path to the text file
    textfile = database_cfg['testtext']

    #read all the reference transcriptions
    with open(textfile) as fid:
        lines = fid.readlines()

    references = dict()
    for line in lines:
        splitline = line.strip().split(' ')
        references[splitline[0]] = ' '.join(splitline[1:])

    #compute the character error rate
    score = decoder.score(decoded, references)

    print 'score: %f' % score
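
The two savers above restore only the subset of graph variables that actually appears in each checkpoint. A minimal, self-contained sketch of that pattern, assuming the TF 1.x contrib location of checkpoint_utils and the same deprecated tf.all_variables call used in the script:

import tensorflow as tf
# assumed import; checkpoint_utils lives here in the TF 1.x contrib framework
from tensorflow.contrib.framework.python.framework import checkpoint_utils

def saver_for_checkpoint(ckpt_path):
    '''build a Saver restricted to the variables stored in ckpt_path'''
    # list_variables returns (name, shape) pairs for everything in the checkpoint
    ckpt_varnames = set(name for name, _ in
                        checkpoint_utils.list_variables(ckpt_path))
    # keep only the graph variables whose name (without the ':0' suffix)
    # occurs in the checkpoint
    variables = [v for v in tf.all_variables()
                 if v.name.split(':')[0] in ckpt_varnames]
    return tf.train.Saver(variables)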
Example #3
def main(_):
    '''does everything for testing'''

    decoder_cfg_file = 'config/decoder/attention_visualizer.cfg'

    #read the database config file
    parsed_database_cfg = configparser.ConfigParser()
    parsed_database_cfg.read(os.path.join(FLAGS.expdir, 'database.cfg'))
    database_cfg = dict(parsed_database_cfg.items('database'))

    #read the features config file
    parsed_feat_cfg = configparser.ConfigParser()
    parsed_feat_cfg.read(os.path.join(FLAGS.expdir, 'model', 'features.cfg'))
    feat_cfg = dict(parsed_feat_cfg.items('features'))

    #read the asr config file
    parsed_nnet_cfg = configparser.ConfigParser()
    parsed_nnet_cfg.read(os.path.join(FLAGS.expdir, 'model', 'asr.cfg'))
    nnet_cfg = dict(parsed_nnet_cfg.items('asr'))

    #read the decoder config file
    if decoder_cfg_file is None:
        decoder_cfg_file = os.path.join(FLAGS.expdir, 'model', 'decoder.cfg')
    parsed_decoder_cfg = configparser.ConfigParser()
    parsed_decoder_cfg.read(decoder_cfg_file)
    decoder_cfg = dict(parsed_decoder_cfg.items('decoder'))

    #create a feature reader
    featdir = os.path.join(database_cfg['test_dir'], feat_cfg['name'])

    with open(os.path.join(featdir, 'maxlength'), 'r') as fid:
        max_length = int(fid.read())

    reader = feature_reader.FeatureReader(
        scpfile=os.path.join(featdir, 'feats.scp'),
        cmvnfile=os.path.join(featdir, 'cmvn.scp'),
        utt2spkfile=os.path.join(featdir, 'utt2spk'),
        max_length=max_length)

    #read the feature dimension
    with open(os.path.join(database_cfg['train_dir'], feat_cfg['name'], 'dim'),
              'r') as fid:

        input_dim = int(fid.read())

    #create the coder
    with open(os.path.join(database_cfg['train_dir'], 'alphabet')) as fid:
        alphabet = fid.read().split(' ')
    coder = target_coder.TargetCoder(alphabet)

    #create the classifier
    classifier = asr_factory.factory(conf=nnet_cfg,
                                     output_dim=coder.num_labels)

    #create a decoder
    graph = tf.Graph()
    with graph.as_default():
        decoder = decoder_factory.factory(conf=decoder_cfg,
                                          classifier=classifier,
                                          input_dim=input_dim,
                                          max_input_length=reader.max_length,
                                          coder=coder,
                                          expdir=FLAGS.expdir)

        saver = tf.train.Saver(tf.trainable_variables())

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    config.allow_soft_placement = True

    with tf.Session(graph=graph, config=config) as sess:
        #load the model
        saver.restore(sess, os.path.join(FLAGS.expdir, 'model',
                                         'network.ckpt'))

        #decode with the neural net
        decoded = decoder.decode(reader, sess)

    #the path to the text file
    textfile = os.path.join(database_cfg['test_dir'], 'targets')

    #read all the reference transcriptions
    with open(textfile) as fid:
        lines = fid.readlines()

    references = dict()
    for line in lines:
        splitline = line.strip().split(' ')
        references[splitline[0]] = coder.encode(' '.join(splitline[1:]))

    #compute the character error rate
    score = decoder.score(decoded, references)

    print 'score: %f' % score

    #write the resulting beams to disk
    decodedir = os.path.join(FLAGS.expdir, 'decoded')
    if not os.path.isdir(decodedir):
        os.makedirs(decodedir)
    for utt in decoded:
        with open(os.path.join(decodedir, utt), 'w') as fid:
            for hypothesis in decoded[utt]:
                fid.write('%f\t%s\n' % (hypothesis[0], hypothesis[1]))
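
As a follow-up, a small sketch of how the beam files written above could be read back to select the best-scoring hypothesis per utterance. It only assumes the tab-separated score/hypothesis line format produced by the loop above.

import os

def read_best_hypotheses(decodedir):
    '''read the per-utterance beam files and keep the highest-scoring line'''
    best = dict()
    for utt in os.listdir(decodedir):
        with open(os.path.join(decodedir, utt)) as fid:
            # every line holds a score and a hypothesis separated by a tab
            beams = [line.rstrip('\n').split('\t', 1) for line in fid]
        best[utt] = max(beams, key=lambda beam: float(beam[0]))[1]
    return best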
Example #4
def main(_):
    '''does everything for testing'''

    decoder_cfg_file = None

    #read the database config file
    parsed_database_cfg = configparser.ConfigParser()
    parsed_database_cfg.read(os.path.join(FLAGS.expdir, 'database.cfg'))
    database_cfg = dict(parsed_database_cfg.items('database'))

    #read the lm config file
    parsed_nnet_cfg = configparser.ConfigParser()
    parsed_nnet_cfg.read(os.path.join(FLAGS.expdir, 'model', 'lm.cfg'))
    nnet_cfg = dict(parsed_nnet_cfg.items('lm'))

    #read the decoder config file
    if decoder_cfg_file is None:
        decoder_cfg_file = os.path.join(FLAGS.expdir, 'model', 'decoder.cfg')
    parsed_decoder_cfg = configparser.ConfigParser()
    parsed_decoder_cfg.read(decoder_cfg_file)
    decoder_cfg = dict(parsed_decoder_cfg.items('decoder'))

    #create the coder
    with open(os.path.join(FLAGS.expdir, 'model', 'alphabet')) as fid:
        alphabet = fid.read().split(' ')
    coder = target_coder.TargetCoder(alphabet)

    #read the maximum length
    with open(os.path.join(database_cfg['test_dir'], 'max_num_chars')) as fid:
        max_length = int(fid.read())

    #create a text reader
    textreader = text_reader.TextReader(
        textfile=os.path.join(database_cfg['test_dir'], 'text'),
        max_length=max_length,
        coder=coder)

    #create the classifier
    classifier = lm_factory.factory(conf=nnet_cfg, output_dim=coder.num_labels)

    #create a decoder
    graph = tf.Graph()
    with graph.as_default():
        decoder = decoder_factory.factory(conf=decoder_cfg,
                                          classifier=classifier,
                                          input_dim=1,
                                          max_input_length=max_length,
                                          coder=coder,
                                          expdir=FLAGS.expdir)

        saver = tf.train.Saver(tf.trainable_variables())

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  #pylint: disable=E1101
    config.allow_soft_placement = True

    with tf.Session(graph=graph, config=config) as sess:
        #load the model
        saver.restore(sess, os.path.join(FLAGS.expdir, 'model',
                                         'network.ckpt'))

        #decode with the neural net
        decoded = decoder.decode(textreader, sess)

    #compute the perplexity
    score = decoder.score(decoded, None)

    print 'perplexity: %f' % score
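
For reference, perplexity is conventionally the exponential of the negative average log-likelihood per target symbol. A hedged sketch of that computation; the exact definition used inside decoder.score is not shown in this example.

import math

def perplexity(logprobs, num_symbols):
    '''logprobs: total natural-log likelihood per utterance (floats)
    num_symbols: number of scored symbols (e.g. characters) per utterance'''
    total_logprob = sum(logprobs.values())
    total_symbols = sum(num_symbols.values())
    return math.exp(-total_logprob / total_symbols)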
Example #5
def train_asr(clusterfile, job_name, task_index, ssh_command, expdir):
    ''' does everything for asr training

    Args:
        clusterfile: the file where all the machines in the cluster are
            specified; if None, local training will be done
        job_name: one of ps or worker in the case of distributed training
        task_index: the task index in this job
        ssh_command: the command to use for ssh, if 'None' no tunnel will be
            created
        expdir: the experiments directory
    '''

    #read the database config file
    parsed_database_cfg = configparser.ConfigParser()
    parsed_database_cfg.read(os.path.join(expdir, 'database.cfg'))
    database_cfg = dict(parsed_database_cfg.items('database'))

    #read the features config file
    parsed_feat_cfg = configparser.ConfigParser()
    parsed_feat_cfg.read(os.path.join(expdir, 'model', 'features.cfg'))
    feat_cfg = dict(parsed_feat_cfg.items('features'))

    #read the asr config file
    parsed_nnet_cfg = configparser.ConfigParser()
    parsed_nnet_cfg.read(os.path.join(expdir, 'model', 'asr.cfg'))
    nnet_cfg = dict(parsed_nnet_cfg.items('asr'))

    #read the trainer config file
    parsed_trainer_cfg = configparser.ConfigParser()
    parsed_trainer_cfg.read(os.path.join(expdir, 'trainer.cfg'))
    trainer_cfg = dict(parsed_trainer_cfg.items('trainer'))

    #read the decoder config file
    parsed_decoder_cfg = configparser.ConfigParser()
    parsed_decoder_cfg.read(os.path.join(expdir, 'model', 'decoder.cfg'))
    decoder_cfg = dict(parsed_decoder_cfg.items('decoder'))

    #create the cluster and server
    server = create_server.create_server(clusterfile=clusterfile,
                                         job_name=job_name,
                                         task_index=task_index,
                                         expdir=expdir,
                                         ssh_command=ssh_command)

    #the ps should just wait
    if job_name == 'ps':
        server.join()

    featdir = os.path.join(database_cfg['train_dir'], feat_cfg['name'])

    #create the coder
    with open(os.path.join(database_cfg['train_dir'], 'alphabet')) as fid:
        alphabet = fid.read().split(' ')
    coder = target_coder.TargetCoder(alphabet)

    #create a feature reader for the training data
    with open(featdir + '/maxlength', 'r') as fid:
        max_length = int(fid.read())

    featreader = feature_reader.FeatureReader(
        scpfile=featdir + '/feats_shuffled.scp',
        cmvnfile=featdir + '/cmvn.scp',
        utt2spkfile=featdir + '/utt2spk',
        max_length=max_length)

    #read the feature dimension
    with open(featdir + '/dim', 'r') as fid:
        input_dim = int(fid.read())

    #the path to the text file
    textfile = os.path.join(database_cfg['train_dir'], 'targets')

    #create a batch dispenser for the training data
    dispenser = batchdispenser.AsrBatchDispenser(
        feature_reader=featreader,
        target_coder=coder,
        size=int(trainer_cfg['batch_size']),
        target_path=textfile)

    #create a reader for the validation data
    if 'dev_data' in database_cfg:
        featdir = database_cfg['dev_dir'] + '/' + feat_cfg['name']

        with open(featdir + '/maxlength', 'r') as fid:
            max_length = int(fid.read())

        val_reader = feature_reader.FeatureReader(
            scpfile=featdir + '/feats.scp',
            cmvnfile=featdir + '/cmvn.scp',
            utt2spkfile=featdir + '/utt2spk',
            max_length=max_length)

        textfile = os.path.join(database_cfg['dev_dir'], 'targets')

        #read the validation targets
        with open(textfile) as fid:
            lines = fid.readlines()

        val_targets = dict()
        for line in lines:
            splitline = line.strip().split(' ')
            val_targets[splitline[0]] = ' '.join(splitline[1:])

    else:
        if int(trainer_cfg['valid_utt']) > 0:
            val_dispenser = dispenser.split(int(trainer_cfg['valid_utt']))
            val_reader = val_dispenser.feature_reader
            val_targets = val_dispenser.target_dict
        else:
            val_reader = None
            val_targets = None

    #encode the validation targets
    if val_targets is not None:
        for utt in val_targets:
            val_targets[utt] = dispenser.target_coder.encode(val_targets[utt])

    #create the classifier
    classifier = asr_factory.factory(conf=nnet_cfg,
                                     output_dim=coder.num_labels)

    #create the callable for the decoder
    decoder = partial(decoder_factory.factory,
                      conf=decoder_cfg,
                      classifier=classifier,
                      input_dim=input_dim,
                      max_input_length=val_reader.max_length,
                      coder=coder,
                      expdir=expdir)

    #create the trainer
    tr = trainer_factory.factory(conf=trainer_cfg,
                                 decoder=decoder,
                                 classifier=classifier,
                                 input_dim=input_dim,
                                 dispenser=dispenser,
                                 val_reader=val_reader,
                                 val_targets=val_targets,
                                 expdir=expdir,
                                 server=server,
                                 task_index=task_index)

    print 'starting training'

    #train the classifier
    tr.train()
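
For illustration, a minimal sketch of how the trainer.cfg section read above could be written. It only covers the two options this snippet reads directly; the trainer factory needs further options not shown here, and the path and values are placeholders.

import configparser

cfg = configparser.ConfigParser()
cfg.add_section('trainer')
cfg.set('trainer', 'batch_size', '32')   # utterances per training batch
cfg.set('trainer', 'valid_utt', '100')   # held-out utterances when no dev set is given
with open('exp/asr_experiment/trainer.cfg', 'w') as fid:  # placeholder path
    cfg.write(fid)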
Example #6
def train_asr(clusterfile,
              job_name,
              task_index,
              ssh_command,
              expdir):

    ''' does everything for asr training

    Args:
        clusterfile: the file where all the machines in the cluster are
            specified; if None, local training will be done
        job_name: one of ps or worker in the case of distributed training
        task_index: the task index in this job
        ssh_command: the command to use for ssh, if 'None' no tunnel will be
            created
        expdir: the experiments directory
    '''

    #read the database config file
    parsed_database_cfg = configparser.ConfigParser()
    parsed_database_cfg.read(os.path.join(expdir, 'database.cfg'))
    database_cfg = dict(parsed_database_cfg.items('database'))

    #read the features config file
    parsed_feat_cfg = configparser.ConfigParser()
    parsed_feat_cfg.read(os.path.join(expdir, 'model', 'features.cfg'))
    feat_cfg = dict(parsed_feat_cfg.items('features'))

    #read the asr config file
    parsed_nnet_cfg = configparser.ConfigParser()
    parsed_nnet_cfg.read(os.path.join(expdir, 'model', 'asr.cfg'))
    nnet_cfg = dict(parsed_nnet_cfg.items('asr'))

    #read the trainer config file
    parsed_trainer_cfg = configparser.ConfigParser()
    parsed_trainer_cfg.read(os.path.join(expdir, 'trainer.cfg'))
    trainer_cfg = dict(parsed_trainer_cfg.items('trainer'))

    #read the decoder config file
    parsed_decoder_cfg = configparser.ConfigParser()
    parsed_decoder_cfg.read(os.path.join(expdir, 'model', 'decoder.cfg'))
    decoder_cfg = dict(parsed_decoder_cfg.items('decoder'))

    #distinguish between the three implemented training modes
    if database_cfg['train_mode'] == 'supervised':
        nonsupervised = False
    elif database_cfg['train_mode'] == 'nonsupervised' or\
            database_cfg['train_mode'] == 'semisupervised':
        nonsupervised = True
    else:
        raise Exception('Wrong kind of training mode')

    #when (partly) nonsupervised, determine which features are used for the
    #reconstruction; currently two options are implemented
    if nonsupervised:
        if trainer_cfg['reconstruction_features'] == 'audio_samples':
            audio_used = True
        elif trainer_cfg['reconstruction_features'] == 'input_features':
            audio_used = False
        else:
            raise Exception(
                'Unknown specification for the reconstruction features')

    #read the quant config file if nonsupervised training and samples used
    if nonsupervised:
        if audio_used:
            parsed_quant_cfg = configparser.ConfigParser()
            parsed_quant_cfg.read(os.path.join(expdir,
                                               'model', 'quantization.cfg'))
            quant_cfg = dict(parsed_quant_cfg.items('features'))

    #based on the other settings, compute and overwrite samples_per_hlfeature
    #and unpredictable_samples in the classifier config dictionary
    if nonsupervised:
        if audio_used:
            rate_after_quant = int(quant_cfg['quant_rate'])
            win_length = float(feat_cfg['winlen'])
            win_shift = float(feat_cfg['winstep'])
            samples_one_window = int(win_length*rate_after_quant)
            samples_one_shift = int(win_shift*rate_after_quant)
            #### this is only relevant when using a listener with a pyramidal
            #### structure and should be adapted otherwise
            time_compression = 2**int(nnet_cfg['listener_numlayers'])
            #store values in config dictionary
            nnet_cfg['samples_per_hlfeature'] = (
                samples_one_shift*time_compression)
            nnet_cfg['unpredictable_samples'] = (
                samples_one_window + (time_compression-1)*samples_one_shift
                - nnet_cfg['samples_per_hlfeature'])
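            # worked example (hypothetical values): with quant_rate=16000,
            # winlen=0.025, winstep=0.010 and listener_numlayers=3, this gives
            # samples_one_window=400, samples_one_shift=160, time_compression=8,
            # so samples_per_hlfeature = 160*8 = 1280 and
            # unpredictable_samples = (400 + 7*160) - 1280 = 240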


    #create the cluster and server
    server = create_server.create_server(
        clusterfile=clusterfile,
        job_name=job_name,
        task_index=task_index,
        expdir=expdir,
        ssh_command=ssh_command)

    #the ps should just wait
    if job_name == 'ps':
        server.join()

    # path to where the training samples are stored
    featdir = os.path.join(database_cfg['train_dir'], feat_cfg['name'])

    #create the coder
    with open(os.path.join(database_cfg['train_dir'], 'alphabet')) as fid:
        alphabet = fid.read().split(' ')
    coder = target_coder.TargetCoder(alphabet)

    #create a feature reader for the training data
    with open(featdir + '/maxlength', 'r') as fid:
        max_length = int(fid.read())

    featreader = feature_reader.FeatureReader(
        scpfile=featdir + '/feats_shuffled.scp',
        cmvnfile=featdir + '/cmvn.scp',
        utt2spkfile=featdir + '/utt2spk',
        max_length=max_length)

    #read the feature dimension
    with open(featdir + '/dim', 'r') as fid:
        input_dim = int(fid.read())

    #the path to the text file
    textfile = os.path.join(database_cfg['train_dir'], 'targets')


    # if nonsupervised and audio is used, we also need to read the samples;
    # this is done with a second feature reader
    if nonsupervised:
        if audio_used:
            featdir2 = os.path.join(database_cfg['train_dir'],
                                    quant_cfg['name'])

            with open(featdir2 + '/maxlength', 'r') as fid:
                max_length_audio = int(fid.read())

            audioreader = feature_reader.FeatureReader(
                scpfile=featdir2 + '/feats_shuffled.scp',
                cmvnfile=None,
                utt2spkfile=None,
                max_length=max_length_audio)

    ## create a batch dispenser, depending on which situation we're in
    if not nonsupervised:
        # in the normal supervised training mode, a regular dispenser is needed
        if 'las_ignoring_mode' in trainer_cfg:
            if trainer_cfg['las_ignoring_mode'] == 'True':
            # if we ignore unlabeled examples
                dispenser = batchdispenser.AsrTextBatchDispenser(
                    feature_reader=featreader,
                    target_coder=coder,
                    size=int(trainer_cfg['batch_size']),
                    target_path=textfile)
            elif trainer_cfg['las_ignoring_mode'] == 'False':
            # if we choose to process the unlabeled examples
                if 'fixed_ratio' in trainer_cfg:
                    if trainer_cfg['fixed_ratio'] == 'True':
                        # if we choose to process batches with a fixed
                        # labeled/unlabeled ratio
                        dispenser = \
                            batchdispenser.AsrTextBatchDispenserAltFixRatio(
                                feature_reader=featreader,
                                target_coder=coder,
                                size=int(trainer_cfg['batch_size']),
                                target_path=textfile,
                                percentage_unlabeled=1-float(
                                    database_cfg['part_labeled']))
                    elif trainer_cfg['fixed_ratio'] == 'False':
                        # if the fixed ratio is not used
                        dispenser = batchdispenser.AsrTextBatchDispenserAlt(
                            feature_reader=featreader,
                            target_coder=coder,
                            size=int(trainer_cfg['batch_size']),
                            target_path=textfile)
                    else:
                        raise Exception('wrong information in fixed_ratio var')
                else:
                # if fixed ratio is not specified, we choose to do without it
                    dispenser = batchdispenser.AsrTextBatchDispenserAlt(
                        feature_reader=featreader,
                        target_coder=coder,
                        size=int(trainer_cfg['batch_size']),
                        target_path=textfile)
            else:
                raise Exception('wrong information in LAS_ignoring_mode var')
        else:
            # if no specification is made about ignoring, ignore the unlabeled examples
            dispenser = batchdispenser.AsrTextBatchDispenser(
                feature_reader=featreader,
                target_coder=coder,
                size=int(trainer_cfg['batch_size']),
                target_path=textfile)
    else:
        # when doing (partly) nonsupervised training, extra reconstruction features are needed
        if audio_used:
        # when the audio is the reconstruction feature
            if 'fixed_ratio' in trainer_cfg:
                if trainer_cfg['fixed_ratio'] == 'True':
                    # if specified to work with fixed lab/unlab ratio batches
                    dispenser = \
                        batchdispenser.AsrTextAndAudioBatchDispenserFixRatio(
                            feature_reader=featreader,
                            audio_reader=audioreader,
                            target_coder=coder,
                            size=int(trainer_cfg['batch_size']),
                            target_path=textfile,
                            percentage_unlabeled=1-float(
                                database_cfg['part_labeled']))
                elif trainer_cfg['fixed_ratio'] == 'False':
                # if specified to not use the fixed ratio
                    dispenser = batchdispenser.AsrTextAndAudioBatchDispenser(
                        feature_reader=featreader,
                        audio_reader=audioreader,
                        target_coder=coder,
                        size=int(trainer_cfg['batch_size']),
                        target_path=textfile)
                else:
                    raise Exception('wrong information in fixed_ratio var')
            else:
            # without specification, suppose no fixed ratio batches
                dispenser = batchdispenser.AsrTextAndAudioBatchDispenser(
                    feature_reader=featreader,
                    audio_reader=audioreader,
                    target_coder=coder,
                    size=int(trainer_cfg['batch_size']),
                    target_path=textfile)
        else:
        # if no audio is used, the input features are used
            if 'fixed_ratio' in trainer_cfg:
                if trainer_cfg['fixed_ratio'] == 'True':
                    # if specified to work with fixed labeled/unlabeled ratio batches
                    dispenser = \
                        batchdispenser.AsrTextAndFeatBatchDispenserFixRatio(
                            feature_reader=featreader,
                            target_coder=coder,
                            size=int(trainer_cfg['batch_size']),
                            target_path=textfile,
                            percentage_unlabeled=1-float(
                                database_cfg['part_labeled']))
                elif trainer_cfg['fixed_ratio'] == 'False':
                # if specified to not use the fixed ratio
                    dispenser = batchdispenser.AsrTextAndFeatBatchDispenser(
                        feature_reader=featreader,
                        target_coder=coder,
                        size=int(trainer_cfg['batch_size']),
                        target_path=textfile)
                else:
                    raise Exception('wrong information in fixed_ratio var')
            else:
            # without specification, suppose no fixed ratio batches
                dispenser = batchdispenser.AsrTextAndFeatBatchDispenser(
                    feature_reader=featreader,
                    target_coder=coder,
                    size=int(trainer_cfg['batch_size']),
                    target_path=textfile)


    # read the validation data. Text targets only matter for the supervised
    # part of validation; for purely nonsupervised training we must validate
    # on the reconstructed features
    if 'dev_data' in database_cfg:
        # create a reader for the validation inputs
        featdir = database_cfg['dev_dir'] + '/' +  feat_cfg['name']

        with open(featdir + '/maxlength', 'r') as fid:
            max_length = int(fid.read())

        val_reader = feature_reader.FeatureReader(
            scpfile=featdir + '/feats.scp',
            cmvnfile=featdir + '/cmvn.scp',
            utt2spkfile=featdir + '/utt2spk',
            max_length=max_length)

        textfile = os.path.join(database_cfg['dev_dir'], 'targets')

        #read the validation text targets
        with open(textfile) as fid:
            lines = fid.readlines()

            val_text_targets = dict()
            for line in lines:
                splitline = line.strip().split(' ')
                val_text_targets[splitline[0]] = ' '.join(splitline[1:])

        if nonsupervised:
        #also store the reconstruction targets
            val_rec_targets = dict()
            if audio_used:
                audiodir = database_cfg['dev_dir'] + '/' +  quant_cfg['name']
                with open(audiodir + '/maxlength', 'r') as fid:
                    max_length_audio = int(fid.read())
                val_audio_reader = feature_reader.FeatureReader(
                    scpfile=audiodir + '/feats.scp',
                    cmvnfile=None,
                    utt2spkfile=audiodir + '/utt2spk',
                    max_length=max_length_audio)
                for _ in range(val_audio_reader.num_utt):
                    utt_id, audio, _ = val_audio_reader.get_utt()
                    val_rec_targets[utt_id] = audio
            else: #input features are used
                for _ in range(val_reader.num_utt):
                    utt_id, feat, _ = val_reader.get_utt()
                    val_rec_targets[utt_id] = feat
        else:
            with open(textfile) as fid:
                lines = fid.readlines()

                val_rec_targets = dict()
                for line in lines:
                    splitline = line.strip().split(' ')
                    val_rec_targets[splitline[0]] = None

        val_targets = dict()
        for utt_id in val_text_targets:
            val_targets[utt_id] = (val_text_targets[utt_id],
                                   val_rec_targets[utt_id])

    else:
        if int(trainer_cfg['valid_utt']) > 0:
            val_dispenser = dispenser.split(int(trainer_cfg['valid_utt']))
            val_reader = val_dispenser.feature_reader
            val_targets = val_dispenser.target_dict
        else:
            val_reader = None
            val_targets = None

    #encode the validation targets
    if val_targets is not None:
        for utt in val_targets:
            val_targets[utt] = (dispenser.target_coder.encode(
                val_targets[utt][0]), val_targets[utt][1])


    #create the classifier
    if nonsupervised:
        if audio_used:
            output_dim_second_el = int(quant_cfg['quant_levels'])
        else: # input features used
            output_dim_second_el = input_dim
    else: # only supervised training
        output_dim_second_el = None

    classifier = asr_factory.factory(
        conf=nnet_cfg,
        output_dim=(coder.num_labels, output_dim_second_el))

    #create the callable for the decoder
    decoder = partial(
        decoder_factory.factory,
        conf=decoder_cfg,
        classifier=classifier,
        input_dim=input_dim,
        max_input_length=val_reader.max_length,
        coder=coder,
        expdir=expdir)

    #create the trainer
    if nonsupervised:
        if audio_used:
            reconstruction_dim = 1
        else:
            reconstruction_dim = input_dim
    else:
        reconstruction_dim = 1

    tr = trainer_factory.factory(
        conf=trainer_cfg,
        decoder=decoder,
        classifier=classifier,
        input_dim=input_dim,
        reconstruction_dim=reconstruction_dim,
        dispenser=dispenser,
        val_reader=val_reader,
        val_targets=val_targets,
        expdir=expdir,
        server=server,
        task_index=task_index)

    print 'starting training'

    #train the classifier
    tr.train()
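
To summarize the branching above, a hedged sketch of the configuration options that steer this training function. The option names come from the lookups in the code; the values are placeholders, not project defaults.

# option names as read by train_asr above; values are placeholders
example_options = {
    'database': {
        'train_mode': 'semisupervised',  # 'supervised', 'nonsupervised' or 'semisupervised'
        'part_labeled': '0.2',           # labeled fraction, used with fixed_ratio
    },
    'trainer': {
        'reconstruction_features': 'audio_samples',  # or 'input_features'
        'las_ignoring_mode': 'False',    # 'True' to ignore unlabeled examples
        'fixed_ratio': 'True',           # fixed labeled/unlabeled ratio per batch
    },
}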
Example #7
def main(_):
    '''does everything for testing'''

    #decoder_cfg_file = 'config/decoder/attention_visualizer.cfg'
    decoder_cfg_file = None

    #read the database config file
    parsed_database_cfg = configparser.ConfigParser()
    parsed_database_cfg.read(os.path.join(FLAGS.expdir, 'database.cfg'))
    database_cfg = dict(parsed_database_cfg.items('database'))

    # check the training mode
    if database_cfg['train_mode'] == 'supervised':
        nonsupervised = False
    elif database_cfg['train_mode'] == 'semisupervised':
        nonsupervised = True
    elif database_cfg['train_mode'] == 'nonsupervised':
        raise Exception('Purely nonsupervised models should be tested with '
                        'the test_reconstruction file.')
    else:
        raise Exception('Wrong kind of training mode')

    #read the features config file
    parsed_feat_cfg = configparser.ConfigParser()
    parsed_feat_cfg.read(os.path.join(FLAGS.expdir, 'model', 'features.cfg'))
    feat_cfg = dict(parsed_feat_cfg.items('features'))

    #read the asr config file
    parsed_nnet_cfg = configparser.ConfigParser()
    parsed_nnet_cfg.read(os.path.join(FLAGS.expdir, 'model', 'asr.cfg'))
    nnet_cfg = dict(parsed_nnet_cfg.items('asr'))

    # read the trainer config file
    parsed_trainer_cfg = configparser.ConfigParser()
    parsed_trainer_cfg.read(os.path.join(FLAGS.expdir, 'trainer.cfg'))
    trainer_cfg = dict(parsed_trainer_cfg.items('trainer'))

    #read the decoder config file
    if decoder_cfg_file is None:
        decoder_cfg_file = os.path.join(FLAGS.expdir, 'model', 'decoder.cfg')
    parsed_decoder_cfg = configparser.ConfigParser()
    parsed_decoder_cfg.read(decoder_cfg_file)
    decoder_cfg = dict(parsed_decoder_cfg.items('decoder'))

    # if (partly) nonsupervised, check what kind of reconstruction features are used
    # for now two options are implemented
    if nonsupervised:
        if trainer_cfg['reconstruction_features'] == 'audio_samples':
            audio_used = True
        else:
            audio_used = False

    if nonsupervised:
        if audio_used:
            #read the quantization config file if necessary
            parsed_quant_cfg = configparser.ConfigParser()
            parsed_quant_cfg.read(os.path.join(FLAGS.expdir, 'model',
                                               'quantization.cfg'))
            quant_cfg = dict(parsed_quant_cfg.items('features'))

    #create a feature reader
    featdir = os.path.join(database_cfg['test_dir'], feat_cfg['name'])

    with open(os.path.join(featdir, 'maxlength'), 'r') as fid:
        max_length = int(fid.read())

    reader = feature_reader.FeatureReader(
        scpfile=os.path.join(featdir, 'feats.scp'),
        cmvnfile=os.path.join(featdir, 'cmvn.scp'),
        utt2spkfile=os.path.join(featdir, 'utt2spk'),
        max_length=max_length)

    #read the feature dimension
    with open(os.path.join(database_cfg['train_dir'], feat_cfg['name'], 'dim'),
              'r') as fid:
        input_dim = int(fid.read())

    #create the coder
    with open(os.path.join(database_cfg['train_dir'], 'alphabet')) as fid:
        alphabet = fid.read().split(' ')
    coder = target_coder.TargetCoder(alphabet)

    #create the classifier
    if not nonsupervised:
        outputdim2 = 1
    else:
        if audio_used:
            outputdim2 = int(quant_cfg['quant_levels'])
        else: #then input features used
            outputdim2 = input_dim

    classifier = asr_factory.factory(
        conf=nnet_cfg,
        output_dim=(coder.num_labels, outputdim2))

    #create a decoder
    graph = tf.Graph()
    with graph.as_default():
        decoder = decoder_factory.factory(
            conf=decoder_cfg,
            classifier=classifier,
            input_dim=input_dim,
            max_input_length=reader.max_length,
            coder=coder,
            expdir=FLAGS.expdir)

        saver = tf.train.Saver(tf.trainable_variables())


    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    config.allow_soft_placement = True

    with tf.Session(graph=graph, config=config) as sess:
        #load the model
        saver.restore(sess, os.path.join(FLAGS.expdir, 'model', 'network.ckpt'))

        #decode with the neural net
        decoded = decoder.decode(reader, sess)

    #the path to the text file
    textfile = database_cfg['testtext']

    #read all the reference transcriptions
    with open(textfile) as fid:
        lines = fid.readlines()

    references = dict()
    for line in lines:
        splitline = line.strip().split(' ')
        references[splitline[0]] = coder.encode(' '.join(splitline[1:]))

    #compute the character error rate
    score = decoder.score(decoded, references)

    print 'score: %f' % score

    #write the resulting beams to disk
    decodedir = os.path.join(FLAGS.expdir, 'decoded')
    if not os.path.isdir(decodedir):
        os.makedirs(decodedir)
    for utt in decoded:
        with open(os.path.join(decodedir, utt), 'w') as fid:
            for hypothesis in decoded[utt]:
                fid.write('%f\t%s\n' % (hypothesis[0], hypothesis[1]))