def train_lm(clusterfile, job_name, task_index, ssh_command, expdir):
    '''
    does everything for language model training

    Args:
        clusterfile: the file where all the machines in the cluster are
            specified, if None, local training will be done
        job_name: one of ps or worker in the case of distributed training
        task_index: the task index in this job
        ssh_command: the command to use for ssh, if 'None' no tunnel will be
            created
        expdir: the experiments directory
    '''

    #read the database config file
    parsed_database_cfg = configparser.ConfigParser()
    parsed_database_cfg.read(expdir + '/database.cfg')
    database_cfg = dict(parsed_database_cfg.items('database'))

    #read the lm config file
    parsed_nnet_cfg = configparser.ConfigParser()
    parsed_nnet_cfg.read(expdir + '/model/lm.cfg')
    nnet_cfg = dict(parsed_nnet_cfg.items('lm'))

    #read the trainer config file
    parsed_trainer_cfg = configparser.ConfigParser()
    parsed_trainer_cfg.read(expdir + '/trainer.cfg')
    trainer_cfg = dict(parsed_trainer_cfg.items('trainer'))

    #read the decoder config file
    parsed_decoder_cfg = configparser.ConfigParser()
    parsed_decoder_cfg.read(expdir + '/model/decoder.cfg')
    decoder_cfg = dict(parsed_decoder_cfg.items('decoder'))

    #create the cluster and server
    server = create_server.create_server(
        clusterfile=clusterfile,
        job_name=job_name,
        task_index=task_index,
        expdir=expdir,
        ssh_command=ssh_command)

    #copy the alphabet to the model
    if (job_name == 'ps' and task_index == 0) or job_name == 'local':
        shutil.copyfile(os.path.join(database_cfg['train_dir'], 'alphabet'),
                        os.path.join(expdir, 'model', 'alphabet'))

    #the ps should just wait
    if job_name == 'ps':
        server.join()

    #create the coder
    with open(os.path.join(database_cfg['train_dir'], 'alphabet')) as fid:
        alphabet = fid.read().split(' ')
    coder = target_coder.TargetCoder(alphabet)

    #read the number of utterances
    with open(os.path.join(database_cfg['train_dir'], 'numlines')) as fid:
        num_utt = int(fid.read())

    #read the maximum length
    with open(os.path.join(database_cfg['train_dir'],
                           'max_num_chars')) as fid:
        max_length = int(fid.read())

    #create a batch dispenser for the training data
    dispenser = batchdispenser.LmBatchDispenser(
        target_coder=coder,
        size=int(trainer_cfg['batch_size']),
        textfile=os.path.join(database_cfg['train_dir'], 'text'),
        max_length=max_length,
        num_utt=num_utt)

    #create a reader for the validation data
    if 'dev_dir' in database_cfg:

        #read the maximum length
        with open(os.path.join(database_cfg['dev_dir'],
                               'max_num_chars')) as fid:
            max_length = int(fid.read())

        #create a text reader for the validation data
        val_reader = text_reader.TextReader(
            textfile=os.path.join(database_cfg['dev_dir'], 'text'),
            max_length=max_length,
            coder=coder)

        val_targets = val_reader.as_dict()

    else:
        if int(trainer_cfg['valid_utt']) > 0:
            val_dispenser = dispenser.split(int(trainer_cfg['valid_utt']))
            val_reader = val_dispenser.textreader
            val_targets = val_reader.as_dict()
        else:
            val_reader = None
            val_targets = None

    #encode the validation targets
    if val_targets is not None:
        for utt in val_targets:
            val_targets[utt] = dispenser.textreader.coder.encode(
                val_targets[utt])

    #create the classifier
    classifier = lm_factory.factory(
        conf=nnet_cfg,
        output_dim=coder.num_labels)

    #create the callable for the decoder
    decoder = partial(
        decoder_factory.factory,
        conf=decoder_cfg,
        classifier=classifier,
        input_dim=1,
        max_input_length=val_reader.max_length,
        coder=coder,
        expdir=expdir)

    #create the trainer
    tr = trainer_factory.factory(
        conf=trainer_cfg,
        decoder=decoder,
        classifier=classifier,
        input_dim=1,
        reconstruction_dim=1,
        dispenser=dispenser,
        val_reader=val_reader,
        val_targets=val_targets,
        expdir=expdir,
        server=server,
        task_index=task_index)

    #train the classifier
    tr.train()
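
# Example usage (hedged sketch, not part of the original code): a
# non-distributed run of train_lm. The expdir path below is hypothetical; it
# is assumed to already contain the database.cfg, trainer.cfg, model/lm.cfg
# and model/decoder.cfg files that train_lm reads above, and job_name 'local'
# is the non-distributed mode handled by the alphabet-copy check.
#
#   train_lm(clusterfile=None, job_name='local', task_index=0,
#            ssh_command='None', expdir='exp/lm_experiment')
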
def main(_):
    '''does everything for testing'''

    decoder_cfg_file = None

    #read the database config file
    parsed_database_cfg = configparser.ConfigParser()
    parsed_database_cfg.read(os.path.join(FLAGS.asr_expdir, 'database.cfg'))
    database_cfg = dict(parsed_database_cfg.items('database'))

    #read the features config file
    parsed_feat_cfg = configparser.ConfigParser()
    parsed_feat_cfg.read(
        os.path.join(FLAGS.asr_expdir, 'model', 'features.cfg'))
    feat_cfg = dict(parsed_feat_cfg.items('features'))

    #read the asr config file
    parsed_asr_cfg = configparser.ConfigParser()
    parsed_asr_cfg.read(os.path.join(FLAGS.asr_expdir, 'model', 'asr.cfg'))
    asr_cfg = dict(parsed_asr_cfg.items('asr'))

    #read the lm config file
    parsed_lm_cfg = configparser.ConfigParser()
    parsed_lm_cfg.read(os.path.join(FLAGS.lm_expdir, 'model', 'lm.cfg'))
    lm_cfg = dict(parsed_lm_cfg.items('lm'))

    #read the asr-lm config file
    parsed_asr_lm_cfg = configparser.ConfigParser()
    parsed_asr_lm_cfg.read('config/asr_lm.cfg')
    asr_lm_cfg = dict(parsed_asr_lm_cfg.items('asr-lm'))

    #read the decoder config file
    if decoder_cfg_file is None:
        decoder_cfg_file = os.path.join(FLAGS.asr_expdir, 'model',
                                        'decoder.cfg')
    parsed_decoder_cfg = configparser.ConfigParser()
    parsed_decoder_cfg.read(decoder_cfg_file)
    decoder_cfg = dict(parsed_decoder_cfg.items('decoder'))

    #create a feature reader
    featdir = os.path.join(database_cfg['test_dir'], feat_cfg['name'])

    with open(os.path.join(featdir, 'maxlength'), 'r') as fid:
        max_length = int(fid.read())

    reader = feature_reader.FeatureReader(
        scpfile=os.path.join(featdir, 'feats.scp'),
        cmvnfile=os.path.join(featdir, 'cmvn.scp'),
        utt2spkfile=os.path.join(featdir, 'utt2spk'),
        max_length=max_length)

    #read the feature dimension
    with open(os.path.join(database_cfg['train_dir'], feat_cfg['name'],
                           'dim'), 'r') as fid:
        input_dim = int(fid.read())

    #create the coder
    with open(os.path.join(database_cfg['train_dir'], 'alphabet')) as fid:
        alphabet = fid.read().split(' ')
    coder = target_coder.TargetCoder(alphabet)

    #create the classifier
    classifier = asr_lm_classifier.AsrLmClassifier(
        conf=asr_lm_cfg,
        asr_conf=asr_cfg,
        lm_conf=lm_cfg,
        output_dim=coder.num_labels)

    #create a decoder
    graph = tf.Graph()
    with graph.as_default():
        decoder = decoder_factory.factory(
            conf=decoder_cfg,
            classifier=classifier,
            input_dim=input_dim,
            max_input_length=reader.max_length,
            coder=coder,
            expdir=FLAGS.asr_expdir)

        #create the lm saver
        varnames = zip(*checkpoint_utils.list_variables(os.path.join(
            FLAGS.lm_expdir, 'model', 'network.ckpt')))[0]
        variables = [v for v in tf.all_variables()
                     if v.name.split(':')[0] in varnames]
        lm_saver = tf.train.Saver(variables)

        #create the asr saver
        varnames = zip(*checkpoint_utils.list_variables(os.path.join(
            FLAGS.asr_expdir, 'model', 'network.ckpt')))[0]
        variables = [v for v in tf.all_variables()
                     if v.name.split(':')[0] in varnames]
        asr_saver = tf.train.Saver(variables)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    config.allow_soft_placement = True

    with tf.Session(graph=graph, config=config) as sess:
        #load the lm model
        lm_saver.restore(
            sess, os.path.join(FLAGS.lm_expdir, 'model', 'network.ckpt'))

        #load the asr model
        asr_saver.restore(
            sess, os.path.join(FLAGS.asr_expdir, 'model', 'network.ckpt'))

        #decode with the neural net
        decoded = decoder.decode(reader, sess)

    #the path to the text file
    textfile = database_cfg['testtext']

    #read all the reference transcriptions
    with open(textfile) as fid:
        lines = fid.readlines()

    references = dict()
    for line in lines:
        splitline = line.strip().split(' ')
        references[splitline[0]] = ' '.join(splitline[1:])

    #compute the character error rate
    score = decoder.score(decoded, references)

    print 'score: %f' % score
def main(_):
    '''does everything for testing'''

    decoder_cfg_file = 'config/decoder/attention_visualizer.cfg'

    #read the database config file
    parsed_database_cfg = configparser.ConfigParser()
    parsed_database_cfg.read(os.path.join(FLAGS.expdir, 'database.cfg'))
    database_cfg = dict(parsed_database_cfg.items('database'))

    #read the features config file
    parsed_feat_cfg = configparser.ConfigParser()
    parsed_feat_cfg.read(os.path.join(FLAGS.expdir, 'model', 'features.cfg'))
    feat_cfg = dict(parsed_feat_cfg.items('features'))

    #read the asr config file
    parsed_nnet_cfg = configparser.ConfigParser()
    parsed_nnet_cfg.read(os.path.join(FLAGS.expdir, 'model', 'asr.cfg'))
    nnet_cfg = dict(parsed_nnet_cfg.items('asr'))

    #read the decoder config file
    if decoder_cfg_file is None:
        decoder_cfg_file = os.path.join(FLAGS.expdir, 'model', 'decoder.cfg')
    parsed_decoder_cfg = configparser.ConfigParser()
    parsed_decoder_cfg.read(decoder_cfg_file)
    decoder_cfg = dict(parsed_decoder_cfg.items('decoder'))

    #create a feature reader
    featdir = os.path.join(database_cfg['test_dir'], feat_cfg['name'])

    with open(os.path.join(featdir, 'maxlength'), 'r') as fid:
        max_length = int(fid.read())

    reader = feature_reader.FeatureReader(
        scpfile=os.path.join(featdir, 'feats.scp'),
        cmvnfile=os.path.join(featdir, 'cmvn.scp'),
        utt2spkfile=os.path.join(featdir, 'utt2spk'),
        max_length=max_length)

    #read the feature dimension
    with open(os.path.join(database_cfg['train_dir'], feat_cfg['name'],
                           'dim'), 'r') as fid:
        input_dim = int(fid.read())

    #create the coder
    with open(os.path.join(database_cfg['train_dir'], 'alphabet')) as fid:
        alphabet = fid.read().split(' ')
    coder = target_coder.TargetCoder(alphabet)

    #create the classifier
    classifier = asr_factory.factory(
        conf=nnet_cfg,
        output_dim=coder.num_labels)

    #create a decoder
    graph = tf.Graph()
    with graph.as_default():
        decoder = decoder_factory.factory(
            conf=decoder_cfg,
            classifier=classifier,
            input_dim=input_dim,
            max_input_length=reader.max_length,
            coder=coder,
            expdir=FLAGS.expdir)

        saver = tf.train.Saver(tf.trainable_variables())

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    config.allow_soft_placement = True

    with tf.Session(graph=graph, config=config) as sess:
        #load the model
        saver.restore(sess, os.path.join(FLAGS.expdir, 'model',
                                         'network.ckpt'))

        #decode with the neural net
        decoded = decoder.decode(reader, sess)

    #the path to the text file
    textfile = os.path.join(database_cfg['test_dir'], 'targets')

    #read all the reference transcriptions
    with open(textfile) as fid:
        lines = fid.readlines()

    references = dict()
    for line in lines:
        splitline = line.strip().split(' ')
        references[splitline[0]] = coder.encode(' '.join(splitline[1:]))

    #compute the character error rate
    score = decoder.score(decoded, references)

    print 'score: %f' % score

    #write the resulting beams to disk
    decodedir = os.path.join(FLAGS.expdir, 'decoded')
    if not os.path.isdir(decodedir):
        os.makedirs(decodedir)

    for utt in decoded:
        with open(os.path.join(decodedir, utt), 'w') as fid:
            for hypothesis in decoded[utt]:
                fid.write('%f\t%s\n' % (hypothesis[0], hypothesis[1]))
def main(_):
    '''does everything for testing'''

    decoder_cfg_file = None

    #read the database config file
    parsed_database_cfg = configparser.ConfigParser()
    parsed_database_cfg.read(os.path.join(FLAGS.expdir, 'database.cfg'))
    database_cfg = dict(parsed_database_cfg.items('database'))

    #read the lm config file
    parsed_nnet_cfg = configparser.ConfigParser()
    parsed_nnet_cfg.read(os.path.join(FLAGS.expdir, 'model', 'lm.cfg'))
    nnet_cfg = dict(parsed_nnet_cfg.items('lm'))

    #read the decoder config file
    if decoder_cfg_file is None:
        decoder_cfg_file = os.path.join(FLAGS.expdir, 'model', 'decoder.cfg')
    parsed_decoder_cfg = configparser.ConfigParser()
    parsed_decoder_cfg.read(decoder_cfg_file)
    decoder_cfg = dict(parsed_decoder_cfg.items('decoder'))

    #create the coder
    with open(os.path.join(FLAGS.expdir, 'model', 'alphabet')) as fid:
        alphabet = fid.read().split(' ')
    coder = target_coder.TargetCoder(alphabet)

    #read the maximum length
    with open(os.path.join(database_cfg['test_dir'],
                           'max_num_chars')) as fid:
        max_length = int(fid.read())

    #create a text reader
    textreader = text_reader.TextReader(
        textfile=os.path.join(database_cfg['test_dir'], 'text'),
        max_length=max_length,
        coder=coder)

    #create the classifier
    classifier = lm_factory.factory(
        conf=nnet_cfg,
        output_dim=coder.num_labels)

    #create a decoder
    graph = tf.Graph()
    with graph.as_default():
        decoder = decoder_factory.factory(
            conf=decoder_cfg,
            classifier=classifier,
            input_dim=1,
            max_input_length=max_length,
            coder=coder,
            expdir=FLAGS.expdir)

        saver = tf.train.Saver(tf.trainable_variables())

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    config.allow_soft_placement = True

    with tf.Session(graph=graph, config=config) as sess:
        #load the model
        saver.restore(sess, os.path.join(FLAGS.expdir, 'model',
                                         'network.ckpt'))

        #decode with the neural net
        decoded = decoder.decode(textreader, sess)

    #compute the perplexity
    score = decoder.score(decoded, None)

    print 'perplexity: %f' % score
def train_asr(clusterfile, job_name, task_index, ssh_command, expdir):
    '''
    does everything for asr training

    Args:
        clusterfile: the file where all the machines in the cluster are
            specified, if None, local training will be done
        job_name: one of ps or worker in the case of distributed training
        task_index: the task index in this job
        ssh_command: the command to use for ssh, if 'None' no tunnel will be
            created
        expdir: the experiments directory
    '''

    #read the database config file
    parsed_database_cfg = configparser.ConfigParser()
    parsed_database_cfg.read(os.path.join(expdir, 'database.cfg'))
    database_cfg = dict(parsed_database_cfg.items('database'))

    #read the features config file
    parsed_feat_cfg = configparser.ConfigParser()
    parsed_feat_cfg.read(os.path.join(expdir, 'model', 'features.cfg'))
    feat_cfg = dict(parsed_feat_cfg.items('features'))

    #read the asr config file
    parsed_nnet_cfg = configparser.ConfigParser()
    parsed_nnet_cfg.read(os.path.join(expdir, 'model', 'asr.cfg'))
    nnet_cfg = dict(parsed_nnet_cfg.items('asr'))

    #read the trainer config file
    parsed_trainer_cfg = configparser.ConfigParser()
    parsed_trainer_cfg.read(os.path.join(expdir, 'trainer.cfg'))
    trainer_cfg = dict(parsed_trainer_cfg.items('trainer'))

    #read the decoder config file
    parsed_decoder_cfg = configparser.ConfigParser()
    parsed_decoder_cfg.read(os.path.join(expdir, 'model', 'decoder.cfg'))
    decoder_cfg = dict(parsed_decoder_cfg.items('decoder'))

    #create the cluster and server
    server = create_server.create_server(
        clusterfile=clusterfile,
        job_name=job_name,
        task_index=task_index,
        expdir=expdir,
        ssh_command=ssh_command)

    #the ps should just wait
    if job_name == 'ps':
        server.join()

    featdir = os.path.join(database_cfg['train_dir'], feat_cfg['name'])

    #create the coder
    with open(os.path.join(database_cfg['train_dir'], 'alphabet')) as fid:
        alphabet = fid.read().split(' ')
    coder = target_coder.TargetCoder(alphabet)

    #create a feature reader for the training data
    with open(featdir + '/maxlength', 'r') as fid:
        max_length = int(fid.read())

    featreader = feature_reader.FeatureReader(
        scpfile=featdir + '/feats_shuffled.scp',
        cmvnfile=featdir + '/cmvn.scp',
        utt2spkfile=featdir + '/utt2spk',
        max_length=max_length)

    #read the feature dimension
    with open(featdir + '/dim', 'r') as fid:
        input_dim = int(fid.read())

    #the path to the text file
    textfile = os.path.join(database_cfg['train_dir'], 'targets')

    #create a batch dispenser for the training data
    dispenser = batchdispenser.AsrBatchDispenser(
        feature_reader=featreader,
        target_coder=coder,
        size=int(trainer_cfg['batch_size']),
        target_path=textfile)

    #create a reader for the validation data
    if 'dev_data' in database_cfg:
        featdir = database_cfg['dev_dir'] + '/' + feat_cfg['name']

        with open(featdir + '/maxlength', 'r') as fid:
            max_length = int(fid.read())

        val_reader = feature_reader.FeatureReader(
            scpfile=featdir + '/feats.scp',
            cmvnfile=featdir + '/cmvn.scp',
            utt2spkfile=featdir + '/utt2spk',
            max_length=max_length)

        textfile = os.path.join(database_cfg['dev_dir'], 'targets')

        #read the validation targets
        with open(textfile) as fid:
            lines = fid.readlines()

        val_targets = dict()
        for line in lines:
            splitline = line.strip().split(' ')
            val_targets[splitline[0]] = ' '.join(splitline[1:])

    else:
        if int(trainer_cfg['valid_utt']) > 0:
            val_dispenser = dispenser.split(int(trainer_cfg['valid_utt']))
            val_reader = val_dispenser.feature_reader
            val_targets = val_dispenser.target_dict
        else:
            val_reader = None
            val_targets = None

    #encode the validation targets
    if val_targets is not None:
        for utt in val_targets:
            val_targets[utt] = dispenser.target_coder.encode(
                val_targets[utt])

    #create the classifier
    classifier = asr_factory.factory(
        conf=nnet_cfg,
        output_dim=coder.num_labels)

    #create the callable for the decoder
    decoder = partial(
        decoder_factory.factory,
        conf=decoder_cfg,
        classifier=classifier,
        input_dim=input_dim,
        max_input_length=val_reader.max_length,
        coder=coder,
        expdir=expdir)

    #create the trainer
    tr = trainer_factory.factory(
        conf=trainer_cfg,
        decoder=decoder,
        classifier=classifier,
        input_dim=input_dim,
        dispenser=dispenser,
        val_reader=val_reader,
        val_targets=val_targets,
        expdir=expdir,
        server=server,
        task_index=task_index)

    print 'starting training'

    #train the classifier
    tr.train()
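
# For reference (assumption, not part of the original code): the trainer.cfg
# read by train_asr above needs at least a [trainer] section containing the
# keys used in this function; the values below are only illustrative.
#
#   [trainer]
#   batch_size = 32
#   valid_utt = 100
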
def train_asr(clusterfile, job_name, task_index, ssh_command, expdir):
    '''
    does everything for asr training

    Args:
        clusterfile: the file where all the machines in the cluster are
            specified, if None, local training will be done
        job_name: one of ps or worker in the case of distributed training
        task_index: the task index in this job
        ssh_command: the command to use for ssh, if 'None' no tunnel will be
            created
        expdir: the experiments directory
    '''

    #read the database config file
    parsed_database_cfg = configparser.ConfigParser()
    parsed_database_cfg.read(os.path.join(expdir, 'database.cfg'))
    database_cfg = dict(parsed_database_cfg.items('database'))

    #read the features config file
    parsed_feat_cfg = configparser.ConfigParser()
    parsed_feat_cfg.read(os.path.join(expdir, 'model', 'features.cfg'))
    feat_cfg = dict(parsed_feat_cfg.items('features'))

    #read the asr config file
    parsed_nnet_cfg = configparser.ConfigParser()
    parsed_nnet_cfg.read(os.path.join(expdir, 'model', 'asr.cfg'))
    nnet_cfg = dict(parsed_nnet_cfg.items('asr'))

    #read the trainer config file
    parsed_trainer_cfg = configparser.ConfigParser()
    parsed_trainer_cfg.read(os.path.join(expdir, 'trainer.cfg'))
    trainer_cfg = dict(parsed_trainer_cfg.items('trainer'))

    #read the decoder config file
    parsed_decoder_cfg = configparser.ConfigParser()
    parsed_decoder_cfg.read(os.path.join(expdir, 'model', 'decoder.cfg'))
    decoder_cfg = dict(parsed_decoder_cfg.items('decoder'))

    #distinguish between the three implemented training modes
    if database_cfg['train_mode'] == 'supervised':
        nonsupervised = False
    elif (database_cfg['train_mode'] == 'nonsupervised' or
          database_cfg['train_mode'] == 'semisupervised'):
        nonsupervised = True
    else:
        raise Exception('Wrong kind of training mode')

    #when (partly) nonsupervised, check which features are used for the
    #reconstruction; currently two options are implemented
    if nonsupervised:
        if trainer_cfg['reconstruction_features'] == 'audio_samples':
            audio_used = True
        elif trainer_cfg['reconstruction_features'] == 'input_features':
            audio_used = False
        else:
            raise Exception(
                'Unknown specification for the reconstruction features')

    #read the quantization config file if nonsupervised training and audio
    #samples are used
    if nonsupervised:
        if audio_used:
            parsed_quant_cfg = configparser.ConfigParser()
            parsed_quant_cfg.read(os.path.join(expdir, 'model',
                                               'quantization.cfg'))
            quant_cfg = dict(parsed_quant_cfg.items('features'))

    #based on the other settings, compute and overwrite samples_per_hlfeature
    #and unpredictable_samples in the classifier config dictionary
    if nonsupervised:
        if audio_used:
            rate_after_quant = int(quant_cfg['quant_rate'])
            win_length = float(feat_cfg['winlen'])
            win_shift = float(feat_cfg['winstep'])
            samples_one_window = int(win_length*rate_after_quant)
            samples_one_shift = int(win_shift*rate_after_quant)

            #### THIS IS ONLY RELEVANT WHEN USING A LISTENER WITH A PYRAMIDAL
            #### STRUCTURE, and this line should be adapted otherwise
            time_compression = 2**int(nnet_cfg['listener_numlayers'])

            #store values in config dictionary
            nnet_cfg['samples_per_hlfeature'] = (samples_one_shift
                                                 *time_compression)
            nnet_cfg['unpredictable_samples'] = (
                (samples_one_window
                 + (time_compression-1)*samples_one_shift)
                - nnet_cfg['samples_per_hlfeature'])

    #create the cluster and server
    server = create_server.create_server(
        clusterfile=clusterfile,
        job_name=job_name,
        task_index=task_index,
        expdir=expdir,
        ssh_command=ssh_command)

    #the ps should just wait
    if job_name == 'ps':
        server.join()

    #path to where the training samples are stored
    featdir = os.path.join(database_cfg['train_dir'], feat_cfg['name'])

    #create the coder
    with open(os.path.join(database_cfg['train_dir'], 'alphabet')) as fid:
        alphabet = fid.read().split(' ')
    coder = target_coder.TargetCoder(alphabet)

    #create a feature reader for the training data
    with open(featdir + '/maxlength', 'r') as fid:
        max_length = int(fid.read())

    featreader = feature_reader.FeatureReader(
        scpfile=featdir + '/feats_shuffled.scp',
        cmvnfile=featdir + '/cmvn.scp',
        utt2spkfile=featdir + '/utt2spk',
        max_length=max_length)

    #read the feature dimension
    with open(featdir + '/dim', 'r') as fid:
        input_dim = int(fid.read())

    #the path to the text file
    textfile = os.path.join(database_cfg['train_dir'], 'targets')

    #if nonsupervised and audio is used, the samples also need to be read;
    #this is done with a second feature reader
    if nonsupervised:
        if audio_used:
            featdir2 = os.path.join(database_cfg['train_dir'],
                                    quant_cfg['name'])

            with open(featdir2 + '/maxlength', 'r') as fid:
                max_length_audio = int(fid.read())

            audioreader = feature_reader.FeatureReader(
                scpfile=featdir2 + '/feats_shuffled.scp',
                cmvnfile=None,
                utt2spkfile=None,
                max_length=max_length_audio)

    ## create a batch dispenser, depending on which situation we're in
    if not nonsupervised:
        # in the normal supervised training mode, a regular dispenser is needed
        if 'las_ignoring_mode' in trainer_cfg:
            if trainer_cfg['las_ignoring_mode'] == 'True':
                # if we ignore unlabeled examples
                dispenser = batchdispenser.AsrTextBatchDispenser(
                    feature_reader=featreader,
                    target_coder=coder,
                    size=int(trainer_cfg['batch_size']),
                    target_path=textfile)

            elif trainer_cfg['las_ignoring_mode'] == 'False':
                # if we choose to process the unlabeled examples
                if 'fixed_ratio' in trainer_cfg:
                    if trainer_cfg['fixed_ratio'] == 'True':
                        # if we choose to process batches with a fixed
                        # labeled/unlabeled ratio
                        dispenser = \
                            batchdispenser.AsrTextBatchDispenserAltFixRatio(
                                feature_reader=featreader,
                                target_coder=coder,
                                size=int(trainer_cfg['batch_size']),
                                target_path=textfile,
                                percentage_unlabeled=1-float(
                                    database_cfg['part_labeled']))

                    elif trainer_cfg['fixed_ratio'] == 'False':
                        # if the fixed ratio is not used
                        dispenser = batchdispenser.AsrTextBatchDispenserAlt(
                            feature_reader=featreader,
                            target_coder=coder,
                            size=int(trainer_cfg['batch_size']),
                            target_path=textfile)

                    else:
                        raise Exception('wrong information in fixed_ratio var')

                else:
                    # if fixed ratio is not specified, do without it
                    dispenser = batchdispenser.AsrTextBatchDispenserAlt(
                        feature_reader=featreader,
                        target_coder=coder,
                        size=int(trainer_cfg['batch_size']),
                        target_path=textfile)

            else:
                raise Exception('wrong information in las_ignoring_mode var')

        else:
            # if nothing is specified about ignoring, ignore the unlabeled
            # examples
            dispenser = batchdispenser.AsrTextBatchDispenser(
                feature_reader=featreader,
                target_coder=coder,
                size=int(trainer_cfg['batch_size']),
                target_path=textfile)

    else:
        # when doing (partly) nonsupervised training, extra reconstruction
        # features are needed
        if audio_used:
            # when the audio is the reconstruction feature
            if 'fixed_ratio' in trainer_cfg:
                if trainer_cfg['fixed_ratio'] == 'True':
                    # if specified to work with fixed labeled/unlabeled ratio
                    # batches
                    dispenser = \
                        batchdispenser.AsrTextAndAudioBatchDispenserFixRatio(
                            feature_reader=featreader,
                            audio_reader=audioreader,
                            target_coder=coder,
                            size=int(trainer_cfg['batch_size']),
                            target_path=textfile,
                            percentage_unlabeled=1-float(
                                database_cfg['part_labeled']))

                elif trainer_cfg['fixed_ratio'] == 'False':
                    # if specified to not use the fixed ratio
                    dispenser = batchdispenser.AsrTextAndAudioBatchDispenser(
                        feature_reader=featreader,
                        audio_reader=audioreader,
                        target_coder=coder,
                        size=int(trainer_cfg['batch_size']),
                        target_path=textfile)

                else:
                    raise Exception('wrong information in fixed_ratio var')

            else:
                # without specification, suppose no fixed ratio batches
                dispenser = batchdispenser.AsrTextAndAudioBatchDispenser(
                    feature_reader=featreader,
                    audio_reader=audioreader,
                    target_coder=coder,
                    size=int(trainer_cfg['batch_size']),
                    target_path=textfile)

        else:
            # if no audio is used, the input features are used
            if 'fixed_ratio' in trainer_cfg:
                if trainer_cfg['fixed_ratio'] == 'True':
                    # if specified to work with fixed labeled/unlabeled ratio
                    # batches
                    dispenser = \
                        batchdispenser.AsrTextAndFeatBatchDispenserFixRatio(
                            feature_reader=featreader,
                            target_coder=coder,
                            size=int(trainer_cfg['batch_size']),
                            target_path=textfile,
                            percentage_unlabeled=1-float(
                                database_cfg['part_labeled']))

                elif trainer_cfg['fixed_ratio'] == 'False':
                    # if specified to not use the fixed ratio
                    dispenser = batchdispenser.AsrTextAndFeatBatchDispenser(
                        feature_reader=featreader,
                        target_coder=coder,
                        size=int(trainer_cfg['batch_size']),
                        target_path=textfile)

                else:
                    raise Exception('wrong information in fixed_ratio var')

            else:
                # without specification, suppose no fixed ratio batches
                dispenser = batchdispenser.AsrTextAndFeatBatchDispenser(
                    feature_reader=featreader,
                    target_coder=coder,
                    size=int(trainer_cfg['batch_size']),
                    target_path=textfile)

    # read the validation data. Text targets are only important for the
    # validation data; if training is purely nonsupervised, validation is
    # done on the reconstructed features
    if 'dev_data' in database_cfg:
        # create a reader for the validation inputs
        featdir = database_cfg['dev_dir'] + '/' + feat_cfg['name']

        with open(featdir + '/maxlength', 'r') as fid:
            max_length = int(fid.read())

        val_reader = feature_reader.FeatureReader(
            scpfile=featdir + '/feats.scp',
            cmvnfile=featdir + '/cmvn.scp',
            utt2spkfile=featdir + '/utt2spk',
            max_length=max_length)

        textfile = os.path.join(database_cfg['dev_dir'], 'targets')

        #read the validation text targets
        with open(textfile) as fid:
            lines = fid.readlines()

        val_text_targets = dict()
        for line in lines:
            splitline = line.strip().split(' ')
            val_text_targets[splitline[0]] = ' '.join(splitline[1:])

        if nonsupervised:
            #also store the reconstruction targets
            val_rec_targets = dict()
            if audio_used:
                audiodir = database_cfg['dev_dir'] + '/' + quant_cfg['name']

                with open(audiodir + '/maxlength', 'r') as fid:
                    max_length_audio = int(fid.read())

                val_audio_reader = feature_reader.FeatureReader(
                    scpfile=audiodir + '/feats.scp',
                    cmvnfile=None,
                    utt2spkfile=audiodir + '/utt2spk',
                    max_length=max_length_audio)

                for _ in range(val_audio_reader.num_utt):
                    utt_id, audio, _ = val_audio_reader.get_utt()
                    val_rec_targets[utt_id] = audio

            else:
                #input features are used
                for _ in range(val_reader.num_utt):
                    utt_id, feat, _ = val_reader.get_utt()
                    val_rec_targets[utt_id] = feat

        else:
            with open(textfile) as fid:
                lines = fid.readlines()

            val_rec_targets = dict()
            for line in lines:
                splitline = line.strip().split(' ')
                val_rec_targets[splitline[0]] = None

        val_targets = dict()
        for utt_id in val_text_targets:
            val_targets[utt_id] = (val_text_targets[utt_id],
                                   val_rec_targets[utt_id])

    else:
        if int(trainer_cfg['valid_utt']) > 0:
            val_dispenser = dispenser.split(int(trainer_cfg['valid_utt']))
            val_reader = val_dispenser.feature_reader
            val_targets = val_dispenser.target_dict
        else:
            val_reader = None
            val_targets = None

    #encode the validation targets
    if val_targets is not None:
        for utt in val_targets:
            val_targets[utt] = (dispenser.target_coder.encode(
                val_targets[utt][0]), val_targets[utt][1])

    #create the classifier
    if nonsupervised:
        if audio_used:
            output_dim_second_el = int(quant_cfg['quant_levels'])
        else:
            #input features used
            output_dim_second_el = input_dim
    else:
        #only supervised training
        output_dim_second_el = None

    classifier = asr_factory.factory(
        conf=nnet_cfg,
        output_dim=(coder.num_labels, output_dim_second_el))

    #create the callable for the decoder
    decoder = partial(
        decoder_factory.factory,
        conf=decoder_cfg,
        classifier=classifier,
        input_dim=input_dim,
        max_input_length=val_reader.max_length,
        coder=coder,
        expdir=expdir)

    #create the trainer
    if nonsupervised:
        if audio_used:
            reconstruction_dim = 1
        else:
            reconstruction_dim = input_dim
    else:
        reconstruction_dim = 1

    tr = trainer_factory.factory(
        conf=trainer_cfg,
        decoder=decoder,
        classifier=classifier,
        input_dim=input_dim,
        reconstruction_dim=reconstruction_dim,
        dispenser=dispenser,
        val_reader=val_reader,
        val_targets=val_targets,
        expdir=expdir,
        server=server,
        task_index=task_index)

    print 'starting training'

    #train the classifier
    tr.train()
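
# Worked example (hedged, assumed values) for the samples_per_hlfeature and
# unpredictable_samples computation in train_asr above: with winlen = 0.025 s,
# winstep = 0.01 s, quant_rate = 16000 Hz and listener_numlayers = 3, a
# pyramidal listener compresses time by 2**3 = 8, so
#   samples_one_window    = int(0.025 * 16000)     = 400
#   samples_one_shift     = int(0.010 * 16000)     = 160
#   samples_per_hlfeature = 160 * 8                = 1280
#   unpredictable_samples = (400 + 7 * 160) - 1280 = 240
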
def main(_):
    '''does everything for testing'''

    #decoder_cfg_file = 'config/decoder/attention_visualizer.cfg'
    decoder_cfg_file = None

    #read the database config file
    parsed_database_cfg = configparser.ConfigParser()
    parsed_database_cfg.read(os.path.join(FLAGS.expdir, 'database.cfg'))
    database_cfg = dict(parsed_database_cfg.items('database'))

    #check the training mode
    if database_cfg['train_mode'] == 'supervised':
        nonsupervised = False
    elif database_cfg['train_mode'] == 'semisupervised':
        nonsupervised = True
    elif database_cfg['train_mode'] == 'nonsupervised':
        raise Exception('Purely nonsupervised models should be tested with '
                        'the test_reconstruction file.')
    else:
        raise Exception('Wrong kind of training mode')

    #read the features config file
    parsed_feat_cfg = configparser.ConfigParser()
    parsed_feat_cfg.read(os.path.join(FLAGS.expdir, 'model', 'features.cfg'))
    feat_cfg = dict(parsed_feat_cfg.items('features'))

    #read the asr config file
    parsed_nnet_cfg = configparser.ConfigParser()
    parsed_nnet_cfg.read(os.path.join(FLAGS.expdir, 'model', 'asr.cfg'))
    nnet_cfg = dict(parsed_nnet_cfg.items('asr'))

    #read the trainer config file
    parsed_trainer_cfg = configparser.ConfigParser()
    parsed_trainer_cfg.read(os.path.join(FLAGS.expdir, 'trainer.cfg'))
    trainer_cfg = dict(parsed_trainer_cfg.items('trainer'))

    #read the decoder config file
    if decoder_cfg_file is None:
        decoder_cfg_file = os.path.join(FLAGS.expdir, 'model', 'decoder.cfg')
    parsed_decoder_cfg = configparser.ConfigParser()
    parsed_decoder_cfg.read(decoder_cfg_file)
    decoder_cfg = dict(parsed_decoder_cfg.items('decoder'))

    #if (partly) nonsupervised, check which reconstruction features are used;
    #for now two options are implemented
    if nonsupervised:
        if trainer_cfg['reconstruction_features'] == 'audio_samples':
            audio_used = True
        else:
            audio_used = False

    if nonsupervised:
        if audio_used:
            #read the quantization config file if necessary
            parsed_quant_cfg = configparser.ConfigParser()
            parsed_quant_cfg.read(os.path.join(FLAGS.expdir, 'model',
                                               'quantization.cfg'))
            quant_cfg = dict(parsed_quant_cfg.items('features'))

    #create a feature reader
    featdir = os.path.join(database_cfg['test_dir'], feat_cfg['name'])

    with open(os.path.join(featdir, 'maxlength'), 'r') as fid:
        max_length = int(fid.read())

    reader = feature_reader.FeatureReader(
        scpfile=os.path.join(featdir, 'feats.scp'),
        cmvnfile=os.path.join(featdir, 'cmvn.scp'),
        utt2spkfile=os.path.join(featdir, 'utt2spk'),
        max_length=max_length)

    #read the feature dimension
    with open(os.path.join(database_cfg['train_dir'], feat_cfg['name'],
                           'dim'), 'r') as fid:
        input_dim = int(fid.read())

    #create the coder
    with open(os.path.join(database_cfg['train_dir'], 'alphabet')) as fid:
        alphabet = fid.read().split(' ')
    coder = target_coder.TargetCoder(alphabet)

    #create the classifier
    if not nonsupervised:
        outputdim2 = 1
    else:
        if audio_used:
            outputdim2 = int(quant_cfg['quant_levels'])
        else:
            #then the input features are used
            outputdim2 = input_dim

    classifier = asr_factory.factory(
        conf=nnet_cfg,
        output_dim=(coder.num_labels, outputdim2))

    #create a decoder
    graph = tf.Graph()
    with graph.as_default():
        decoder = decoder_factory.factory(
            conf=decoder_cfg,
            classifier=classifier,
            input_dim=input_dim,
            max_input_length=reader.max_length,
            coder=coder,
            expdir=FLAGS.expdir)

        saver = tf.train.Saver(tf.trainable_variables())

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True #pylint: disable=E1101
    config.allow_soft_placement = True

    with tf.Session(graph=graph, config=config) as sess:
        #load the model
        saver.restore(sess, os.path.join(FLAGS.expdir, 'model',
                                         'network.ckpt'))

        #decode with the neural net
        decoded = decoder.decode(reader, sess)

    #the path to the text file
    textfile = database_cfg['testtext']

    #read all the reference transcriptions
    with open(textfile) as fid:
        lines = fid.readlines()

    references = dict()
    for line in lines:
        splitline = line.strip().split(' ')
        references[splitline[0]] = coder.encode(' '.join(splitline[1:]))

    #compute the character error rate
    score = decoder.score(decoded, references)

    print 'score: %f' % score

    #write the resulting beams to disk
    decodedir = os.path.join(FLAGS.expdir, 'decoded')
    if not os.path.isdir(decodedir):
        os.makedirs(decodedir)

    for utt in decoded:
        with open(os.path.join(decodedir, utt), 'w') as fid:
            for hypothesis in decoded[utt]:
                fid.write('%f\t%s\n' % (hypothesis[0], hypothesis[1]))
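
# Note (assumption about the output format, derived from the write loop
# above): each file in the 'decoded' directory contains one tab-separated
# line per beam hypothesis, <score>\t<decoded text>, e.g. a hypothetical
# line could look like:
#
#   -12.345678	the quick brown fox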