def extract_feature(waveforms, params):
  '''Extract log-fbank features with delta-deltas and apply CMVN.

  waveforms: [batch, samples]
  '''
  p = params
  with tf.variable_scope('feature_extractor'):
    mel_fbanks = extract_logfbank_with_delta(waveforms, params)
    # shape: [1, nframes, nbins, nchannels]
    fbank_size = utils.shape_list(mel_fbanks)
    #assert fbank_size[0] == 1

    # This replaces CMVN estimation on data
    if not p.audio_global_cmvn:
      mean = tf.reduce_mean(mel_fbanks, keepdims=True, axis=1)
      variance = tf.reduce_mean(
          tf.square(mel_fbanks - mean), keepdims=True, axis=1)
    else:
      assert p.audio_cmvn_path, p.audio_cmvn_path
      mean, variance = utils.load_cmvn(p.audio_cmvn_path)

    var_epsilon = 1e-09
    mel_fbanks = utils.apply_cmvn(mel_fbanks, mean, variance, var_epsilon)

    # Later models like to flatten the two spatial dims. Instead, we add a
    # unit spatial dim and flatten the frequencies and channels.
    batch_size = fbank_size[0]
    feats = tf.concat([
        tf.reshape(
            mel_fbanks,
            [batch_size, fbank_size[1], fbank_size[2], fbank_size[3]]),
        tf.zeros(
            (batch_size, p.num_zeropad_frames, fbank_size[2], fbank_size[3]))
    ], 1)
  return feats  # shape: [batch_size, nframes, feature_size, channels]

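# A minimal sketch of what utils.apply_cmvn above is assumed to do: element-wise
# mean/variance normalization with the statistics broadcast over batch and time.
# `apply_cmvn_sketch` is a hypothetical name; the epsilon default and the exact
# broadcasting are assumptions, not the project's actual implementation.
def apply_cmvn_sketch(feats, mean, variance, epsilon=1e-9):
  '''feats: [batch, nframes, feat_size, channels]
     mean, variance: broadcastable to feats, e.g. [1, 1, feat_size, channels]'''
  return (feats - mean) * tf.rsqrt(variance + epsilon)
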
def preprocess(self, inputs):
  '''Speech preprocessing.'''
  with tf.variable_scope('feature'):
    if self.input_type == 'samples':
      # FIXME: stub
      feats = None
    else:
      if 'cmvn_type' in self.audioconf:
        cmvn_type = self.audioconf['cmvn_type']
      else:
        cmvn_type = 'global'
      logging.info('cmvn_type: %s' % (cmvn_type))

      if cmvn_type == 'global':
        self.mean, self.std = utils.load_cmvn(self.audioconf['cmvn_path'])
        feats = utils.apply_cmvn(inputs, self.mean, self.std)
      elif cmvn_type == 'local':
        feats = utils.apply_local_cmvn(inputs)
      elif cmvn_type == 'sliding':
        raise ValueError('cmvn_type %s not implemented yet.' % (cmvn_type))
      elif cmvn_type == 'none':
        feats = inputs
      else:
        raise ValueError('Unknown cmvn_type: %s.' % (cmvn_type))
  return feats

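# A minimal sketch of the 'local' branch above, assuming utils.apply_local_cmvn
# normalizes each utterance with statistics computed over its own frames
# (axis=1), mirroring the per-utterance CMVN in extract_feature.
# `apply_local_cmvn_sketch` and the epsilon value are assumptions.
def apply_local_cmvn_sketch(feats, epsilon=1e-9):
  '''feats: [batch, nframes, feat_size, channels]'''
  mean = tf.reduce_mean(feats, axis=1, keepdims=True)
  variance = tf.reduce_mean(tf.square(feats - mean), axis=1, keepdims=True)
  return (feats - mean) * tf.rsqrt(variance + epsilon)
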
def testLoadCmvn(self):  #pylint: disable=invalid-name
  '''test load cmvn'''
  np.random.seed(12)
  temp_dir = self.get_temp_dir()
  temp_file = os.path.join(temp_dir, 'cmvn.npy')

  feat_size = 40
  delta_deltas = True
  shape = [1, feat_size, 3 if delta_deltas else 1]
  mean = np.random.randn(*shape)
  var = np.random.randn(*shape)
  mean, var = mean.astype(np.float32), var.astype(np.float32)

  with tf.gfile.Open(temp_file, 'w') as f:  #pylint: disable=invalid-name
    np.save(f, (mean, var))

  mean_true = np.expand_dims(mean, axis=0)
  var_true = np.expand_dims(var, axis=0)

  with self.session(use_gpu=False, force_gpu=False):
    mean, var = utils.load_cmvn(temp_file)
    self.assertAllClose(mean.eval(), mean_true)
    self.assertAllClose(var.eval(), var_true)

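# A minimal sketch of utils.load_cmvn consistent with the test above: the .npy
# file stores a (mean, var) pair of shape [1, feat_size, channels], a leading
# batch dim is added, and both are returned as tensors (the test calls .eval()).
# `load_cmvn_sketch` is a hypothetical name, not the project's actual code.
def load_cmvn_sketch(path):
  # np.save stacks the (mean, var) pair, so it unpacks back into two arrays
  mean, var = np.load(path)
  mean = tf.convert_to_tensor(np.expand_dims(mean, axis=0))
  var = tf.convert_to_tensor(np.expand_dims(var, axis=0))
  return mean, var
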
def preprocess(self, inputs, input_text):
  '''Preprocess speech and text inputs.

  params:
    inputs: speech input
    input_text: text input
  '''
  with tf.variable_scope('feature'):
    if self.input_type == 'samples':
      # speech feature config
      self.hp = speech_params(
          sr=self.taskconf['audio']['sr'],
          bins=self.audioconf['feature_size'],
          dither=self.train,
          use_delta_deltas=self.audioconf['add_delta_deltas'],
          cmvn=self.audioconf['cmvn'],
          cmvn_path=self.audioconf['cmvn_path'])
      feats = extract_feature(inputs, params=self.hp)
    else:
      self.mean, self.std = utils.load_cmvn(self.audioconf['cmvn_path'])
      feats = utils.apply_cmvn(inputs, self.mean, self.std)
  return feats, input_text

def __init__(self, cmvn_path):
  super().__init__(name='cmvn', trainable=False)
  self.mean, self.std = utils.load_cmvn(cmvn_path)

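# A minimal sketch of how this layer's call() might apply the loaded statistics,
# assuming it simply delegates to utils.apply_cmvn as the preprocess() methods
# above do. The surrounding code names the second statistic `std` but passes it
# where extract_feature passes a variance, so treat this as an assumption.
def call(self, inputs):
  return utils.apply_cmvn(inputs, self.mean, self.std)
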
def main():
  data = dataset.make_dataset('train', config.train_path,
                              config.train_textgrid_path, FLAGS)
  train_data = dataloader.input_func(
      data, FLAGS.batch_size, is_train=True, num_epoch=1)

  data = dataset.make_dataset('dev', config.dev_path,
                              config.dev_textgrid_path, FLAGS)
  dev_data = dataloader.input_func(data, FLAGS.batch_size, is_train=False)

  # create model and optimizer
  step_counter = tf.train.get_or_create_global_step()
  model = model_lib.Emotion(drop_rate=0.1)
  lr = tf.train.exponential_decay(
      FLAGS.learning_rate, step_counter, 100, FLAGS.decay_rate, staircase=True)
  optimizer = tf.train.AdamOptimizer(lr)
  print('init lr', lr().numpy())

  # checkpoint dirs
  if FLAGS.checkpoint:
    train_dir = os.path.join(FLAGS.checkpoint, 'train')
    eval_dir = os.path.join(FLAGS.checkpoint, 'eval')
    tf.gfile.MakeDirs(FLAGS.checkpoint)
  else:
    train_dir = None
    eval_dir = None

  summary_writer = tf.contrib.summary.create_file_writer(
      train_dir, flush_millis=10000)
  eval_summary_writer = tf.contrib.summary.create_file_writer(
      eval_dir, flush_millis=10000, name='eval')

  # create and restore checkpoint (if one exists on the graph)
  checkpoint_prefix = os.path.join(FLAGS.checkpoint, 'ckpt')
  checkpoint = tf.train.Checkpoint(
      # model=model, optimizer=optimizer, learning_rate=lr, step_counter=step_counter)
      model=model,
      optimizer=optimizer,
      step_counter=step_counter)
  # restore variables on creation if a checkpoint exists.
  stats = checkpoint.restore(tf.train.latest_checkpoint(FLAGS.checkpoint))
  #stats.assert_consumed()
  print('now lr', lr().numpy())

  cmvn = utils.load_cmvn(FLAGS.cmvn_path)

  device = '/gpu:0' if tf.test.is_gpu_available() else '/cpu:0'
  print("Using device %s" % (device))

  with tf.device(device):
    for e in range(FLAGS.num_epochs):
      # train
      start = time.time()
      with summary_writer.as_default():
        train_one_epoch(model, optimizer, train_data, step_counter, cmvn,
                        log_interval=100)
      end = time.time()
      print(
          '\nTrain time for epoch #%d (%d total steps) (%f learning rate): %f'
          % (checkpoint.save_counter.numpy() + 1, step_counter.numpy(),
             lr().numpy(), end - start))

      if e == 0:
        print_vars(model)

      # eval
      with eval_summary_writer.as_default():
        eval(model, dev_data, cmvn)

      checkpoint.save(checkpoint_prefix)