def _CreateAsrFeatures():
  """Featurizes this shard's audio from the input tarball and writes examples.

  Transcripts are read with `_ReadTranscriptionsFromCSV`. The gzip tarball at
  `FLAGS.input_tarball` is then scanned; every member whose name ends in
  `.flac`, `.wav` or `.mp3` is counted, and members whose running count `n`
  satisfies `n % FLAGS.num_shards == FLAGS.shard_id` are decoded to WAV,
  passed through `audio_lib.ExtractLogMelFeatures`, turned into an example by
  `_MakeTfExample`, and written (serialized) to a writer picked by
  `_SelectRandomShard`.

  Members that fail decoding or featurization are logged, removed from the
  transcript map, and skipped.

  Raises:
    AssertionError: if a successfully featurized member has no transcript
      entry keyed by its full tar member name.
  """
  audio_suffixes = ('.flac', '.wav', '.mp3')

  # First pass: extract transcriptions.
  # NOTE(review): the cached-transcript branch was hard-disabled (`if False`),
  # so transcripts are always re-read from the CSV.
  tf.logging.info('Running first pass on the fly')
  trans = _ReadTranscriptionsFromCSV()
  total_utts = len(trans)
  tf.logging.info('Total transcripts: %d', total_utts)

  tf_bytes = tf.placeholder(dtype=tf.string)
  log_mel = audio_lib.ExtractLogMelFeatures(tf_bytes)

  tfconf = tf.config_pb2.ConfigProto()
  tfconf.gpu_options.allow_growth = True

  # Second pass: transcode the audio. The tarball, its underlying file and the
  # output writers are released even if an utterance raises midway.
  n = 0
  with tf.io.gfile.GFile(FLAGS.input_tarball, mode='rb') as file_obj:
    with tarfile.open(fileobj=file_obj, mode='r:gz') as tar:
      recordio_writers = _OpenSubShards()
      try:
        with tf.Session(config=tfconf) as sess:
          for tarinfo in tar:
            # We can actually decode essentially any audio format, but we
            # want to avoid non-audio data. Thus, this condition.
            if not tarinfo.name.endswith(audio_suffixes):
              continue
            n += 1
            if n % FLAGS.num_shards != FLAGS.shard_id:
              continue

            fmt = tarinfo.name.split('.')[-1]
            # The full member name (including directories and extension) is
            # the utterance id used to look up the transcript.
            uttid = tarinfo.name
            with tar.extractfile(tarinfo) as f:
              audio_bytes = f.read()

            try:
              wav_bytes = audio_lib.DecodeToWav(audio_bytes, fmt)
              frames = sess.run(log_mel, feed_dict={tf_bytes: wav_bytes})
            except Exception as e:  # Best effort: skip undecodable audio.
              # Default of None: the utterance may have no transcript at all,
              # and a KeyError here would abort the whole run.
              trans.pop(uttid, None)
              tf.logging.info('%s FAILED featurization: %r', uttid, e)
              continue

            assert uttid in trans, uttid
            num_words = len(trans[uttid])
            tf.logging.info('utt[%d]: %s [%d frames, %d chars]', n, uttid,
                            frames.shape[1], num_words)
            ex = _MakeTfExample(uttid, frames, trans[uttid])
            outf = _SelectRandomShard(recordio_writers)
            outf.write(ex.SerializeToString())
      finally:
        _CloseSubShards(recordio_writers)

  # len(trans) is the transcript count minus utterances that failed in this
  # run; it is not the number of examples written by this shard.
  tf.logging.info(f'Processed {len(trans)} / {total_utts}')
def testExtractLogMelFeatures(self):
  """Checks the output shape of ExtractLogMelFeatures on a test WAV file."""
  wav_path = test_helper.test_src_dir_path(
      'tools/testdata/gan_or_vae.16k.wav')
  with open(wav_path, 'rb') as wav_file:
    raw_wav = wav_file.read()

  features_op = audio_lib.ExtractLogMelFeatures(
      tf.constant(raw_wav, dtype=tf.string))

  with self.session() as sess:
    features = sess.run(features_op)

  # Expect 314, 80 dimensional channels.
  expected_shape = [1, 314, 80, 1]
  self.assertAllEqual(features.shape, expected_shape)
def testExtractLogMelFeatures(self):
  """Checks the output shape of ExtractLogMelFeatures on a test WAV file."""
  # WAV is binary data: it must be read in 'rb' mode. Text mode ('r') decodes
  # the bytes as text on Python 3, which raises UnicodeDecodeError or corrupts
  # the payload before it ever reaches the frontend.
  with open(
      test_helper.test_src_dir_path('tools/testdata/gan_or_vae.16k.wav'),
      'rb') as f:
    wav = f.read()

  wav_bytes_t = tf.constant(wav, dtype=tf.string)
  log_mel_t = audio_lib.ExtractLogMelFeatures(wav_bytes_t)

  with self.session() as sess:
    log_mel = sess.run(log_mel_t)
    # We expect 105 frames, each of which consists of three 80 dimensional
    # stacked frames.
    self.assertAllEqual(log_mel.shape, [1, 105, 80 * 3, 1])
def _CreateAsrFeatures():
  """Featurizes this shard's .flac files from the input tarball.

  Transcripts are loaded with `_LoadTranscriptionsFromFile` when
  `FLAGS.transcripts_filepath` exists, otherwise computed with
  `_ReadTranscriptions`. The gzip tarball at `FLAGS.input_tarball` is then
  scanned; every member whose name ends in `.flac` is counted, and members
  whose running count `n` satisfies `n % FLAGS.num_shards == FLAGS.shard_id`
  are decoded to WAV, passed through `audio_lib.ExtractLogMelFeatures`, turned
  into an example by `_MakeTfExample`, and written (serialized) to a writer
  picked by `_SelectRandomShard`.

  Raises:
    AssertionError: if a featurized utterance id has no transcript entry.
  """
  # First pass: extract transcription files.
  if os.path.exists(FLAGS.transcripts_filepath):
    trans = _LoadTranscriptionsFromFile()
  else:
    tf.logging.info('Running first pass on the fly')
    trans = _ReadTranscriptions()
  tf.logging.info('Total transcripts: %d', len(trans))

  # Log-mel features are computed by audio_lib directly on the fed WAV bytes.
  tf_bytes = tf.placeholder(dtype=tf.string)
  log_mel = audio_lib.ExtractLogMelFeatures(tf_bytes)

  tfconf = tf.config_pb2.ConfigProto()
  tfconf.gpu_options.allow_growth = True

  # Second pass: transcode the flac. The tarball, its underlying file and the
  # output writers are released even if an utterance raises midway (decode
  # error, session failure, or the transcript assertion below).
  n = 0
  with tf.io.gfile.GFile(FLAGS.input_tarball, mode='rb') as file_obj:
    with tarfile.open(fileobj=file_obj, mode='r:gz') as tar:
      recordio_writers = _OpenSubShards()
      try:
        with tf.Session(config=tfconf) as sess:
          for tarinfo in tar:
            if not tarinfo.name.endswith('.flac'):
              continue
            n += 1
            if n % FLAGS.num_shards != FLAGS.shard_id:
              continue

            # Utterance id is the member's base name without ".flac". If the
            # name has no '/', the pattern does not match and the name is
            # used unchanged.
            uttid = re.sub(r'.*/(.+)\.flac', r'\1', tarinfo.name)
            with tar.extractfile(tarinfo) as f:
              wav_bytes = audio_lib.DecodeFlacToWav(f.read())
            frames = sess.run(log_mel, feed_dict={tf_bytes: wav_bytes})

            assert uttid in trans, uttid
            num_words = len(trans[uttid])
            tf.logging.info('utt[%d]: %s [%d frames, %d words]', n, uttid,
                            frames.shape[1], num_words)
            ex = _MakeTfExample(uttid, frames, trans[uttid])
            outf = _SelectRandomShard(recordio_writers)
            outf.write(ex.SerializeToString())
      finally:
        _CloseSubShards(recordio_writers)