def testCreateModelConvInference(self):
  """Builds the "conv" model in inference mode and checks its logits.

  The model is created on an all-zero fingerprint; the returned logits must
  be non-None and retrievable by name from the session graph.
  """
  settings = models.prepare_model_settings(10, 16000, 1000, 20, 10, 40)
  with self.test_session() as session:
    zero_fingerprint = tf.zeros([1, settings["fingerprint_size"]])
    conv_logits = models.create_model(zero_fingerprint, settings, "conv",
                                      False)
    self.assertIsNotNone(conv_logits)
    logits_in_graph = session.graph.get_tensor_by_name(conv_logits.name)
    self.assertIsNotNone(logits_in_graph)
def _modelSettings(self):
  """Returns the model settings used by the tests, with "mfcc" preprocessing."""
  settings_kwargs = {
      "label_count": 10,
      "sample_rate": 16000,
      "clip_duration_ms": 1000,
      "window_size_ms": 20,
      "window_stride_ms": 10,
      "feature_bin_count": 40,
      "preprocess": "mfcc",
  }
  return models.prepare_model_settings(**settings_kwargs)
def testCreateModelBadArchitecture(self):
  """Checks that an unknown architecture name is rejected.

  Asks `models.create_model` for the made-up "bad_architecture" model and
  expects an exception whose message contains "not recognized".
  """
  model_settings = models.prepare_model_settings(10, 16000, 1000, 20, 10, 40)
  with self.test_session():
    fingerprint_input = tf.zeros([1, model_settings["fingerprint_size"]])
    with self.assertRaises(Exception) as e:
      models.create_model(fingerprint_input, model_settings,
                          "bad_architecture", True)
    # Checked after the assertRaises block has exited: a statement placed
    # after the raising call inside that block would never run. assertIn
    # reports both operands on failure, unlike assertTrue(... in ...).
    self.assertIn("not recognized", str(e.exception))
def testPrepareModelSettings(self):
  """Checks that preparing "mfcc" model settings yields a non-None result."""
  prepared_settings = models.prepare_model_settings(
      label_count=10,
      sample_rate=16000,
      clip_duration_ms=1000,
      window_size_ms=20,
      window_stride_ms=10,
      feature_bin_count=40,
      preprocess="mfcc")
  self.assertIsNotNone(prepared_settings)
def _modelSettings(self):
  """Builds the common test model settings (10 labels, "mfcc" features)."""
  common_settings = dict(
      label_count=10,
      sample_rate=16000,
      clip_duration_ms=1000,
      window_size_ms=20,
      window_stride_ms=10,
      feature_bin_count=40,
      preprocess="mfcc",
  )
  return models.prepare_model_settings(**common_settings)
def testCreateModelBadArchitecture(self):
  """Verifies that `models.create_model` refuses an unknown architecture.

  The call is made with the architecture name "bad_architecture"; it must
  raise, and the exception text must mention "not recognized".
  """
  model_settings = models.prepare_model_settings(10, 16000, 1000, 20, 10, 40)
  with self.test_session():
    fingerprint_input = tf.zeros([1, model_settings["fingerprint_size"]])
    with self.assertRaises(Exception) as e:
      models.create_model(fingerprint_input, model_settings,
                          "bad_architecture", True)
    # The message check sits outside the assertRaises block so that it is
    # actually executed; assertIn prints the searched text and the message
    # when it fails, which assertTrue(... in ...) does not.
    self.assertIn("not recognized", str(e.exception))
def testPrepareModelSettings(self):
  """Ensures `prepare_model_settings` returns something for "mfcc" input."""
  keyword_settings = {
      "label_count": 10,
      "sample_rate": 16000,
      "clip_duration_ms": 1000,
      "window_size_ms": 20,
      "window_stride_ms": 10,
      "feature_bin_count": 40,
      "preprocess": "mfcc",
  }
  result = models.prepare_model_settings(**keyword_settings)
  self.assertIsNotNone(result)
def testCreateModelConvInference(self):
  """Checks that a "conv" inference model can be created.

  Feeds a zero tensor of the configured fingerprint size into
  `models.create_model` and asserts that the logits it returns are present
  in the session graph.
  """
  conv_settings = models.prepare_model_settings(10, 16000, 1000, 20, 10, 40)
  with self.test_session() as active_session:
    blank_input = tf.zeros([1, conv_settings["fingerprint_size"]])
    output_logits = models.create_model(blank_input, conv_settings, "conv",
                                        False)
    self.assertIsNotNone(output_logits)
    looked_up = active_session.graph.get_tensor_by_name(output_logits.name)
    self.assertIsNotNone(looked_up)
def testCreateModelFullyConnectedTraining(self):
  """Builds the "single_fc" model for training and checks both outputs.

  In training mode `models.create_model` returns a pair; both members must be
  non-None and must be retrievable by name from the session graph.
  """
  settings = models.prepare_model_settings(10, 16000, 1000, 20, 10, 40)
  with self.test_session() as session:
    zero_fingerprint = tf.zeros([1, settings["fingerprint_size"]])
    fc_logits, dropout_tensor = models.create_model(
        zero_fingerprint, settings, "single_fc", True)
    training_outputs = (fc_logits, dropout_tensor)
    for output_tensor in training_outputs:
      self.assertIsNotNone(output_tensor)
    for output_tensor in training_outputs:
      self.assertIsNotNone(
          session.graph.get_tensor_by_name(output_tensor.name))
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
  """Builds the inference version of an audio model in the default graph.

  Adds a string placeholder named 'wav_data', decodes it, computes a
  spectrogram and MFCC features, runs the requested model on them, and
  finishes with a softmax op named 'labels_softmax'.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
  """
  labels = input_data.prepare_words_list(wanted_words.split(','))
  settings = models.prepare_model_settings(
      len(labels), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, dct_coefficient_count)

  # Input node: the raw bytes of a wav file.
  wav_input = tf.placeholder(tf.string, [], name='wav_data')
  decoded = contrib_audio.decode_wav(
      wav_input,
      desired_channels=1,
      desired_samples=settings['desired_samples'],
      name='decoded_sample_data')
  spectrogram = contrib_audio.audio_spectrogram(
      decoded.audio,
      window_size=settings['window_size_samples'],
      stride=settings['window_stride_samples'],
      magnitude_squared=True)
  mfcc_features = contrib_audio.mfcc(
      spectrogram,
      decoded.sample_rate,
      dct_coefficient_count=dct_coefficient_count)

  # Flatten each clip's (time, frequency) features into a single row.
  frequency_bins = settings['dct_coefficient_count']
  time_slices = settings['spectrogram_length']
  flat_features = tf.reshape(mfcc_features,
                             [-1, time_slices * frequency_bins])

  logits = models.create_model(
      flat_features,
      settings,
      model_architecture,
      is_training=False,
      runtime_settings={'clip_stride_ms': clip_stride_ms})

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
def testCreateModelFullyConnectedTraining(self):
  """Checks that a "single_fc" training model exposes logits and dropout.

  Creates the model in training mode on a zero fingerprint, then asserts that
  both returned tensors exist and can be looked up in the session graph.
  """
  fc_settings = models.prepare_model_settings(10, 16000, 1000, 20, 10, 40)
  with self.test_session() as active_session:
    blank_input = tf.zeros([1, fc_settings["fingerprint_size"]])
    created = models.create_model(blank_input, fc_settings, "single_fc", True)
    output_logits, dropout_output = created
    self.assertIsNotNone(output_logits)
    self.assertIsNotNone(dropout_output)
    graph = active_session.graph
    self.assertIsNotNone(graph.get_tensor_by_name(output_logits.name))
    self.assertIsNotNone(graph.get_tensor_by_name(dropout_output.name))
def _runGetDataTest(self, preprocess, window_length_ms):
  """Runs `AudioProcessor.get_data` on generated wavs and checks batch size.

  Creates a "wavs" directory under the test temp dir, fills it through
  `_saveWavFolders` for the folders "a", "b" and "c", adds ten files to a
  "_background_noise_" sub-directory, and then requests ten training samples.

  Args:
    preprocess: Preprocessing mode forwarded to `prepare_model_settings`.
    window_length_ms: Window size in milliseconds forwarded to
      `prepare_model_settings`.
  """
  temp_root = self.get_temp_dir()
  wav_root = os.path.join(temp_root, "wavs")
  os.mkdir(wav_root)
  self._saveWavFolders(wav_root, ["a", "b", "c"], 100)

  noise_dir = os.path.join(wav_root, "_background_noise_")
  os.mkdir(noise_dir)
  noise_wav = self._getWavData()
  for noise_index in range(10):
    noise_name = "background_audio_%d.wav" % noise_index
    self._saveTestWavFile(os.path.join(noise_dir, noise_name), noise_wav)

  settings = models.prepare_model_settings(4, 16000, 1000, window_length_ms,
                                           20, 40, preprocess)
  with self.cached_session() as session:
    processor = input_data.AudioProcessor("", wav_root, 10, 10, ["a", "b"],
                                          10, 10, settings, temp_root)
    batch = processor.get_data(10, 0, settings, 0.3, 0.1, 100, "training",
                               session)
    batch_data, batch_labels = batch
    self.assertEqual(10, len(batch_data))
    self.assertEqual(10, len(batch_labels))
def testPrepareModelSettings(self):
  """Checks that `prepare_model_settings` returns a non-None value."""
  prepared_settings = models.prepare_model_settings(10, 16000, 1000, 20, 10,
                                                    40)
  self.assertIsNotNone(prepared_settings)
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
  """Builds the inference version of an audio model in the default graph.

  Adds a string placeholder named 'wav_data', decodes it, computes a
  spectrogram, turns that into features according to `preprocess`, runs the
  requested model, and finishes with a softmax op named 'labels_softmax'.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    feature_bin_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    preprocess: How the spectrogram is processed to produce features, for
      example 'mfcc', 'average', or 'micro'.

  Raises:
    Exception: If the preprocessing mode isn't recognized, or if 'micro' is
      requested while the micro frontend op is unavailable.
  """
  labels = input_data.prepare_words_list(wanted_words.split(','))
  settings = models.prepare_model_settings(
      len(labels), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, feature_bin_count, preprocess)

  # Input node: the raw bytes of a wav file.
  wav_input = tf.compat.v1.placeholder(tf.string, [], name='wav_data')
  decoded = tf.audio.decode_wav(
      wav_input,
      desired_channels=1,
      desired_samples=settings['desired_samples'],
      name='decoded_sample_data')
  spectrogram = audio_ops.audio_spectrogram(
      decoded.audio,
      window_size=settings['window_size_samples'],
      stride=settings['window_stride_samples'],
      magnitude_squared=True)

  if preprocess == 'average':
    # Average-pool the spectrogram in windows of 'average_window_width'.
    expanded_spectrogram = tf.expand_dims(spectrogram, -1)
    features = tf.nn.pool(
        input=expanded_spectrogram,
        window_shape=[1, settings['average_window_width']],
        strides=[1, settings['average_window_width']],
        pooling_type='AVG',
        padding='SAME')
  elif preprocess == 'mfcc':
    features = audio_ops.mfcc(
        spectrogram,
        sample_rate,
        dct_coefficient_count=settings['fingerprint_width'])
  elif preprocess == 'micro':
    if not frontend_op:
      raise Exception(
          'Micro frontend op is currently not available when running TensorFlow'
          ' directly from Python, you need to build and run through Bazel, for'
          ' example'
          ' `bazel run tensorflow/examples/speech_commands:freeze_graph`')
    # The frontend takes its window sizes in milliseconds, so convert back
    # from the sample counts stored in the settings.
    micro_sample_rate = settings['sample_rate']
    micro_window_ms = (settings['window_size_samples'] *
                       1000) / micro_sample_rate
    micro_step_ms = (settings['window_stride_samples'] *
                     1000) / micro_sample_rate
    scaled_audio = tf.multiply(decoded.audio, 32767)
    int16_audio = tf.cast(scaled_audio, tf.int16)
    frontend_output = frontend_op.audio_microfrontend(
        int16_audio,
        sample_rate=micro_sample_rate,
        window_size=micro_window_ms,
        window_step=micro_step_ms,
        num_channels=settings['fingerprint_width'],
        out_scale=1,
        out_type=tf.float32)
    features = tf.multiply(frontend_output, (10.0 / 256.0))
  else:
    raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
                    ' "average", or "micro")' % (preprocess))

  # Flatten each clip's features into a single row for the model.
  flat_features = tf.reshape(features, [-1, settings['fingerprint_size']])

  logits = models.create_model(
      flat_features,
      settings,
      model_architecture,
      is_training=False,
      runtime_settings={'clip_stride_ms': clip_stride_ms})

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
def main(_):
  """Generates a long test wav of words mixed over background noise.

  Background clips are mixed into an empty track at a stride of one clip
  duration, then randomly chosen test words are mixed in at a stride of
  clip duration plus word gap. The audio is saved to
  FLAGS.output_audio_file and one "label, time" line per word is written to
  FLAGS.output_labels_file.

  Args:
    _: Unused positional argument.

  Raises:
    ValueError: If the testing set holds no clip for a chosen label.
  """
  words_list = input_data.prepare_words_list(FLAGS.wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), FLAGS.sample_rate, FLAGS.clip_duration_ms,
      FLAGS.window_size_ms, FLAGS.window_stride_ms,
      FLAGS.dct_coefficient_count)
  audio_processor = input_data.AudioProcessor(
      '', FLAGS.data_dir, FLAGS.silence_percentage, 10,
      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings)

  output_audio_sample_count = FLAGS.sample_rate * FLAGS.test_duration_seconds
  output_audio = np.zeros((output_audio_sample_count,), dtype=np.float32)

  # Set up background audio. Each segment is longer than its stride by the
  # crossover time, so neighbouring segments overlap while ramping in/out.
  background_crossover_ms = 500
  background_segment_duration_ms = (
      FLAGS.clip_duration_ms + background_crossover_ms)
  background_segment_duration_samples = int(
      (background_segment_duration_ms * FLAGS.sample_rate) / 1000)
  background_segment_stride_samples = int(
      (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
  background_ramp_samples = int(
      ((background_crossover_ms / 2) * FLAGS.sample_rate) / 1000)

  # Mix the background audio into the main track.
  how_many_backgrounds = int(
      math.ceil(output_audio_sample_count / background_segment_stride_samples))
  for i in range(how_many_backgrounds):
    output_offset = int(i * background_segment_stride_samples)
    background_index = np.random.randint(len(audio_processor.background_data))
    background_samples = audio_processor.background_data[background_index]
    background_offset = np.random.randint(
        0, len(background_samples) - model_settings['desired_samples'])
    background_volume = np.random.uniform(0, FLAGS.background_volume)
    mix_in_audio_sample(output_audio, output_offset, background_samples,
                        background_offset, background_segment_duration_samples,
                        background_volume, background_ramp_samples,
                        background_ramp_samples)

  # Mix the words into the main track, noting their labels and positions.
  output_labels = []
  word_stride_ms = FLAGS.clip_duration_ms + FLAGS.word_gap_ms
  word_stride_samples = int((word_stride_ms * FLAGS.sample_rate) / 1000)
  clip_duration_samples = int(
      (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
  word_gap_samples = int((FLAGS.word_gap_ms * FLAGS.sample_rate) / 1000)
  how_many_words = int(
      math.floor(output_audio_sample_count / word_stride_samples))
  all_test_data, all_test_labels = audio_processor.get_unprocessed_data(
      -1, model_settings, 'testing')
  for i in range(how_many_words):
    # np.random.randint(0) raises ValueError, so the word position is only
    # jittered when there is a non-empty gap to place it in.
    if word_gap_samples > 0:
      gap_offset = np.random.randint(word_gap_samples)
    else:
      gap_offset = 0
    output_offset = int(i * word_stride_samples) + gap_offset
    output_offset_ms = (output_offset * 1000) / FLAGS.sample_rate
    is_unknown = np.random.randint(100) < FLAGS.unknown_percentage
    if is_unknown:
      wanted_label = input_data.UNKNOWN_WORD_LABEL
    else:
      # Indices 0 and 1 of words_list are skipped when picking a word.
      wanted_label = words_list[2 + np.random.randint(len(words_list) - 2)]
    # Walk the test set in a random order until a clip with the wanted label
    # turns up.
    test_data_start = np.random.randint(len(all_test_data))
    found_sample_data = None
    index_lookup = np.arange(len(all_test_data), dtype=np.int32)
    np.random.shuffle(index_lookup)
    for test_data_offset in range(len(all_test_data)):
      test_data_index = index_lookup[(
          test_data_start + test_data_offset) % len(all_test_data)]
      current_label = all_test_labels[test_data_index]
      if current_label == wanted_label:
        found_sample_data = all_test_data[test_data_index]
        break
    if found_sample_data is None:
      # Fail with a clear message instead of handing None to the mixer.
      raise ValueError(
          'No clip labelled "%s" was found in the testing set' % wanted_label)
    mix_in_audio_sample(output_audio, output_offset, found_sample_data, 0,
                        clip_duration_samples, 1.0, 500, 500)
    output_labels.append({'label': wanted_label, 'time': output_offset_ms})

  input_data.save_wav_file(FLAGS.output_audio_file, output_audio,
                           FLAGS.sample_rate)
  tf.logging.info('Saved streaming test wav to %s', FLAGS.output_audio_file)

  with open(FLAGS.output_labels_file, 'w') as f:
    for output_label in output_labels:
      f.write('%s, %f\n' % (output_label['label'], output_label['time']))
  tf.logging.info('Saved streaming test labels to %s',
                  FLAGS.output_labels_file)
def wav_to_features(sample_rate, clip_duration_ms, window_size_ms,
                    window_stride_ms, feature_bin_count, quantize, preprocess,
                    input_wav, output_c_file):
  """Converts an audio file into its corresponding feature map.

  The features are written to `output_c_file` as C source: a header comment
  listing the arguments used, `g_<name>_width` and `g_<name>_height`
  constants, and a `g_<name>_data` array, where `<name>` is the lower-cased
  base name of `input_wav` without its extension.

  Args:
    sample_rate: Expected sample rate of the wavs.
    clip_duration_ms: Expected duration in milliseconds of the wavs.
    window_size_ms: How long each spectrogram timeslice is.
    window_stride_ms: How far to move in time between spectrogram timeslices.
    feature_bin_count: How many bins to use for the feature fingerprint.
    quantize: If true, the data array is written as `unsigned char` values
      scaled into 0..255 using `input_data.get_features_range`; otherwise it
      is written as `float` values.
    preprocess: Spectrogram processing mode; "mfcc", "average" or "micro".
    input_wav: Path to the audio WAV file to read.
    output_c_file: Where to save the generated C source file.
  """
  # Start a new TensorFlow session.
  sess = tf.compat.v1.InteractiveSession()

  model_settings = models.prepare_model_settings(
      0, sample_rate, clip_duration_ms, window_size_ms, window_stride_ms,
      feature_bin_count, preprocess)
  audio_processor = input_data.AudioProcessor(None, None, 0, 0, '', 0, 0,
                                              model_settings, None)

  results = audio_processor.get_features_for_wav(input_wav, model_settings,
                                                 sess)
  features = results[0]

  variable_base = os.path.splitext(os.path.basename(input_wav).lower())[0]

  # Save a C source file containing the feature data as an array.
  with gfile.GFile(output_c_file, 'w') as f:
    f.write('/* File automatically created by\n')
    f.write(' * tensorflow/examples/speech_commands/wav_to_features.py \\\n')
    f.write(' * --sample_rate=%d \\\n' % sample_rate)
    f.write(' * --clip_duration_ms=%d \\\n' % clip_duration_ms)
    f.write(' * --window_size_ms=%d \\\n' % window_size_ms)
    f.write(' * --window_stride_ms=%d \\\n' % window_stride_ms)
    f.write(' * --feature_bin_count=%d \\\n' % feature_bin_count)
    if quantize:
      f.write(' * --quantize=1 \\\n')
    f.write(' * --preprocess="%s" \\\n' % preprocess)
    f.write(' * --input_wav="%s" \\\n' % input_wav)
    f.write(' * --output_c_file="%s" \\\n' % output_c_file)
    f.write(' */\n\n')
    f.write('const int g_%s_width = %d;\n' %
            (variable_base, model_settings['fingerprint_width']))
    f.write('const int g_%s_height = %d;\n' %
            (variable_base, model_settings['spectrogram_length']))
    if quantize:
      features_min, features_max = input_data.get_features_range(model_settings)
      f.write('const unsigned char g_%s_data[] = {' % variable_base)
      i = 0
      for value in features.flatten():
        quantized_value = int(
            round(
                (255 * (value - features_min)) / (features_max - features_min)))
        # Clamp to the range of an unsigned char.
        if quantized_value < 0:
          quantized_value = 0
        if quantized_value > 255:
          quantized_value = 255
        # Start a new output line every ten values.
        if i == 0:
          f.write('\n ')
        f.write('%d, ' % (quantized_value))
        i = (i + 1) % 10
    else:
      f.write('const float g_%s_data[] = {\n' % variable_base)
      i = 0
      for value in features.flatten():
        # Start a new output line every ten values.
        if i == 0:
          f.write('\n ')
        # The separator follows each value, as in the quantized branch. A
        # comma before the first value would make the initializer invalid C;
        # a trailing comma is allowed.
        f.write('%f, ' % value)
        i = (i + 1) % 10
    f.write('\n};\n')
def main(_):
  """Trains a speech model, validating periodically and testing at the end.

  Builds the model named by FLAGS.model_architecture together with its loss,
  optimizer and evaluation ops, then runs the training loop. Along the way it
  writes summaries under FLAGS.summaries_dir, and saves the graph definition,
  the label list and periodic checkpoints under FLAGS.train_dir. After the
  loop it logs the confusion matrix and accuracy for the testing set.

  Args:
    _: Unused positional argument.

  Raises:
    Exception: If --how_many_training_steps and --learning_rate have different
      lengths, or if FLAGS.optimizer is not 'gradient_descent' or 'momentum'.
    AttributeError: If --quantize is set but `tf.contrib.quantize` is not
      available; the message is extended with installation advice.
  """
  # Set the verbosity based on flags (default is INFO, so we see all messages)
  tf.compat.v1.logging.set_verbosity(FLAGS.verbosity)

  # Start a new TensorFlow session.
  sess = tf.compat.v1.InteractiveSession()

  # Begin by making sure we have the training data we need. If you already have
  # training data of your own, use `--data_url= ` on the command line to avoid
  # downloading.
  model_settings = models.prepare_model_settings(
      len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
      FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
      FLAGS.window_stride_ms, FLAGS.feature_bin_count, FLAGS.preprocess)
  audio_processor = input_data.AudioProcessor(
      FLAGS.data_url, FLAGS.data_dir,
      FLAGS.silence_percentage, FLAGS.unknown_percentage,
      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings, FLAGS.summaries_dir)
  fingerprint_size = model_settings['fingerprint_size']
  label_count = model_settings['label_count']
  # Convert the time-shift flag from milliseconds to a number of samples.
  time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)
  # Figure out the learning rates for each training phase. Since it's often
  # effective to have high learning rates at the start of training, followed by
  # lower levels towards the end, the number of steps and learning rates can be
  # specified as comma-separated lists to define the rate at each stage. For
  # example --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001
  # will run 13,000 training loops in total, with a rate of 0.001 for the first
  # 10,000, and 0.0001 for the final 3,000.
  training_steps_list = list(
      map(int, FLAGS.how_many_training_steps.split(',')))
  learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
  if len(training_steps_list) != len(learning_rates_list):
    raise Exception(
        '--how_many_training_steps and --learning_rate must be equal length '
        'lists, but are %d and %d long instead' % (len(training_steps_list),
                                                   len(learning_rates_list)))

  input_placeholder = tf.compat.v1.placeholder(tf.float32,
                                               [None, fingerprint_size],
                                               name='fingerprint_input')
  # With --quantize the input goes through a fake-quantization op whose range
  # comes from input_data.get_features_range.
  if FLAGS.quantize:
    fingerprint_min, fingerprint_max = input_data.get_features_range(
        model_settings)
    fingerprint_input = tf.quantization.fake_quant_with_min_max_args(
        input_placeholder, fingerprint_min, fingerprint_max)
  else:
    fingerprint_input = input_placeholder

  logits, dropout_prob = models.create_model(fingerprint_input,
                                             model_settings,
                                             FLAGS.model_architecture,
                                             is_training=True)

  # Define loss and optimizer
  ground_truth_input = tf.compat.v1.placeholder(tf.int64, [None],
                                                name='groundtruth_input')

  # Optionally we can add runtime checks to spot when NaNs or other symptoms of
  # numerical errors start occurring during training.
  control_dependencies = []
  if FLAGS.check_nans:
    checks = tf.compat.v1.add_check_numerics_ops()
    control_dependencies = [checks]

  # Create the back propagation and training evaluation machinery in the graph.
  with tf.compat.v1.name_scope('cross_entropy'):
    cross_entropy_mean = tf.compat.v1.losses.sparse_softmax_cross_entropy(
        labels=ground_truth_input, logits=logits)

  if FLAGS.quantize:
    try:
      tf.contrib.quantize.create_training_graph(quant_delay=0)
    except AttributeError as e:
      # Extend the original error text with advice, then re-raise the same
      # exception object.
      msg = e.args[0]
      msg += (
          '\n\n The --quantize option still requires contrib, which is not '
          'part of TensorFlow 2.0. Please install a previous version:'
          '\n `pip install tensorflow<=1.15`')
      e.args = (msg, )
      raise e
  with tf.compat.v1.name_scope('train'), tf.control_dependencies(
      control_dependencies):
    # The learning rate is fed at every step so it can change between phases.
    learning_rate_input = tf.compat.v1.placeholder(
        tf.float32, [], name='learning_rate_input')
    if FLAGS.optimizer == 'gradient_descent':
      train_step = tf.compat.v1.train.GradientDescentOptimizer(
          learning_rate_input).minimize(cross_entropy_mean)
    elif FLAGS.optimizer == 'momentum':
      train_step = tf.compat.v1.train.MomentumOptimizer(
          learning_rate_input, .9,
          use_nesterov=True).minimize(cross_entropy_mean)
    else:
      raise Exception('Invalid Optimizer')
  # Accuracy and confusion-matrix ops, shared by training, validation and
  # testing.
  predicted_indices = tf.argmax(input=logits, axis=1)
  correct_prediction = tf.equal(predicted_indices, ground_truth_input)
  confusion_matrix = tf.math.confusion_matrix(labels=ground_truth_input,
                                              predictions=predicted_indices,
                                              num_classes=label_count)
  evaluation_step = tf.reduce_mean(
      input_tensor=tf.cast(correct_prediction, tf.float32))
  with tf.compat.v1.get_default_graph().name_scope('eval'):
    tf.compat.v1.summary.scalar('cross_entropy', cross_entropy_mean)
    tf.compat.v1.summary.scalar('accuracy', evaluation_step)

  global_step = tf.compat.v1.train.get_or_create_global_step()
  increment_global_step = tf.compat.v1.assign(global_step, global_step + 1)

  saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables())

  # Merge the summaries in the 'eval' scope and write them out to the 'train'
  # and 'validation' sub-directories of FLAGS.summaries_dir.
  merged_summaries = tf.compat.v1.summary.merge_all(scope='eval')
  train_writer = tf.compat.v1.summary.FileWriter(
      FLAGS.summaries_dir + '/train', sess.graph)
  validation_writer = tf.compat.v1.summary.FileWriter(FLAGS.summaries_dir +
                                                      '/validation')

  tf.compat.v1.global_variables_initializer().run()

  start_step = 1

  # When resuming, training continues from the global step stored in the
  # checkpoint.
  if FLAGS.start_checkpoint:
    models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
    start_step = global_step.eval(session=sess)

  tf.compat.v1.logging.info('Training from step: %d ', start_step)

  # Save graph.pbtxt.
  tf.io.write_graph(sess.graph_def, FLAGS.train_dir,
                    FLAGS.model_architecture + '.pbtxt')

  # Save list of words.
  with gfile.GFile(
      os.path.join(FLAGS.train_dir, FLAGS.model_architecture + '_labels.txt'),
      'w') as f:
    f.write('\n'.join(audio_processor.words_list))

  # Training loop.
  training_steps_max = np.sum(training_steps_list)
  for training_step in xrange(start_step, training_steps_max + 1):
    # Figure out what the current learning rate is: the rate of the first
    # phase whose cumulative step count reaches the current step.
    training_steps_sum = 0
    for i in range(len(training_steps_list)):
      training_steps_sum += training_steps_list[i]
      if training_step <= training_steps_sum:
        learning_rate_value = learning_rates_list[i]
        break
    # Pull the audio samples we'll use for training.
    train_fingerprints, train_ground_truth = audio_processor.get_data(
        FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency,
        FLAGS.background_volume, time_shift_samples, 'training', sess)
    # Run the graph with this batch of training data.
    # NOTE(review): whether the dropout_prob value is a keep probability or a
    # drop rate depends on models.create_model - confirm.
    train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
        [
            merged_summaries,
            evaluation_step,
            cross_entropy_mean,
            train_step,
            increment_global_step,
        ],
        feed_dict={
            fingerprint_input: train_fingerprints,
            ground_truth_input: train_ground_truth,
            learning_rate_input: learning_rate_value,
            dropout_prob: 0.5
        })
    train_writer.add_summary(train_summary, training_step)
    tf.compat.v1.logging.info(
        'Step #%d: rate %f, accuracy %.1f%%, cross entropy %f' %
        (training_step, learning_rate_value, train_accuracy * 100,
         cross_entropy_value))
    is_last_step = (training_step == training_steps_max)
    # Validate every eval_step_interval steps and on the final step.
    if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step:
      set_size = audio_processor.set_size('validation')
      total_accuracy = 0
      total_conf_matrix = None
      for i in xrange(0, set_size, FLAGS.batch_size):
        validation_fingerprints, validation_ground_truth = (
            audio_processor.get_data(FLAGS.batch_size, i, model_settings, 0.0,
                                     0.0, 0, 'validation', sess))
        # Run a validation step and capture training summaries for TensorBoard
        # with the `merged_summaries` op.
        validation_summary, validation_accuracy, conf_matrix = sess.run(
            [merged_summaries, evaluation_step, confusion_matrix],
            feed_dict={
                fingerprint_input: validation_fingerprints,
                ground_truth_input: validation_ground_truth,
                dropout_prob: 1.0
            })
        validation_writer.add_summary(validation_summary, training_step)
        # The last batch may be smaller, so weight each batch's accuracy by
        # its share of the whole set.
        batch_size = min(FLAGS.batch_size, set_size - i)
        total_accuracy += (validation_accuracy * batch_size) / set_size
        if total_conf_matrix is None:
          total_conf_matrix = conf_matrix
        else:
          total_conf_matrix += conf_matrix
      tf.compat.v1.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
      tf.compat.v1.logging.info(
          'Step %d: Validation accuracy = %.1f%% (N=%d)' %
          (training_step, total_accuracy * 100, set_size))

    # Save the model checkpoint periodically.
    if (training_step % FLAGS.save_step_interval == 0 or
        training_step == training_steps_max):
      checkpoint_path = os.path.join(FLAGS.train_dir,
                                     FLAGS.model_architecture + '.ckpt')
      tf.compat.v1.logging.info('Saving to "%s-%d"', checkpoint_path,
                                training_step)
      saver.save(sess, checkpoint_path, global_step=training_step)

  # Final evaluation on the testing set, accumulated batch by batch in the
  # same way as validation.
  set_size = audio_processor.set_size('testing')
  tf.compat.v1.logging.info('set_size=%d', set_size)
  total_accuracy = 0
  total_conf_matrix = None
  for i in xrange(0, set_size, FLAGS.batch_size):
    test_fingerprints, test_ground_truth = audio_processor.get_data(
        FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess)
    test_accuracy, conf_matrix = sess.run(
        [evaluation_step, confusion_matrix],
        feed_dict={
            fingerprint_input: test_fingerprints,
            ground_truth_input: test_ground_truth,
            dropout_prob: 1.0
        })
    batch_size = min(FLAGS.batch_size, set_size - i)
    total_accuracy += (test_accuracy * batch_size) / set_size
    if total_conf_matrix is None:
      total_conf_matrix = conf_matrix
    else:
      total_conf_matrix += conf_matrix
  tf.compat.v1.logging.warn('Confusion Matrix:\n %s' % (total_conf_matrix))
  tf.compat.v1.logging.warn('Final test accuracy = %.1f%% (N=%d)' %
                            (total_accuracy * 100, set_size))
def main(_):
  """Generates a long test wav of words mixed over background noise.

  Background clips are mixed into an empty track at a stride of one clip
  duration, then randomly chosen test words are mixed in at a stride of
  clip duration plus word gap. The audio is saved to
  FLAGS.output_audio_file and one "label, time" line per word is written to
  FLAGS.output_labels_file.

  Args:
    _: Unused positional argument.

  Raises:
    ValueError: If the testing set holds no clip for a chosen label.
  """
  words_list = input_data.prepare_words_list(FLAGS.wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), FLAGS.sample_rate, FLAGS.clip_duration_ms,
      FLAGS.window_size_ms, FLAGS.window_stride_ms,
      FLAGS.dct_coefficient_count)
  audio_processor = input_data.AudioProcessor(
      '', FLAGS.data_dir, FLAGS.silence_percentage, 10,
      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings)

  output_audio_sample_count = FLAGS.sample_rate * FLAGS.test_duration_seconds
  output_audio = np.zeros((output_audio_sample_count, ), dtype=np.float32)

  # Set up background audio. Each segment is longer than its stride by the
  # crossover time, so neighbouring segments overlap while ramping in/out.
  background_crossover_ms = 500
  background_segment_duration_ms = (FLAGS.clip_duration_ms +
                                    background_crossover_ms)
  background_segment_duration_samples = int(
      (background_segment_duration_ms * FLAGS.sample_rate) / 1000)
  background_segment_stride_samples = int(
      (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
  background_ramp_samples = int(
      ((background_crossover_ms / 2) * FLAGS.sample_rate) / 1000)

  # Mix the background audio into the main track.
  how_many_backgrounds = int(
      math.ceil(output_audio_sample_count / background_segment_stride_samples))
  for i in range(how_many_backgrounds):
    output_offset = int(i * background_segment_stride_samples)
    background_index = np.random.randint(
        len(audio_processor.background_data))
    background_samples = audio_processor.background_data[background_index]
    background_offset = np.random.randint(
        0,
        len(background_samples) - model_settings['desired_samples'])
    background_volume = np.random.uniform(0, FLAGS.background_volume)
    mix_in_audio_sample(output_audio, output_offset, background_samples,
                        background_offset,
                        background_segment_duration_samples,
                        background_volume, background_ramp_samples,
                        background_ramp_samples)

  # Mix the words into the main track, noting their labels and positions.
  output_labels = []
  word_stride_ms = FLAGS.clip_duration_ms + FLAGS.word_gap_ms
  word_stride_samples = int((word_stride_ms * FLAGS.sample_rate) / 1000)
  clip_duration_samples = int(
      (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
  word_gap_samples = int((FLAGS.word_gap_ms * FLAGS.sample_rate) / 1000)
  how_many_words = int(
      math.floor(output_audio_sample_count / word_stride_samples))
  all_test_data, all_test_labels = audio_processor.get_unprocessed_data(
      -1, model_settings, 'testing')
  for i in range(how_many_words):
    # np.random.randint(0) raises ValueError, so the word position is only
    # jittered when there is a non-empty gap to place it in.
    if word_gap_samples > 0:
      gap_offset = np.random.randint(word_gap_samples)
    else:
      gap_offset = 0
    output_offset = int(i * word_stride_samples) + gap_offset
    output_offset_ms = (output_offset * 1000) / FLAGS.sample_rate
    is_unknown = np.random.randint(100) < FLAGS.unknown_percentage
    if is_unknown:
      wanted_label = input_data.UNKNOWN_WORD_LABEL
    else:
      # Indices 0 and 1 of words_list are skipped when picking a word.
      wanted_label = words_list[2 + np.random.randint(len(words_list) - 2)]
    # Walk the test set in a random order until a clip with the wanted label
    # turns up.
    test_data_start = np.random.randint(len(all_test_data))
    found_sample_data = None
    index_lookup = np.arange(len(all_test_data), dtype=np.int32)
    np.random.shuffle(index_lookup)
    for test_data_offset in range(len(all_test_data)):
      test_data_index = index_lookup[(test_data_start + test_data_offset) %
                                     len(all_test_data)]
      current_label = all_test_labels[test_data_index]
      if current_label == wanted_label:
        found_sample_data = all_test_data[test_data_index]
        break
    if found_sample_data is None:
      # Fail with a clear message instead of handing None to the mixer.
      raise ValueError(
          'No clip labelled "%s" was found in the testing set' % wanted_label)
    mix_in_audio_sample(output_audio, output_offset, found_sample_data, 0,
                        clip_duration_samples, 1.0, 500, 500)
    output_labels.append({'label': wanted_label, 'time': output_offset_ms})

  input_data.save_wav_file(FLAGS.output_audio_file, output_audio,
                           FLAGS.sample_rate)
  tf.logging.info('Saved streaming test wav to %s', FLAGS.output_audio_file)

  with open(FLAGS.output_labels_file, 'w') as f:
    for output_label in output_labels:
      f.write('%s, %f\n' % (output_label['label'], output_label['time']))
  tf.logging.info('Saved streaming test labels to %s',
                  FLAGS.output_labels_file)