示例#1
0
 def testCreateModelConvInference(self):
   """Builds the "conv" architecture for inference and checks its logits."""
   settings = models.prepare_model_settings(10, 16000, 1000, 20, 10, 40)
   with self.test_session() as sess:
     zero_fingerprint = tf.zeros([1, settings["fingerprint_size"]])
     # is_training is False here; the result is treated as a single tensor.
     logits = models.create_model(
         zero_fingerprint, settings, "conv", False)
     self.assertIsNotNone(logits)
     # The returned tensor must be retrievable by name from the session graph.
     graph_tensor = sess.graph.get_tensor_by_name(logits.name)
     self.assertIsNotNone(graph_tensor)
示例#2
0
 def _modelSettings(self):
     """Returns settings from models.prepare_model_settings for a fixed setup.

     The configuration is hard-coded: 10 labels, 16000 Hz sample rate,
     1000 ms clips, 20 ms windows with a 10 ms stride, 40 feature bins and
     "mfcc" preprocessing.
     """
     fixed_arguments = {
         "label_count": 10,
         "sample_rate": 16000,
         "clip_duration_ms": 1000,
         "window_size_ms": 20,
         "window_stride_ms": 10,
         "feature_bin_count": 40,
         "preprocess": "mfcc",
     }
     return models.prepare_model_settings(**fixed_arguments)
示例#3
0
 def testCreateModelBadArchitecture(self):
   """Checks that create_model rejects an unknown architecture name.

   The call is expected to raise, and the exception text must contain
   "not recognized".
   """
   model_settings = models.prepare_model_settings(10, 16000, 1000, 20, 10, 40)
   with self.test_session():
     fingerprint_input = tf.zeros([1, model_settings["fingerprint_size"]])
     with self.assertRaises(Exception) as e:
       models.create_model(fingerprint_input, model_settings,
                           "bad_architecture", True)
     # assertIn prints both strings when it fails, whereas assertTrue on an
     # `in` expression only reports "False is not true".
     self.assertIn("not recognized", str(e.exception))
示例#4
0
 def testPrepareModelSettings(self):
     """Checks that prepare_model_settings returns a value for "mfcc" input."""
     settings = models.prepare_model_settings(
         label_count=10,
         sample_rate=16000,
         clip_duration_ms=1000,
         window_size_ms=20,
         window_stride_ms=10,
         feature_bin_count=40,
         preprocess="mfcc",
     )
     self.assertIsNotNone(settings)
示例#5
0
 def _modelSettings(self):
   """Returns settings from models.prepare_model_settings for a fixed setup.

   The configuration is hard-coded: 10 labels, 16000 Hz sample rate, 1000 ms
   clips, 20 ms windows with a 10 ms stride, 40 feature bins and "mfcc"
   preprocessing.
   """
   fixed_arguments = dict(
       label_count=10,
       sample_rate=16000,
       clip_duration_ms=1000,
       window_size_ms=20,
       window_stride_ms=10,
       feature_bin_count=40,
       preprocess="mfcc",
   )
   return models.prepare_model_settings(**fixed_arguments)
示例#6
0
 def testCreateModelBadArchitecture(self):
     """Checks that create_model rejects an unknown architecture name.

     The call is expected to raise, and the exception text must contain
     "not recognized".
     """
     model_settings = models.prepare_model_settings(10, 16000, 1000, 20, 10,
                                                    40)
     with self.test_session():
         fingerprint_input = tf.zeros(
             [1, model_settings["fingerprint_size"]])
         with self.assertRaises(Exception) as e:
             models.create_model(fingerprint_input, model_settings,
                                 "bad_architecture", True)
         # assertIn prints both strings when it fails, whereas assertTrue on
         # an `in` expression only reports "False is not true".
         self.assertIn("not recognized", str(e.exception))
示例#7
0
 def testPrepareModelSettings(self):
   """Checks that prepare_model_settings returns a value for "mfcc" input."""
   settings = models.prepare_model_settings(
       label_count=10,
       sample_rate=16000,
       clip_duration_ms=1000,
       window_size_ms=20,
       window_stride_ms=10,
       feature_bin_count=40,
       preprocess="mfcc",
   )
   self.assertIsNotNone(settings)
示例#8
0
 def testCreateModelConvInference(self):
     """Builds the "conv" architecture for inference and checks its logits."""
     settings = models.prepare_model_settings(10, 16000, 1000, 20, 10, 40)
     with self.test_session() as sess:
         zero_fingerprint = tf.zeros([1, settings["fingerprint_size"]])
         # is_training is False here; the result is treated as one tensor.
         logits = models.create_model(
             zero_fingerprint, settings, "conv", False)
         self.assertIsNotNone(logits)
         # The returned tensor must be retrievable by name from the graph.
         graph_tensor = sess.graph.get_tensor_by_name(logits.name)
         self.assertIsNotNone(graph_tensor)
示例#9
0
 def testCreateModelFullyConnectedTraining(self):
   """Builds "single_fc" for training and checks both returned tensors."""
   settings = models.prepare_model_settings(10, 16000, 1000, 20, 10, 40)
   with self.test_session() as sess:
     zero_fingerprint = tf.zeros([1, settings["fingerprint_size"]])
     # With is_training=True the call is unpacked into two values.
     logits, dropout_prob = models.create_model(
         zero_fingerprint, settings, "single_fc", True)
     returned = (logits, dropout_prob)
     for tensor in returned:
       self.assertIsNotNone(tensor)
     # Each returned tensor must also be retrievable by name from the graph.
     for tensor in returned:
       self.assertIsNotNone(sess.graph.get_tensor_by_name(tensor.name))
示例#10
0
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
  """Adds an inference-ready audio model to the default graph.

  The graph takes encoded WAV bytes through a string placeholder named
  'wav_data', decodes them, computes a spectrogram and MFCC features, feeds
  the flattened features to the requested model, and exposes the result
  through a softmax node named 'labels_softmax'. Nothing is returned; the
  nodes are created as a side effect.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: Length of audio to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
  """
  labels = input_data.prepare_words_list(wanted_words.split(','))
  settings = models.prepare_model_settings(
      len(labels), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, dct_coefficient_count)

  # Input side: scalar string placeholder -> mono decoded samples.
  wav_input = tf.placeholder(tf.string, [], name='wav_data')
  decoded = contrib_audio.decode_wav(
      wav_input,
      desired_channels=1,
      desired_samples=settings['desired_samples'],
      name='decoded_sample_data')

  # Feature extraction: squared-magnitude spectrogram, then MFCCs.
  spectrogram = contrib_audio.audio_spectrogram(
      decoded.audio,
      window_size=settings['window_size_samples'],
      stride=settings['window_stride_samples'],
      magnitude_squared=True)
  mfcc_features = contrib_audio.mfcc(
      spectrogram,
      decoded.sample_rate,
      dct_coefficient_count=dct_coefficient_count)

  # Flatten each example to one row of time * frequency values.
  frequency_bins = settings['dct_coefficient_count']
  time_steps = settings['spectrogram_length']
  flat_features = tf.reshape(mfcc_features,
                             [-1, time_steps * frequency_bins])

  logits = models.create_model(
      flat_features,
      settings,
      model_architecture,
      is_training=False,
      runtime_settings={'clip_stride_ms': clip_stride_ms})

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
示例#11
0
 def testCreateModelFullyConnectedTraining(self):
     """Builds "single_fc" for training and checks both returned tensors."""
     settings = models.prepare_model_settings(10, 16000, 1000, 20, 10, 40)
     with self.test_session() as sess:
         zero_fingerprint = tf.zeros([1, settings["fingerprint_size"]])
         # With is_training=True the call is unpacked into two values.
         logits, dropout_prob = models.create_model(
             zero_fingerprint, settings, "single_fc", True)
         returned = (logits, dropout_prob)
         for tensor in returned:
             self.assertIsNotNone(tensor)
         # Each returned tensor must also be retrievable from the graph.
         for tensor in returned:
             self.assertIsNotNone(
                 sess.graph.get_tensor_by_name(tensor.name))
 def _runGetDataTest(self, preprocess, window_length_ms):
     """Runs AudioProcessor.get_data on generated wavs and checks batch size.

     Creates a "wavs" folder under the test temp dir, fills it through the
     sibling helpers, adds ten background-noise files, then requests ten
     training samples and asserts ten data rows and ten labels come back.

     Args:
       preprocess: Forwarded as the last argument of prepare_model_settings.
       window_length_ms: Forwarded as the window size argument.
     """
     temp_root = self.get_temp_dir()
     wav_root = os.path.join(temp_root, "wavs")
     os.mkdir(wav_root)
     self._saveWavFolders(wav_root, ["a", "b", "c"], 100)

     noise_dir = os.path.join(wav_root, "_background_noise_")
     os.mkdir(noise_dir)
     noise_wav = self._getWavData()
     for index in range(10):
         noise_name = "background_audio_%d.wav" % index
         self._saveTestWavFile(os.path.join(noise_dir, noise_name),
                               noise_wav)

     settings = models.prepare_model_settings(
         4, 16000, 1000, window_length_ms, 20, 40, preprocess)
     with self.cached_session() as sess:
         processor = input_data.AudioProcessor(
             "", wav_root, 10, 10, ["a", "b"], 10, 10, settings, temp_root)
         batch_data, batch_labels = processor.get_data(
             10, 0, settings, 0.3, 0.1, 100, "training", sess)
         for returned in (batch_data, batch_labels):
             self.assertEqual(10, len(returned))
 def _runGetDataTest(self, preprocess, window_length_ms):
   """Runs AudioProcessor.get_data on generated wavs and checks batch size.

   Creates a "wavs" folder under the test temp dir, fills it through the
   sibling helpers, adds ten background-noise files, then requests ten
   training samples and asserts ten data rows and ten labels come back.

   Args:
     preprocess: Forwarded as the last argument of prepare_model_settings.
     window_length_ms: Forwarded as the window size argument.
   """
   temp_root = self.get_temp_dir()
   wav_root = os.path.join(temp_root, "wavs")
   os.mkdir(wav_root)
   self._saveWavFolders(wav_root, ["a", "b", "c"], 100)

   noise_dir = os.path.join(wav_root, "_background_noise_")
   os.mkdir(noise_dir)
   noise_wav = self._getWavData()
   for index in range(10):
     noise_name = "background_audio_%d.wav" % index
     self._saveTestWavFile(os.path.join(noise_dir, noise_name), noise_wav)

   settings = models.prepare_model_settings(
       4, 16000, 1000, window_length_ms, 20, 40, preprocess)
   with self.cached_session() as sess:
     processor = input_data.AudioProcessor(
         "", wav_root, 10, 10, ["a", "b"], 10, 10, settings, temp_root)
     batch_data, batch_labels = processor.get_data(
         10, 0, settings, 0.3, 0.1, 100, "training", sess)
     for returned in (batch_data, batch_labels):
       self.assertEqual(10, len(returned))
示例#14
0
 def testPrepareModelSettings(self):
     """Checks that prepare_model_settings returns a value."""
     settings = models.prepare_model_settings(10, 16000, 1000, 20, 10, 40)
     self.assertIsNotNone(settings)
示例#15
0
 def testPrepareModelSettings(self):
   """Checks that prepare_model_settings returns a value."""
   settings = models.prepare_model_settings(10, 16000, 1000, 20, 10, 40)
   self.assertIsNotNone(settings)
示例#16
0
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
  """Adds an inference-ready audio model to the default graph.

  The graph takes encoded WAV bytes through a string placeholder named
  'wav_data', decodes them, computes a spectrogram, converts it to features
  according to `preprocess`, feeds the flattened features to the requested
  model, and exposes the result through a softmax node named
  'labels_softmax'. Nothing is returned; the nodes are created as a side
  effect.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: Length of audio to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    feature_bin_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    preprocess: How the spectrogram is processed to produce features, for
      example 'mfcc', 'average', or 'micro'.

  Raises:
    Exception: If the preprocessing mode isn't recognized, or if 'micro' is
      requested while the module-level `frontend_op` is falsy.
  """
  labels = input_data.prepare_words_list(wanted_words.split(','))
  settings = models.prepare_model_settings(
      len(labels), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, feature_bin_count, preprocess)

  # Input side: scalar string placeholder -> mono decoded samples.
  wav_input = tf.compat.v1.placeholder(tf.string, [], name='wav_data')
  decoded = tf.audio.decode_wav(
      wav_input,
      desired_channels=1,
      desired_samples=settings['desired_samples'],
      name='decoded_sample_data')
  spectrogram = audio_ops.audio_spectrogram(
      decoded.audio,
      window_size=settings['window_size_samples'],
      stride=settings['window_stride_samples'],
      magnitude_squared=True)

  if preprocess == 'average':
    # Average-pool the spectrogram with window and stride of the same width.
    expanded = tf.expand_dims(spectrogram, -1)
    features = tf.nn.pool(
        input=expanded,
        window_shape=[1, settings['average_window_width']],
        strides=[1, settings['average_window_width']],
        pooling_type='AVG',
        padding='SAME')
  elif preprocess == 'mfcc':
    features = audio_ops.mfcc(
        spectrogram,
        sample_rate,
        dct_coefficient_count=settings['fingerprint_width'])
  elif preprocess == 'micro':
    if not frontend_op:
      raise Exception(
          'Micro frontend op is currently not available when running TensorFlow'
          ' directly from Python, you need to build and run through Bazel, for'
          ' example'
          ' `bazel run tensorflow/examples/speech_commands:freeze_graph`')
    # The frontend is configured in milliseconds, recomputed here from the
    # sample counts stored in the model settings.
    settings_rate = settings['sample_rate']
    frontend_window_ms = (settings['window_size_samples'] *
                          1000) / settings_rate
    frontend_step_ms = (settings['window_stride_samples'] *
                        1000) / settings_rate
    # Scale the decoded audio by 32767 and cast it to int16 for the frontend.
    scaled_audio = tf.multiply(decoded.audio, 32767)
    int16_audio = tf.cast(scaled_audio, tf.int16)
    frontend_output = frontend_op.audio_microfrontend(
        int16_audio,
        sample_rate=settings_rate,
        window_size=frontend_window_ms,
        window_step=frontend_step_ms,
        num_channels=settings['fingerprint_width'],
        out_scale=1,
        out_type=tf.float32)
    features = tf.multiply(frontend_output, (10.0 / 256.0))
  else:
    raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
                    ' "average", or "micro")' % (preprocess))

  # Flatten each example to one row of fingerprint_size values.
  flat_features = tf.reshape(features, [-1, settings['fingerprint_size']])

  logits = models.create_model(
      flat_features,
      settings,
      model_architecture,
      is_training=False,
      runtime_settings={'clip_stride_ms': clip_stride_ms})

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
def main(_):
  """Synthesizes a long labeled test clip of words mixed over background audio.

  All configuration is read from the module-level FLAGS. Background segments
  are mixed in at a stride of one clip duration; each segment is 500 ms longer
  than the stride so neighbouring segments overlap while ramping in and out.
  Words from the testing set are then mixed in at a stride of clip duration
  plus word gap, each at a random offset inside its gap. The result is saved
  to FLAGS.output_audio_file, and one "label, time_ms" line per word is
  written to FLAGS.output_labels_file.

  Args:
    _: Unused positional argument.

  Raises:
    ValueError: If the testing set holds no clip for a label that was picked.
  """
  words_list = input_data.prepare_words_list(FLAGS.wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), FLAGS.sample_rate, FLAGS.clip_duration_ms,
      FLAGS.window_size_ms, FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
  audio_processor = input_data.AudioProcessor(
      '', FLAGS.data_dir, FLAGS.silence_percentage, 10,
      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings)

  output_audio_sample_count = FLAGS.sample_rate * FLAGS.test_duration_seconds
  output_audio = np.zeros((output_audio_sample_count,), dtype=np.float32)

  # Set up background audio.
  background_crossover_ms = 500
  background_segment_duration_ms = (
      FLAGS.clip_duration_ms + background_crossover_ms)
  background_segment_duration_samples = int(
      (background_segment_duration_ms * FLAGS.sample_rate) / 1000)
  background_segment_stride_samples = int(
      (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
  # Each ramp covers half of the crossover period.
  background_ramp_samples = int(
      ((background_crossover_ms / 2) * FLAGS.sample_rate) / 1000)

  # Mix the background audio into the main track.
  how_many_backgrounds = int(
      math.ceil(output_audio_sample_count / background_segment_stride_samples))
  for i in range(how_many_backgrounds):
    output_offset = int(i * background_segment_stride_samples)
    background_index = np.random.randint(len(audio_processor.background_data))
    background_samples = audio_processor.background_data[background_index]
    background_offset = np.random.randint(
        0, len(background_samples) - model_settings['desired_samples'])
    background_volume = np.random.uniform(0, FLAGS.background_volume)
    mix_in_audio_sample(output_audio, output_offset, background_samples,
                        background_offset, background_segment_duration_samples,
                        background_volume, background_ramp_samples,
                        background_ramp_samples)

  # Mix the words into the main track, noting their labels and positions.
  output_labels = []
  word_stride_ms = FLAGS.clip_duration_ms + FLAGS.word_gap_ms
  word_stride_samples = int((word_stride_ms * FLAGS.sample_rate) / 1000)
  clip_duration_samples = int(
      (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
  word_gap_samples = int((FLAGS.word_gap_ms * FLAGS.sample_rate) / 1000)
  how_many_words = int(
      math.floor(output_audio_sample_count / word_stride_samples))
  all_test_data, all_test_labels = audio_processor.get_unprocessed_data(
      -1, model_settings, 'testing')
  for i in range(how_many_words):
    # np.random.randint(0) raises ValueError, so a zero-length gap (for example
    # --word_gap_ms=0) gets no random jitter instead of crashing.
    if word_gap_samples > 0:
      gap_jitter = np.random.randint(word_gap_samples)
    else:
      gap_jitter = 0
    output_offset = int(i * word_stride_samples) + gap_jitter
    output_offset_ms = (output_offset * 1000) / FLAGS.sample_rate
    is_unknown = np.random.randint(100) < FLAGS.unknown_percentage
    if is_unknown:
      wanted_label = input_data.UNKNOWN_WORD_LABEL
    else:
      # Indices 0 and 1 of words_list are skipped when picking a wanted word.
      wanted_label = words_list[2 + np.random.randint(len(words_list) - 2)]
    # Scan the test set in a random order, starting at a random position,
    # until a clip with the wanted label turns up.
    test_data_start = np.random.randint(len(all_test_data))
    found_sample_data = None
    index_lookup = np.arange(len(all_test_data), dtype=np.int32)
    np.random.shuffle(index_lookup)
    for test_data_offset in range(len(all_test_data)):
      test_data_index = index_lookup[(
          test_data_start + test_data_offset) % len(all_test_data)]
      current_label = all_test_labels[test_data_index]
      if current_label == wanted_label:
        found_sample_data = all_test_data[test_data_index]
        break
    if found_sample_data is None:
      # Fail with a clear message rather than handing None to the mixer.
      raise ValueError(
          'No testing sample found for label "%s"' % wanted_label)
    mix_in_audio_sample(output_audio, output_offset, found_sample_data, 0,
                        clip_duration_samples, 1.0, 500, 500)
    output_labels.append({'label': wanted_label, 'time': output_offset_ms})

  input_data.save_wav_file(FLAGS.output_audio_file, output_audio,
                           FLAGS.sample_rate)
  tf.logging.info('Saved streaming test wav to %s', FLAGS.output_audio_file)

  with open(FLAGS.output_labels_file, 'w') as f:
    for output_label in output_labels:
      f.write('%s, %f\n' % (output_label['label'], output_label['time']))
  tf.logging.info('Saved streaming test labels to %s', FLAGS.output_labels_file)
示例#18
0
def wav_to_features(sample_rate, clip_duration_ms, window_size_ms,
                    window_stride_ms, feature_bin_count, quantize, preprocess,
                    input_wav, output_c_file):
  """Converts an audio file into its feature map, saved as a C source file.

  The generated file starts with a comment recording the arguments used,
  followed by `g_<name>_width`, `g_<name>_height` and `g_<name>_data`
  definitions, where <name> is the lower-cased base name of `input_wav`
  without its extension.

  Args:
    sample_rate: Expected sample rate of the wavs.
    clip_duration_ms: Expected duration in milliseconds of the wavs.
    window_size_ms: How long each spectrogram timeslice is.
    window_stride_ms: How far to move in time between spectrogram timeslices.
    feature_bin_count: How many bins to use for the feature fingerprint.
    quantize: If true, the data is written as `unsigned char` values scaled
      into 0..255 using input_data.get_features_range; otherwise as floats.
    preprocess: Spectrogram processing mode; "mfcc", "average" or "micro".
    input_wav: Path to the audio WAV file to read.
    output_c_file: Where to save the generated C source file.
  """

  # Start a new TensorFlow session.
  # NOTE(review): the session is never closed here; confirm whether callers
  # rely on it staying installed before adding a close().
  sess = tf.compat.v1.InteractiveSession()

  model_settings = models.prepare_model_settings(
      0, sample_rate, clip_duration_ms, window_size_ms, window_stride_ms,
      feature_bin_count, preprocess)
  audio_processor = input_data.AudioProcessor(None, None, 0, 0, '', 0, 0,
                                              model_settings, None)

  results = audio_processor.get_features_for_wav(input_wav, model_settings,
                                                 sess)
  features = results[0]

  # NOTE(review): the base name is used verbatim inside C identifiers, so a
  # file name containing characters such as '-' yields invalid C.
  variable_base = os.path.splitext(os.path.basename(input_wav).lower())[0]

  # Save a C source file containing the feature data as an array.
  with gfile.GFile(output_c_file, 'w') as f:
    f.write('/* File automatically created by\n')
    f.write(' * tensorflow/examples/speech_commands/wav_to_features.py \\\n')
    f.write(' * --sample_rate=%d \\\n' % sample_rate)
    f.write(' * --clip_duration_ms=%d \\\n' % clip_duration_ms)
    f.write(' * --window_size_ms=%d \\\n' % window_size_ms)
    f.write(' * --window_stride_ms=%d \\\n' % window_stride_ms)
    f.write(' * --feature_bin_count=%d \\\n' % feature_bin_count)
    if quantize:
      f.write(' * --quantize=1 \\\n')
    f.write(' * --preprocess="%s" \\\n' % preprocess)
    f.write(' * --input_wav="%s" \\\n' % input_wav)
    f.write(' * --output_c_file="%s" \\\n' % output_c_file)
    f.write(' */\n\n')
    f.write('const int g_%s_width = %d;\n' %
            (variable_base, model_settings['fingerprint_width']))
    f.write('const int g_%s_height = %d;\n' %
            (variable_base, model_settings['spectrogram_length']))
    if quantize:
      features_min, features_max = input_data.get_features_range(model_settings)
      f.write('const unsigned char g_%s_data[] = {' % variable_base)
      for i, value in enumerate(features.flatten()):
        quantized_value = int(
            round(
                (255 * (value - features_min)) / (features_max - features_min)))
        # Clamp into the range an unsigned char can hold.
        quantized_value = max(0, min(255, quantized_value))
        # Start a new, indented line every ten values.
        if i % 10 == 0:
          f.write('\n  ')
        f.write('%d, ' % (quantized_value))
    else:
      f.write('const float g_%s_data[] = {\n' % variable_base)
      for i, value in enumerate(features.flatten()):
        # Start a new, indented line every ten values.
        if i % 10 == 0:
          f.write('\n  ')
        # The separator must follow the value: a leading comma before the
        # first element is not a valid C initializer list.
        f.write('%f, ' % value)
    f.write('\n};\n')
示例#19
0
def main(_):
    """Trains a speech-commands model, validating periodically, then tests it.

    All configuration is read from the module-level FLAGS. The function builds
    the model graph with a cross-entropy loss and the selected optimizer,
    optionally resumes from FLAGS.start_checkpoint, writes the graph definition
    and label list to FLAGS.train_dir, and runs the training loop with the
    staged learning rates. Validation runs every FLAGS.eval_step_interval
    steps and on the last step; checkpoints are saved every
    FLAGS.save_step_interval steps and on the last step. After training, the
    testing set is evaluated and its confusion matrix and accuracy are logged.

    Args:
      _: Unused positional argument.

    Raises:
      Exception: If the training-step and learning-rate lists differ in
        length, or if FLAGS.optimizer is neither 'gradient_descent' nor
        'momentum'.
      AttributeError: Re-raised with an extended message when --quantize is
        set but tf.contrib.quantize is unavailable.
    """
    # Set the verbosity based on flags (default is INFO, so we see all messages)
    tf.compat.v1.logging.set_verbosity(FLAGS.verbosity)

    # Start a new TensorFlow session.
    sess = tf.compat.v1.InteractiveSession()

    # Begin by making sure we have the training data we need. If you already have
    # training data of your own, use `--data_url= ` on the command line to avoid
    # downloading.
    model_settings = models.prepare_model_settings(
        len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
        FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
        FLAGS.window_stride_ms, FLAGS.feature_bin_count, FLAGS.preprocess)
    audio_processor = input_data.AudioProcessor(
        FLAGS.data_url, FLAGS.data_dir,
        FLAGS.silence_percentage, FLAGS.unknown_percentage,
        FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
        FLAGS.testing_percentage, model_settings, FLAGS.summaries_dir)
    fingerprint_size = model_settings['fingerprint_size']
    label_count = model_settings['label_count']
    # Convert the time-shift range from milliseconds to a sample count.
    time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)
    # Figure out the learning rates for each training phase. Since it's often
    # effective to have high learning rates at the start of training, followed by
    # lower levels towards the end, the number of steps and learning rates can be
    # specified as comma-separated lists to define the rate at each stage. For
    # example --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001
    # will run 13,000 training loops in total, with a rate of 0.001 for the first
    # 10,000, and 0.0001 for the final 3,000.
    training_steps_list = list(
        map(int, FLAGS.how_many_training_steps.split(',')))
    learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
    if len(training_steps_list) != len(learning_rates_list):
        raise Exception(
            '--how_many_training_steps and --learning_rate must be equal length '
            'lists, but are %d and %d long instead' %
            (len(training_steps_list), len(learning_rates_list)))

    input_placeholder = tf.compat.v1.placeholder(tf.float32,
                                                 [None, fingerprint_size],
                                                 name='fingerprint_input')
    # When quantizing, the model input is the fake-quantized placeholder.
    # NOTE(review): the feed_dicts below feed `fingerprint_input`, which in
    # this case is the fake-quant output rather than the placeholder itself.
    if FLAGS.quantize:
        fingerprint_min, fingerprint_max = input_data.get_features_range(
            model_settings)
        fingerprint_input = tf.quantization.fake_quant_with_min_max_args(
            input_placeholder, fingerprint_min, fingerprint_max)
    else:
        fingerprint_input = input_placeholder

    logits, dropout_prob = models.create_model(fingerprint_input,
                                               model_settings,
                                               FLAGS.model_architecture,
                                               is_training=True)

    # Define loss and optimizer
    ground_truth_input = tf.compat.v1.placeholder(tf.int64, [None],
                                                  name='groundtruth_input')

    # Optionally we can add runtime checks to spot when NaNs or other symptoms of
    # numerical errors start occurring during training.
    control_dependencies = []
    if FLAGS.check_nans:
        checks = tf.compat.v1.add_check_numerics_ops()
        control_dependencies = [checks]

    # Create the back propagation and training evaluation machinery in the graph.
    with tf.compat.v1.name_scope('cross_entropy'):
        cross_entropy_mean = tf.compat.v1.losses.sparse_softmax_cross_entropy(
            labels=ground_truth_input, logits=logits)

    if FLAGS.quantize:
        try:
            tf.contrib.quantize.create_training_graph(quant_delay=0)
        except AttributeError as e:
            # Extend the error with installation guidance before re-raising.
            msg = e.args[0]
            msg += (
                '\n\n The --quantize option still requires contrib, which is not '
                'part of TensorFlow 2.0. Please install a previous version:'
                '\n    `pip install tensorflow<=1.15`')
            e.args = (msg, )
            raise e

    with tf.compat.v1.name_scope('train'), tf.control_dependencies(
            control_dependencies):
        learning_rate_input = tf.compat.v1.placeholder(
            tf.float32, [], name='learning_rate_input')
        if FLAGS.optimizer == 'gradient_descent':
            train_step = tf.compat.v1.train.GradientDescentOptimizer(
                learning_rate_input).minimize(cross_entropy_mean)
        elif FLAGS.optimizer == 'momentum':
            train_step = tf.compat.v1.train.MomentumOptimizer(
                learning_rate_input, .9,
                use_nesterov=True).minimize(cross_entropy_mean)
        else:
            raise Exception('Invalid Optimizer')
    # Evaluation ops: predicted class per row, per-batch accuracy and a
    # confusion matrix over label_count classes.
    predicted_indices = tf.argmax(input=logits, axis=1)
    correct_prediction = tf.equal(predicted_indices, ground_truth_input)
    confusion_matrix = tf.math.confusion_matrix(labels=ground_truth_input,
                                                predictions=predicted_indices,
                                                num_classes=label_count)
    evaluation_step = tf.reduce_mean(
        input_tensor=tf.cast(correct_prediction, tf.float32))
    with tf.compat.v1.get_default_graph().name_scope('eval'):
        tf.compat.v1.summary.scalar('cross_entropy', cross_entropy_mean)
        tf.compat.v1.summary.scalar('accuracy', evaluation_step)

    global_step = tf.compat.v1.train.get_or_create_global_step()
    increment_global_step = tf.compat.v1.assign(global_step, global_step + 1)

    saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables())

    # Merge the summaries created under the 'eval' scope and set up writers in
    # the train/ and validation/ subfolders of FLAGS.summaries_dir.
    merged_summaries = tf.compat.v1.summary.merge_all(scope='eval')
    train_writer = tf.compat.v1.summary.FileWriter(
        FLAGS.summaries_dir + '/train', sess.graph)
    validation_writer = tf.compat.v1.summary.FileWriter(FLAGS.summaries_dir +
                                                        '/validation')

    tf.compat.v1.global_variables_initializer().run()

    start_step = 1

    # When resuming, continue from the global step stored in the checkpoint.
    if FLAGS.start_checkpoint:
        models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
        start_step = global_step.eval(session=sess)

    tf.compat.v1.logging.info('Training from step: %d ', start_step)

    # Save graph.pbtxt.
    tf.io.write_graph(sess.graph_def, FLAGS.train_dir,
                      FLAGS.model_architecture + '.pbtxt')

    # Save list of words.
    with gfile.GFile(
            os.path.join(FLAGS.train_dir,
                         FLAGS.model_architecture + '_labels.txt'), 'w') as f:
        f.write('\n'.join(audio_processor.words_list))

    # Training loop.
    # NOTE(review): `xrange` is not a Python 3 builtin; presumably it is
    # imported at file level (e.g. from six.moves) -- confirm.
    training_steps_max = np.sum(training_steps_list)
    for training_step in xrange(start_step, training_steps_max + 1):
        # Figure out what the current learning rate is.
        training_steps_sum = 0
        for i in range(len(training_steps_list)):
            training_steps_sum += training_steps_list[i]
            if training_step <= training_steps_sum:
                learning_rate_value = learning_rates_list[i]
                break
        # Pull the audio samples we'll use for training.
        train_fingerprints, train_ground_truth = audio_processor.get_data(
            FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency,
            FLAGS.background_volume, time_shift_samples, 'training', sess)
        # Run the graph with this batch of training data.
        train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
            [
                merged_summaries,
                evaluation_step,
                cross_entropy_mean,
                train_step,
                increment_global_step,
            ],
            feed_dict={
                fingerprint_input: train_fingerprints,
                ground_truth_input: train_ground_truth,
                learning_rate_input: learning_rate_value,
                dropout_prob: 0.5
            })
        train_writer.add_summary(train_summary, training_step)
        tf.compat.v1.logging.info(
            'Step #%d: rate %f, accuracy %.1f%%, cross entropy %f' %
            (training_step, learning_rate_value, train_accuracy * 100,
             cross_entropy_value))
        is_last_step = (training_step == training_steps_max)
        if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step:
            set_size = audio_processor.set_size('validation')
            total_accuracy = 0
            total_conf_matrix = None
            for i in xrange(0, set_size, FLAGS.batch_size):
                validation_fingerprints, validation_ground_truth = (
                    audio_processor.get_data(FLAGS.batch_size, i,
                                             model_settings, 0.0, 0.0, 0,
                                             'validation', sess))
                # Run a validation step and capture training summaries for TensorBoard
                # with the `merged` op.
                validation_summary, validation_accuracy, conf_matrix = sess.run(
                    [merged_summaries, evaluation_step, confusion_matrix],
                    feed_dict={
                        fingerprint_input: validation_fingerprints,
                        ground_truth_input: validation_ground_truth,
                        dropout_prob: 1.0
                    })
                validation_writer.add_summary(validation_summary,
                                              training_step)
                # The final batch may be short, so weight each batch's
                # accuracy by its share of the whole set.
                batch_size = min(FLAGS.batch_size, set_size - i)
                total_accuracy += (validation_accuracy * batch_size) / set_size
                if total_conf_matrix is None:
                    total_conf_matrix = conf_matrix
                else:
                    total_conf_matrix += conf_matrix
            tf.compat.v1.logging.info('Confusion Matrix:\n %s' %
                                      (total_conf_matrix))
            tf.compat.v1.logging.info(
                'Step %d: Validation accuracy = %.1f%% (N=%d)' %
                (training_step, total_accuracy * 100, set_size))

        # Save the model checkpoint periodically.
        if (training_step % FLAGS.save_step_interval == 0
                or training_step == training_steps_max):
            checkpoint_path = os.path.join(FLAGS.train_dir,
                                           FLAGS.model_architecture + '.ckpt')
            tf.compat.v1.logging.info('Saving to "%s-%d"', checkpoint_path,
                                      training_step)
            saver.save(sess, checkpoint_path, global_step=training_step)

    # Final evaluation over the testing set, accumulated the same way as the
    # validation pass above.
    set_size = audio_processor.set_size('testing')
    tf.compat.v1.logging.info('set_size=%d', set_size)
    total_accuracy = 0
    total_conf_matrix = None
    for i in xrange(0, set_size, FLAGS.batch_size):
        test_fingerprints, test_ground_truth = audio_processor.get_data(
            FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess)
        test_accuracy, conf_matrix = sess.run(
            [evaluation_step, confusion_matrix],
            feed_dict={
                fingerprint_input: test_fingerprints,
                ground_truth_input: test_ground_truth,
                dropout_prob: 1.0
            })
        batch_size = min(FLAGS.batch_size, set_size - i)
        total_accuracy += (test_accuracy * batch_size) / set_size
        if total_conf_matrix is None:
            total_conf_matrix = conf_matrix
        else:
            total_conf_matrix += conf_matrix
    tf.compat.v1.logging.warn('Confusion Matrix:\n %s' % (total_conf_matrix))
    tf.compat.v1.logging.warn('Final test accuracy = %.1f%% (N=%d)' %
                              (total_accuracy * 100, set_size))
示例#20
0
def _pick_random_sample(samples, labels, wanted_label):
    """Return a uniformly random entry of `samples` whose label matches.

    Args:
        samples: Indexable collection of audio clips, parallel to `labels`.
        labels: Sequence of labels, one per entry of `samples`.
        wanted_label: Label to look for (compared with `==`).

    Returns:
        The chosen entry of `samples`, or None when no label matches.
    """
    matching = [
        index for index, label in enumerate(labels) if label == wanted_label
    ]
    if not matching:
        return None
    return samples[matching[np.random.randint(len(matching))]]


def main(_):
    """Build a long test WAV of words over background noise, plus labels.

    Reads its configuration from the module-level FLAGS. Background clips
    from the audio processor are mixed into a zeroed track at a stride of
    one clip duration; then randomly chosen test-set words are mixed in, one
    per (clip duration + word gap) slot, at a random position inside the gap.

    Side effects: writes the audio to FLAGS.output_audio_file and one
    "label, time_ms" line per mixed-in word to FLAGS.output_labels_file.

    Args:
        _: Unused positional argument.
    """
    words_list = input_data.prepare_words_list(FLAGS.wanted_words.split(','))
    model_settings = models.prepare_model_settings(len(words_list),
                                                   FLAGS.sample_rate,
                                                   FLAGS.clip_duration_ms,
                                                   FLAGS.window_size_ms,
                                                   FLAGS.window_stride_ms,
                                                   FLAGS.dct_coefficient_count)
    audio_processor = input_data.AudioProcessor('', FLAGS.data_dir,
                                                FLAGS.silence_percentage, 10,
                                                FLAGS.wanted_words.split(','),
                                                FLAGS.validation_percentage,
                                                FLAGS.testing_percentage,
                                                model_settings)

    output_audio_sample_count = FLAGS.sample_rate * FLAGS.test_duration_seconds
    output_audio = np.zeros((output_audio_sample_count, ), dtype=np.float32)

    # Set up background audio. Each segment is longer than its stride by
    # `background_crossover_ms`, so consecutive segments overlap; half of the
    # crossover is passed as the ramp length at each end.
    background_crossover_ms = 500
    background_segment_duration_ms = (FLAGS.clip_duration_ms +
                                      background_crossover_ms)
    background_segment_duration_samples = int(
        (background_segment_duration_ms * FLAGS.sample_rate) / 1000)
    background_segment_stride_samples = int(
        (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
    background_ramp_samples = int(
        ((background_crossover_ms / 2) * FLAGS.sample_rate) / 1000)

    # Mix the background audio into the main track.
    how_many_backgrounds = int(
        math.ceil(output_audio_sample_count /
                  background_segment_stride_samples))
    background_data = audio_processor.background_data
    if len(background_data) == 0:
        # np.random.randint(0) raises ValueError, so with no background clips
        # the track is simply left without background noise.
        tf.logging.warn('No background audio found; skipping background mix')
        how_many_backgrounds = 0
    for i in range(how_many_backgrounds):
        output_offset = int(i * background_segment_stride_samples)
        background_index = np.random.randint(len(background_data))
        background_samples = background_data[background_index]
        # randint needs a strictly positive range; a clip that is not longer
        # than one model window can only start at offset 0.
        max_background_offset = (len(background_samples) -
                                 model_settings['desired_samples'])
        if max_background_offset > 0:
            background_offset = np.random.randint(0, max_background_offset)
        else:
            background_offset = 0
        background_volume = np.random.uniform(0, FLAGS.background_volume)
        mix_in_audio_sample(output_audio, output_offset, background_samples,
                            background_offset,
                            background_segment_duration_samples,
                            background_volume, background_ramp_samples,
                            background_ramp_samples)

    # Mix the words into the main track, noting their labels and positions.
    output_labels = []
    word_stride_ms = FLAGS.clip_duration_ms + FLAGS.word_gap_ms
    word_stride_samples = int((word_stride_ms * FLAGS.sample_rate) / 1000)
    clip_duration_samples = int(
        (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
    word_gap_samples = int((FLAGS.word_gap_ms * FLAGS.sample_rate) / 1000)
    how_many_words = int(
        math.floor(output_audio_sample_count / word_stride_samples))
    all_test_data, all_test_labels = audio_processor.get_unprocessed_data(
        -1, model_settings, 'testing')
    for i in range(how_many_words):
        # Jitter the word's position within the gap; a zero-length gap leaves
        # no room to jitter (and randint(0) would raise ValueError).
        if word_gap_samples > 0:
            jitter_samples = np.random.randint(word_gap_samples)
        else:
            jitter_samples = 0
        output_offset = int(i * word_stride_samples) + jitter_samples
        output_offset_ms = (output_offset * 1000) / FLAGS.sample_rate
        is_unknown = np.random.randint(100) < FLAGS.unknown_percentage
        if is_unknown:
            wanted_label = input_data.UNKNOWN_WORD_LABEL
        else:
            # The first two entries of words_list are never picked here
            # (presumably the silence/unknown placeholders -- confirm against
            # input_data.prepare_words_list).
            wanted_label = words_list[2 +
                                      np.random.randint(len(words_list) - 2)]
        found_sample_data = _pick_random_sample(all_test_data,
                                                all_test_labels, wanted_label)
        if found_sample_data is None:
            # Nothing to mix in for this slot; leave it empty and unlabelled
            # rather than passing None to mix_in_audio_sample.
            tf.logging.warn('No test sample found for label "%s"; skipping',
                            wanted_label)
            continue
        mix_in_audio_sample(output_audio, output_offset, found_sample_data, 0,
                            clip_duration_samples, 1.0, 500, 500)
        output_labels.append({'label': wanted_label, 'time': output_offset_ms})

    input_data.save_wav_file(FLAGS.output_audio_file, output_audio,
                             FLAGS.sample_rate)
    tf.logging.info('Saved streaming test wav to %s', FLAGS.output_audio_file)

    with open(FLAGS.output_labels_file, 'w') as f:
        for output_label in output_labels:
            f.write('%s, %f\n' % (output_label['label'], output_label['time']))
    tf.logging.info('Saved streaming test labels to %s',
                    FLAGS.output_labels_file)