Example #1
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
    """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
  """

    words_list = input_data_filler.prepare_words_list_my(
        wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        decoded_sample_data.sample_rate,
        dct_coefficient_count=dct_coefficient_count)
    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']
    reshaped_input = tf.reshape(
        fingerprint_input,
        [-1, fingerprint_time_size * fingerprint_frequency_size])
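    # Shape check: fingerprint_input holds one row of dct_coefficient_count MFCC
    # values per analysis window, so the flattened reshaped_input should have
    # spectrogram_length * dct_coefficient_count features, matching
    # model_settings['fingerprint_size'] used during training.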

    logits = models.create_model(reshaped_input,
                                 model_settings,
                                 model_architecture,
                                 is_training=False,
                                 runtime_settings=runtime_settings)

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
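
# A minimal usage sketch (not part of the original script): freeze the inference
# graph built above into a GraphDef for deployment. The function name and its
# arguments are hypothetical, and models.load_variables_from_checkpoint is assumed
# to behave as in the TensorFlow speech_commands example this code is based on.
def freeze_inference_graph_sketch(checkpoint_path, output_dir, output_name,
                                  **graph_args):
  sess = tf.InteractiveSession()
  create_inference_graph(**graph_args)
  models.load_variables_from_checkpoint(sess, checkpoint_path)
  # Fold the trained variables into constants, keeping only the softmax output.
  frozen_graph_def = tf.graph_util.convert_variables_to_constants(
      sess, sess.graph_def, ['labels_softmax'])
  tf.train.write_graph(frozen_graph_def, output_dir, output_name, as_text=False)
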
def main(_):
  num = 0
  words_list = input_data_filler.prepare_words_list_my(FLAGS.wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), FLAGS.sample_rate, FLAGS.clip_duration_ms,
      FLAGS.window_size_ms, FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
  audio_processor = input_data_filler.AudioProcessor(
      '', FLAGS.data_dir, FLAGS.silence_percentage, 10,  # unknown percentage
      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings)

  output_audio_sample_count = FLAGS.sample_rate * FLAGS.test_duration_seconds
  output_audio = np.zeros((output_audio_sample_count,), dtype=np.float32)  # 600 s of audio by default

  # Set up background audio.
  background_crossover_ms = 500
  background_segment_duration_ms = (
      FLAGS.clip_duration_ms + background_crossover_ms)
  background_segment_duration_samples = int(
      (background_segment_duration_ms * FLAGS.sample_rate) / 1000)
  background_segment_stride_samples = int(
      (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
  background_ramp_samples = int(
      ((background_crossover_ms / 2) * FLAGS.sample_rate) / 1000)  # ramp the volume over half the crossover period

  # Mix the background audio into the main track.
  how_many_backgrounds = int(
      math.ceil(output_audio_sample_count / background_segment_stride_samples))
  for i in range(how_many_backgrounds):
    output_offset = int(i * background_segment_stride_samples)
    background_index = np.random.randint(len(audio_processor.background_data))
    background_samples = audio_processor.background_data[background_index]
    background_offset = np.random.randint(
        0, len(background_samples) - model_settings['desired_samples'])
    background_volume = np.random.uniform(0, FLAGS.background_volume)
    mix_in_audio_sample(output_audio, output_offset, background_samples,
                        background_offset, background_segment_duration_samples,
                        background_volume, background_ramp_samples,
                        background_ramp_samples)
  # mix_in_audio_sample(track_data, track_offset, sample_data, sample_offset,
  #                     clip_duration, sample_volume, ramp_in, ramp_out)
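  # Presumably mix_in_audio_sample scales sample_data by sample_volume, applies
  # linear fade-in / fade-out ramps of ramp_in / ramp_out samples at the clip
  # edges, and adds the result into track_data starting at track_offset; the
  # helper's implementation is not shown in this snippet.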
  # Mix the words into the main track, noting their labels and positions.
  output_labels = []
  word_stride_ms = FLAGS.clip_duration_ms + FLAGS.word_gap_ms
  word_stride_samples = int((word_stride_ms * FLAGS.sample_rate) / 1000)
  clip_duration_samples = int(
      (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
  word_gap_samples = int((FLAGS.word_gap_ms * FLAGS.sample_rate) / 1000)
  how_many_words = int(
      math.floor(output_audio_sample_count / word_stride_samples))
  all_test_data, all_test_labels = audio_processor.get_unprocessed_data(
      -1, model_settings, 'testing')
  for i in range(how_many_words):
    output_offset = (
        int(i * word_stride_samples) + np.random.randint(word_gap_samples))
    #output_offset = (
    #    int(i * word_stride_samples))
    output_offset_ms = (output_offset * 1000) / FLAGS.sample_rate
    is_unknown = np.random.randint(100) < FLAGS.unknown_percentage
    if is_unknown:
      wanted_label = input_data_filler.UNKNOWN_WORD_LABEL
      # wanted_label = 'unknown'
      num = num + 1
      print("is unknown " + str(num))
    else:
      wanted_label = words_list[1 + np.random.randint(len(words_list) - 1)]
      #wanted_label = words_list[2 + np.random.randint(len(words_list) - 2)]
    test_data_start = np.random.randint(len(all_test_data))
    found_sample_data = None
    index_lookup = np.arange(len(all_test_data), dtype=np.int32)
    np.random.shuffle(index_lookup)
    for test_data_offset in range(len(all_test_data)):
      test_data_index = index_lookup[(
          test_data_start + test_data_offset) % len(all_test_data)]
      current_label = all_test_labels[test_data_index]
      if current_label == wanted_label:
        found_sample_data = all_test_data[test_data_index]
        break
    # mix_in_audio_sample(track_data, track_offset, sample_data, sample_offset,
    #                     clip_duration, sample_volume, ramp_in, ramp_out)
    mix_in_audio_sample(output_audio, output_offset, found_sample_data, 0,
                        clip_duration_samples, 1.0, 325, 325)
    # Alternative with shorter ramps:
    # mix_in_audio_sample(output_audio, output_offset, found_sample_data, 0,
    #                     clip_duration_samples, 1.0, 5, 5)
    # Record the label and position so the ground-truth file written below is not empty.
    if not is_unknown:
      output_labels.append({'label': wanted_label, 'time': output_offset_ms})

  input_data_filler.save_wav_file(FLAGS.output_audio_file, output_audio,
                                  FLAGS.sample_rate)
  tf.logging.info('Saved streaming test wav to %s', FLAGS.output_audio_file)

  with open(FLAGS.output_labels_file, 'w') as f:
    for output_label in output_labels:
      f.write('%s, %f\n' % (output_label['label'], output_label['time']))
  tf.logging.info('Saved streaming test labels to %s', FLAGS.output_labels_file)
Example #3
def main(_):
  best_acc = 0
  best_step = 0
  best_acc_istrain = 0
  best_step_istrain = 0
  # We want to see all the logging messages for this tutorial.
  tf.logging.set_verbosity(tf.logging.INFO)

  # Start a new TensorFlow session.
  sess = tf.InteractiveSession()

  # Begin by making sure we have the training data we need. If you already have
  # training data of your own, use `--data_url= ` on the command line to avoid
  # downloading.
  model_settings = models.prepare_model_settings(
      len(input_data_filler.prepare_words_list_my(FLAGS.wanted_words.split(','))),
      FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
      FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
  audio_processor = input_data_filler.AudioProcessor(
      FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
      FLAGS.unknown_percentage,
      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings)
  fingerprint_size = model_settings['fingerprint_size']
  label_count = model_settings['label_count']
  time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)
  # Figure out the learning rates for each training phase. Since it's often
  # effective to have high learning rates at the start of training, followed by
  # lower levels towards the end, the number of steps and learning rates can be
  # specified as comma-separated lists to define the rate at each stage. For
  # example --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001
  # will run 13,000 training loops in total, with a rate of 0.001 for the first
  # 10,000, and 0.0001 for the final 3,000.
  training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
  learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
  if len(training_steps_list) != len(learning_rates_list):
    raise Exception(
        '--how_many_training_steps and --learning_rate must be equal length '
        'lists, but are %d and %d long instead' % (len(training_steps_list),
                                                   len(learning_rates_list)))
  ##############################################
  ############ TensorFlow modules ##########

  fingerprint_input = tf.placeholder(
      tf.float32, [None, fingerprint_size], name='fingerprint_input')

  # ############ Model creation ##########
  istrain = tf.placeholder(tf.bool, name='istrain')
  logits= models.create_model(
      fingerprint_input,
      model_settings,
      FLAGS.model_architecture,
      is_training=istrain)
  ############ Model creation ##########
  # logits, dropout_prob= models.create_model(
  #     fingerprint_input,
  #     model_settings,
  #     FLAGS.model_architecture,
  #     is_training=True)
  # Define loss and optimizer

  ############ Ground truth ##########
  ground_truth_input = tf.placeholder(
      tf.float32, [None, label_count], name='groundtruth_input')

  # Optionally we can add runtime checks to spot when NaNs or other symptoms of
  # numerical errors start occurring during training.
  control_dependencies = []
  if FLAGS.check_nans:
    checks = tf.add_check_numerics_ops()
    control_dependencies = [checks]

  # Create the back propagation and training evaluation machinery in the graph.
  ############ Cross-entropy computation ##########
  # with tf.name_scope('cross_entropy'):
  #   cross_entropy_mean = tf.reduce_mean(
  #       tf.nn.softmax_cross_entropy_with_logits(
  #           labels=ground_truth_input, logits=logits)) + beta*loss_norm
  with tf.name_scope('cross_entropy'):
    cross_entropy_mean = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            labels=ground_truth_input, logits=logits))
  tf.summary.scalar('cross_entropy', cross_entropy_mean)

  ############ Learning rate, accuracy, confusion matrix ##########
  # learning_rate_input    learning rate input (tf.placeholder)
  # train_step             training op (optimizer)
  # predicted_indices      predicted class indices
  # expected_indices       ground-truth class indices
  # correct_prediction     per-example correctness
  # confusion_matrix       confusion matrix
  # evaluation_step        classification accuracy (per evaluation)
  # global_step            global training step
  # increment_global_step  op that increments the global training step

  learning_rate_input = tf.placeholder(
      tf.float32, [], name='learning_rate_input')
  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  with tf.control_dependencies(update_ops):
    train_step = tf.train.AdamOptimizer(
        learning_rate_input).minimize(cross_entropy_mean)
  # with tf.name_scope('train'), tf.control_dependencies(control_dependencies):
  #   learning_rate_input = tf.placeholder(
  #       tf.float32, [], name='learning_rate_input')
  #  # train_step = tf.train.GradientDescentOptimizer(
  #     #  learning_rate_input).minimize(cross_entropy_mean)
  #   with tf.control_dependencies(update_ops):
  #       train_step = tf.train.AdamOptimizer(
  #           learning_rate_input).minimize(cross_entropy_mean)
  predicted_indices = tf.argmax(logits, 1)
  expected_indices = tf.argmax(ground_truth_input, 1)
  correct_prediction = tf.equal(predicted_indices, expected_indices)
  confusion_matrix = tf.confusion_matrix(
      expected_indices, predicted_indices, num_classes=label_count)
  evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
  acc = tf.summary.scalar('accuracy', evaluation_step)

  global_step = tf.train.get_or_create_global_step()
  increment_global_step = tf.assign(global_step, global_step + 1)


  saver = tf.train.Saver(tf.global_variables(), max_to_keep=None)  # keep every checkpoint (max_to_keep defaults to 5)

  # Merge all the summaries and write them out to /tmp/retrain_logs (by default)
  merged_summaries = tf.summary.merge_all()
  validation_merged_summaries = tf.summary.merge(
      [tf.get_collection(tf.GraphKeys.SUMMARIES, 'accuracy'),
       tf.get_collection(tf.GraphKeys.SUMMARIES, 'cross_entropy')])
  test_summaries = tf.summary.merge([acc])
  test_summaries_istrain = tf.summary.merge([acc])
  train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
                                       sess.graph)
  validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/validation')
  test_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/test')
  test_istrain_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/test_istrain')
  tf.global_variables_initializer().run()

  start_step = 1

  if FLAGS.start_checkpoint:
    models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
    start_step = global_step.eval(session=sess)

  tf.logging.info('Training from step: %d ', start_step)

  # Save graph.pbtxt.
  tf.train.write_graph(sess.graph_def, FLAGS.train_dir,
                       FLAGS.model_architecture + '.pbtxt')

  # Save list of words.
  with gfile.GFile(
      os.path.join(FLAGS.train_dir, FLAGS.model_architecture + '_labels.txt'),
      'w') as f:
    f.write('\n'.join(audio_processor.words_list))
  # Model size notes:
  #   model1: fc
  #   model2: conv (~940k parameters)
  #   model3: low_latency_conv (comparable to model1)
  #   model4: ~750k parameters
  # Training loop.
  #############################################
  ########           Main loop           ######
  #############################################
  training_steps_max = np.sum(training_steps_list)
  for training_step in xrange(start_step, training_steps_max + 1):
    # Figure out what the current learning rate is.
    #######   Learning-rate schedule (switches automatically)   #######
    if training_step < 12000 + 1:
        learning_rate_value = learning_rates_list[0] * 0.02**(training_step / 12000)
    else:
        learning_rate_value = learning_rates_list[0] * 0.02    # 0.015 12000
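    # Note: this file uses xrange, so under Python 2 (without
    # `from __future__ import division`) training_step / 12000 is integer division;
    # the decay factor then stays at 1.0 until step 12000 and drops to 0.02 in one
    # jump, rather than decaying smoothly as it would under true division.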
    training_steps_sum = 0
    # for i in range(len(training_steps_list)):
    #   training_steps_sum += training_steps_list[i]
    #   if training_step <= training_steps_sum:
    #     learning_rate_value = learning_rates_list[i]
    #     break

    # Pull the audio samples we'll use for training.
    #######   Pull a batch of data from the audio processor   ###############
    ## get_data(self, how_many, offset, model_settings, background_frequency,
    ##          background_volume_range, time_shift, mode, sess)
    ########################################################################
    train_fingerprints, train_ground_truth = audio_processor.get_data_my(
        FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency,
        FLAGS.background_volume, time_shift_samples, 'training', sess)
    #mid = np.abs(np.max(train_fingerprints) + np.min(train_fingerprints)) / 2
    #half = np.max(train_fingerprints) - np.min(train_fingerprints)
    #train_fingerprints = ((train_fingerprints + mid) / half * 255).astype(int)
    ####    Input normalization   ####
    # train_fingerprints=input_normalization(train_fingerprints)
    # Run the graph with this batch of training data.
    train_fingerprints = np.round(train_fingerprints)
    train_fingerprints = np.clip(train_fingerprints, -100, 100)
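    # Rounding to integers and clipping to [-100, 100] presumably emulates the
    # fixed-point input range of a quantized deployment target before the
    # fingerprints are fed to the network.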

    train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
        [
            merged_summaries, evaluation_step, cross_entropy_mean, train_step,
            increment_global_step
        ],
        feed_dict={
            fingerprint_input: train_fingerprints,
            ground_truth_input: train_ground_truth,
            learning_rate_input: learning_rate_value,
            istrain:True
        })
    train_writer.add_summary(train_summary, training_step)
    tf.logging.info('Step #%d: rate %f, accuracy %.1f%%, cross entropy %f' %
                    (training_step, learning_rate_value, train_accuracy * 100,
                     cross_entropy_value))
    is_last_step = (training_step == training_steps_max)
    if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step:
      set_size = audio_processor.set_size('validation')
      total_accuracy = 0
      total_conf_matrix = None
      #############################################
      ######## Accumulate accuracy and confusion matrix over the validation set ######
      for i in xrange(0, set_size, FLAGS.batch_size):
        validation_fingerprints, validation_ground_truth = (
            audio_processor.get_data_my(FLAGS.batch_size, i, model_settings, 0.0,
                                     0.0, 0, 'validation', sess))
        # mid = np.abs(np.max(validation_fingerprints) + np.min(validation_fingerprints)) / 2
        # half = np.max(validation_fingerprints) - np.min(validation_fingerprints)
        # validation_fingerprints = ((validation_fingerprints + mid) / half * 255).astype(int)
        # ####    Input normalization   ####
        # validation_fingerprints = input_normalization(validation_fingerprints)
        # Run a validation step and capture training summaries for TensorBoard
        # with the `merged` op.
        validation_fingerprints = np.round(validation_fingerprints)
        validation_fingerprints = np.clip(validation_fingerprints, -100, 100)
        validation_summaries, validation_accuracy, conf_matrix = sess.run(
            [validation_merged_summaries, evaluation_step, confusion_matrix],
            feed_dict={
                fingerprint_input: validation_fingerprints,
                ground_truth_input: validation_ground_truth,
                istrain: True
            })
        validation_writer.add_summary(validation_summaries, training_step)
        batch_size = min(FLAGS.batch_size, set_size - i)
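        # Weight each batch's accuracy by its size so total_accuracy ends up as the
        # mean over the whole validation set (the final batch may be smaller).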
        total_accuracy += (validation_accuracy * batch_size) / set_size
        if total_conf_matrix is None:
          total_conf_matrix = conf_matrix
        else:
          total_conf_matrix += conf_matrix

      tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
      tf.logging.info('Step %d: Validation accuracy = %.1f%% (N=%d)' %
                      (training_step, total_accuracy * 100, set_size))

      #############################################
      ########  Accumulate accuracy and confusion matrix over the test set  ######
      set_size = audio_processor.set_size('testing')
      tf.logging.info('set_size=%d', set_size)
      test_fingerprints, test_ground_truth = audio_processor.get_data_my(
          -1, 0, model_settings, 0.0, 0.0, 0, 'testing', sess)
      #mid = np.abs(np.max(test_fingerprints) + np.min(test_fingerprints)) / 2
      #half = np.max(test_fingerprints) - np.min(test_fingerprints)
      #test_fingerprints = ((test_fingerprints + mid) / half * 255).astype(int)
      test_fingerprints = np.round(test_fingerprints)
      test_fingerprints = np.clip(test_fingerprints, -100, 100)

      final_summary, test_accuracy, conf_matrix = sess.run(
          [test_summaries, evaluation_step, confusion_matrix],
          feed_dict={
              fingerprint_input: test_fingerprints,
              ground_truth_input: test_ground_truth,
              istrain: False
          })
      final_summary_istrain, test_accuracy_istrain = sess.run(
          [test_summaries_istrain, evaluation_step],
          feed_dict={
              fingerprint_input: test_fingerprints,
              ground_truth_input: test_ground_truth,
              istrain: True
          })
      if test_accuracy > best_acc:
          best_acc = test_accuracy
          best_step = training_step
      if test_accuracy_istrain > best_acc_istrain:
          best_acc_istrain = test_accuracy_istrain
          best_step_istrain = training_step
      test_writer.add_summary(final_summary, training_step)
      test_istrain_writer.add_summary(final_summary_istrain, training_step)
      tf.logging.info('Confusion Matrix:\n %s' % (conf_matrix))
      tf.logging.info('test accuracy = %.1f%% (N=%d)' % (test_accuracy * 100, 6882))
      tf.logging.info('test_istrain accuracy = %.1f%% (N=%d)' %
                      (test_accuracy_istrain * 100, 6882))

      tf.logging.info('Best test accuracy before now = %.1f%% (N=%d)' %
                      (best_acc * 100, 6882) + '  at step of ' + str(best_step))
      tf.logging.info('Best test_istrain accuracy before now = %.1f%% (N=%d)' %
                      (best_acc_istrain * 100, 6882) + '  at step of ' + str(best_step_istrain))
    # Save the model checkpoint periodically.
    if (training_step % FLAGS.save_step_interval == 0 or
        training_step == training_steps_max):
      checkpoint_path = os.path.join(FLAGS.train_dir + '/'+FLAGS.model_architecture,
                                     FLAGS.model_architecture + '.ckpt')
      tf.logging.info('Saving to "%s-%d"', checkpoint_path, training_step)
      saver.save(sess, checkpoint_path, global_step=training_step)
    print_line = ('Best test accuracy before now = %.1f%% (N=%d)' % (best_acc * 100, 6882) +
                  '  at step of ' + str(best_step) + '\n' +
                  'Best test_istrain accuracy before now = %.1f%% (N=%d)' %
                  (best_acc_istrain * 100, 6882) + '  at step of ' + str(best_step_istrain))
    if training_step == training_steps_max:
        with open(FLAGS.train_dir + '/' + FLAGS.model_architecture + '/details.txt', 'w') as f:
            f.write(print_line)
Example #4
window_stride_ms = 20
dct_coefficient_count = 10

pbs_path = 'tmp/pbs/l_ds_conv5000.pb'

# We want to see all the logging messages for this tutorial.
tf.logging.set_verbosity(tf.logging.INFO)

# Start a new TensorFlow session.
sess = tf.InteractiveSession()

model_settings = models.prepare_model_settings(
  len(input_data_filler.prepare_words_list_my(wanted_words.split(','))),
  sample_rate, clip_duration_ms, window_size_ms,
  window_stride_ms, dct_coefficient_count)
audio_processor = input_data_filler.AudioProcessor(
  data_url, data_dir, silence_percentage,
  unknown_percentage,
  wanted_words.split(','), validation_percentage,
  testing_percentage, model_settings)
time_shift_samples = int((time_shift_ms * sample_rate) / 1000)

fingerprint_size = model_settings['fingerprint_size']
label_count = model_settings['label_count']
#*********************************************************************
print(" *****************  audio processor  ********************")

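# Total number of training examples: wanted-word clips plus the separately
# tracked "unknown" clips (data_index and unknown_index are attributes of this
# customized AudioProcessor).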
training_datas = len(audio_processor.data_index['training']) + len(
    audio_processor.unknown_index['training'])