def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
    """Builds the inference version of the audio model.

    The ops created here start at a scalar string placeholder named
    'wav_data', decode it to single-channel samples, turn the samples into
    MFCC features, feed the flattened features to the requested model and end
    in a softmax op named 'labels_softmax'. Nothing is returned.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: Length of audio to analyze for the audio pattern.
      clip_stride_ms: How often to run recognition. Passed to the model as the
        'clip_stride_ms' runtime setting.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
    """
    labels = input_data.prepare_words_list(wanted_words.split(','))
    settings = models.prepare_model_settings(
        len(labels), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)

    # Input side: raw WAV bytes -> decoded samples -> spectrogram -> MFCC.
    wav_bytes = tf.placeholder(tf.string, [], name='wav_data')
    decoded = contrib_audio.decode_wav(
        wav_bytes,
        desired_channels=1,
        desired_samples=settings['desired_samples'],
        name='decoded_sample_data')
    power_spectrogram = contrib_audio.audio_spectrogram(
        decoded.audio,
        window_size=settings['window_size_samples'],
        stride=settings['window_stride_samples'],
        magnitude_squared=True)
    mfcc_features = contrib_audio.mfcc(
        power_spectrogram,
        decoded.sample_rate,
        dct_coefficient_count=dct_coefficient_count)

    # The model takes one flat feature vector per clip.
    flat_size = (settings['spectrogram_length'] *
                 settings['dct_coefficient_count'])
    flat_features = tf.reshape(mfcc_features, [-1, flat_size])

    logits = models.create_model(
        flat_features,
        settings,
        model_architecture,
        is_training=False,
        runtime_settings={'clip_stride_ms': clip_stride_ms})

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
    """Creates the ops an audio model needs for inference.

    Input is a scalar string placeholder named 'wav_data'; output is a softmax
    op named 'labels_softmax'. In between, the WAV data is decoded to a single
    channel, converted to MFCC features and flattened before being handed to
    the model. The function returns nothing.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: Length of audio to analyze for the audio pattern.
      clip_stride_ms: How often to run recognition. Forwarded to the model via
        the runtime settings dictionary.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
    """
    label_count = len(input_data.prepare_words_list(wanted_words.split(',')))
    cfg = models.prepare_model_settings(
        label_count, sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)
    runtime_cfg = {'clip_stride_ms': clip_stride_ms}

    wav_input = tf.placeholder(tf.string, [], name='wav_data')
    samples = contrib_audio.decode_wav(
        wav_input,
        desired_channels=1,
        desired_samples=cfg['desired_samples'],
        name='decoded_sample_data')
    spectrum = contrib_audio.audio_spectrogram(
        samples.audio,
        window_size=cfg['window_size_samples'],
        stride=cfg['window_stride_samples'],
        magnitude_squared=True)
    fingerprint = contrib_audio.mfcc(
        spectrum,
        samples.sample_rate,
        dct_coefficient_count=dct_coefficient_count)

    time_steps = cfg['spectrogram_length']
    frequency_bins = cfg['dct_coefficient_count']
    flattened = tf.reshape(fingerprint, [-1, time_steps * frequency_bins])

    logits = models.create_model(
        flattened, cfg, model_architecture,
        is_training=False, runtime_settings=runtime_cfg)

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
def testCreateModelFullyConnectedTraining(self):
    """Checks that the 'single_fc' training model yields two graph tensors."""
    settings = models.prepare_model_settings(10, 16000, 1000, 20, 10, 40)
    with self.test_session() as sess:
        zeros_input = tf.zeros([1, settings["fingerprint_size"]])
        outputs = models.create_model(zeros_input, settings, "single_fc", True)
        # Training mode is expected to hand back exactly (logits, dropout_prob).
        logits, dropout_prob = outputs
        created = (logits, dropout_prob)
        for tensor in created:
            self.assertIsNotNone(tensor)
        # Each returned tensor must also be retrievable from the session graph.
        for tensor in created:
            self.assertIsNotNone(sess.graph.get_tensor_by_name(tensor.name))
def main():
    """Builds the model settings and audio processor, then runs `tflite_test`.

    All configuration is read from the module-level FLAGS object; the TFLite
    model path comes from FLAGS.tflite_path.
    """
    label_count = len(data.prepare_words_list(FLAGS.wanted_words.split(',')))
    model_settings = models.prepare_model_settings(
        label_count,
        FLAGS.sample_rate,
        FLAGS.clip_duration_ms,
        FLAGS.window_size_ms,
        FLAGS.window_stride_ms,
        FLAGS.dct_coefficient_count)

    processor_options = dict(
        data_url=FLAGS.data_url,
        data_dir=FLAGS.data_dir,
        silence_percentage=FLAGS.silence_percentage,
        unknown_percentage=FLAGS.unknown_percentage,
        wanted_words=FLAGS.wanted_words.split(','),
        validation_percentage=FLAGS.validation_percentage,
        testing_percentage=FLAGS.testing_percentage,
        model_settings=model_settings)
    audio_processor = data.AudioProcessor(**processor_options)

    tflite_test(model_settings, audio_processor, FLAGS.tflite_path)
def _weights_to_array(model_weights):
    """Packs a list of weight arrays into a single NumPy array.

    Layers usually have differently shaped weights (e.g. kernel and bias).
    `np.array` refuses such ragged input with a ValueError on NumPy >= 1.24
    (and already did so for some shape combinations on older releases), so in
    that case the arrays are stored one per slot in a 1-D object array.

    Args:
      model_weights: List of NumPy arrays.

    Returns:
      A regular ndarray when the shapes allow it, otherwise a 1-D object array
      whose elements are the original weight arrays.
    """
    try:
        return np.array(model_weights)
    except ValueError:
        packed = np.empty(len(model_weights), dtype=object)
        # Element-wise assignment keeps NumPy from trying to broadcast.
        for index, weights in enumerate(model_weights):
            packed[index] = weights
        return packed


def get_weight():
    """Loads a checkpoint into a freshly built model and dumps its weights.

    The model is built from FLAGS, the checkpoint named by FLAGS.checkpoint is
    loaded (partial restores are accepted), the model summary is printed and
    the weights are handed to `write_txt` together with FLAGS.output_file.
    """
    words_list = data.prepare_words_list(FLAGS.wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), FLAGS.sample_rate, FLAGS.clip_duration_ms,
        FLAGS.window_size_ms, FLAGS.window_stride_ms,
        FLAGS.dct_coefficient_count)
    model = models.create_model(model_settings, FLAGS.model_architecture,
                                FLAGS.model_size_info)
    print(len(words_list), words_list)

    # expect_partial() silences warnings about checkpoint values that the
    # model does not consume.
    model.load_weights(FLAGS.checkpoint).expect_partial()
    model.summary()

    arr = _weights_to_array(model.get_weights())
    write_txt(arr, FLAGS.output_file)
def get_set(set_type):
    """Returns the data-index listing of one partition of the speech dataset.

    Uses a fixed ten-word vocabulary and fixed audio settings, reads from
    '/tmp/speech_dataset/' and prints the partition size and its listing
    before returning the listing.

    Args:
      set_type: Key into the audio processor's `data_index`, also passed to
        its `set_size` method.

    Returns:
      `audio_processor.data_index[set_type]`.
    """
    wanted_words = 'yes,no,up,down,left,right,on,off,stop,go'
    data_url = ''
    data_dir = '/tmp/speech_dataset/'
    sample_rate, clip_duration_ms = 16000, 1000
    window_size_ms, window_stride_ms = 30.0, 10.0
    dct_coefficient_count = 40
    silence_percentage = unknown_percentage = 0
    validation_percentage = testing_percentage = 1

    # Start a new TensorFlow session.
    sess = tf.InteractiveSession()

    label_count = len(input_data.prepare_words_list(wanted_words.split(',')))
    model_settings = models.prepare_model_settings(
        label_count, sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)
    audio_processor = input_data.AudioProcessor(
        data_url, data_dir, silence_percentage, unknown_percentage,
        wanted_words.split(','), validation_percentage, testing_percentage,
        model_settings)

    # The unprocessed testing data is fetched but its values are not used
    # below; the call is kept for whatever side effects it may have.
    data, labels = audio_processor.get_unprocessed_data(
        -1, model_settings, 'testing')

    print('CREATE ANNOTATION SUBSET: Printing annotation set size')
    print(audio_processor.set_size(set_type))

    annotation_listing = audio_processor.data_index[set_type]
    print('CREATE ANNOTATION SUBSET: Printing annotation set names')
    print(annotation_listing)
    return annotation_listing
def main(_):
    """Sets up the audio pipeline from FLAGS and prints the configuration.

    Creates an interactive session, derives the model settings from FLAGS,
    constructs the audio processor and prints the data URL, the data
    directory and the resulting model settings.

    Args:
      _: Unused positional argument.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    sess = tf.InteractiveSession()

    model_settings = models.prepare_model_settings(
        len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
        FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
        FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
    audio_processor = input_data.AudioProcessor(
        FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
        FLAGS.unknown_percentage, FLAGS.wanted_words.split(','),
        FLAGS.validation_percentage, FLAGS.testing_percentage, model_settings)

    # Single-argument print(...) calls behave the same on Python 2 and 3,
    # unlike the print statement, which is a SyntaxError on Python 3.
    print(FLAGS.data_url)
    print(FLAGS.data_dir)
    print(model_settings)
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
    """Builds the inference ops inside a brand-new graph and returns it.

    Within the new graph, a scalar string placeholder named 'wav_data' is
    decoded to a single channel, converted to MFCC features, flattened and
    passed to the model; the result ends in a softmax op named
    'labels_softmax'.

    Args:
      wanted_words: Comma-separated list of the words to recognize.
      sample_rate: Samples per second of the input audio.
      clip_duration_ms: Length of audio to analyze.
      clip_stride_ms: Forwarded to the model as the 'clip_stride_ms' runtime
        setting.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.

    Returns:
      The `tf.Graph` that holds the created ops.
    """
    inference_graph = tf.Graph()
    with inference_graph.as_default():
        label_count = len(
            input_data.prepare_words_list(wanted_words.split(',')))
        settings = models.prepare_model_settings(
            label_count, sample_rate, clip_duration_ms, window_size_ms,
            window_stride_ms, dct_coefficient_count)

        wav_bytes = tf.placeholder(tf.string, [], name='wav_data')
        decoded = contrib_audio.decode_wav(
            wav_bytes,
            desired_channels=1,
            desired_samples=settings['desired_samples'],
            name='decoded_sample_data')
        power_spectrogram = contrib_audio.audio_spectrogram(
            decoded.audio,
            window_size=settings['window_size_samples'],
            stride=settings['window_stride_samples'],
            magnitude_squared=True)
        mfcc_features = contrib_audio.mfcc(
            power_spectrogram,
            decoded.sample_rate,
            dct_coefficient_count=dct_coefficient_count)

        flat_size = (settings['spectrogram_length'] *
                     settings['dct_coefficient_count'])
        flat_features = tf.reshape(mfcc_features, [-1, flat_size])

        logits = models.create_model(
            flat_features,
            settings,
            model_architecture,
            is_training=False,
            runtime_settings={'clip_stride_ms': clip_stride_ms})

        # Create an output to use for inference.
        tf.nn.softmax(logits, name='labels_softmax')
    return inference_graph
def __init__(self, enable_function):
    """Builds the model, data pipeline, optimizer and loss from FLAGS.

    Args:
      enable_function: Stored unchanged on the instance as
        `self.enable_function`; how it is used is not visible in this method.
    """
    self.data_dir = FLAGS.data_dir
    self.check_dir = "./result/ck"

    self.model_settings = models.prepare_model_settings(
        len(data_process.prepare_words_list(FLAGS.wanted_words.split(","))),
        FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
        FLAGS.window_strides_ms, FLAGS.dct_coefficient_count)
    self.model = models.create_model(self.model_settings,
                                     FLAGS.model_architecture,
                                     FLAGS.model_size_info)
    self.audio_processor = data_process.AudioProcessor(
        data_dir=self.data_dir,
        silence_percentage=FLAGS.silence_percentage,
        unknown_percentage=FLAGS.unknown_percentage,
        wanted_words=FLAGS.wanted_words.split(","),
        model_settings=self.model_settings)

    # Decay the learning rate in a piecewise-constant way. The flag holds the
    # number of steps of each phase (the total below is their sum), so the
    # schedule boundaries are the cumulative step counts at which one phase
    # ends; the end of the last phase is not a boundary. Using the raw
    # per-phase counts would give wrong, possibly non-increasing boundaries
    # as soon as there are three or more phases.
    training_steps_list = list(map(int, FLAGS.how_many_train_steps.split(",")))
    learning_rates_list = list(map(float, FLAGS.learning_rate.split(",")))
    lr_boundary_list = [
        int(step) for step in np.cumsum(training_steps_list)[:-1]
    ]
    lr_schedule = tf.keras.optimizers.schedules.PiecewiseConstantDecay(
        boundaries=lr_boundary_list, values=learning_rates_list)

    # Specify optimizer.
    self.optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
    self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer,
                                          models=self.model)

    # Define loss; the model output is treated as unnormalized logits.
    self.loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True)
    self.enable_function = enable_function

    # One epoch covers FLAGS.eval_step_interval training steps.
    train_max_steps = np.sum(training_steps_list)
    self.epochs = int(np.ceil(train_max_steps / FLAGS.eval_step_interval))
def main(_):
    """Preprocesses the test audio files and saves the results as .npy files.

    Two arrays are written below '/home/guillaume/speech_dataset/test/numpy/':
    the dataset returned by the audio processor and the base names of the
    processed files. Both file names encode the window size, window stride
    and DCT coefficient count taken from FLAGS.

    Args:
      _: Unused positional argument.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    sess = tf.InteractiveSession()

    label_count = len(
        input_data_prediction.prepare_words_list(FLAGS.wanted_words.split(',')))
    model_settings = models.prepare_model_settings(
        label_count, FLAGS.sample_rate, FLAGS.clip_duration_ms,
        FLAGS.window_size_ms, FLAGS.window_stride_ms,
        FLAGS.dct_coefficient_count, FLAGS.num_layers, FLAGS.num_units,
        FLAGS.use_attn, FLAGS.attn_size)
    audio_processor = input_data_prediction.AudioProcessor(
        '/home/guillaume/speech_dataset/test/audio', model_settings)

    # Integer-valued text forms of the settings, used for logging and names.
    wsize = str(int(FLAGS.window_size_ms))
    wstride = str(int(FLAGS.window_stride_ms))
    dct = str(int(FLAGS.dct_coefficient_count))

    print('\n\npreprocessing audio files')
    print('fingerprint_size: ', model_settings['fingerprint_size'])
    print('window_size_ms: ', wsize)
    print('window_stride_ms: ', wstride)
    print('dct_coefficient_count: ', dct)

    dataset = audio_processor.get_data(model_settings, sess)

    save_dir = '/home/guillaume/speech_dataset/test/numpy/'
    name_suffix = '_wsize' + wsize + '_wstride' + wstride + '_dct' + dct + '_.npy'
    np.save(save_dir + 'test_dataset' + name_suffix, dataset)

    # Keep only the part after the last '/' of every test file path.
    base_names = [path.rsplit('/', 1)[-1]
                  for path in audio_processor.testing_data]
    np.save(save_dir + 'filenames' + name_suffix, np.array(base_names))
def run_inference(wanted_words, sample_rate, clip_duration_ms, window_size_ms,
                  window_stride_ms, dct_coefficient_count, model_architecture,
                  model_size_info):
    """Fetches the testing-set fingerprints in debugging mode and prints them.

    Args:
      wanted_words: Comma-separated list of the words to recognize. Used both
        for the label count and for the audio processor, so the two always
        agree.
      sample_rate: Samples per second of the input audio.
      clip_duration_ms: Length of audio to analyze.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model_architecture: Not used by this function's body.
      model_size_info: Not used by this function's body.

    The remaining data options (URL, directory, percentages) are read from
    FLAGS.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    sess = tf.InteractiveSession()

    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)

    # Build the processor from the same word list as the label count above;
    # reading FLAGS.wanted_words here would silently ignore the argument.
    audio_processor = input_data.AudioProcessor(
        FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
        FLAGS.unknown_percentage, wanted_words.split(','),
        FLAGS.validation_percentage, FLAGS.testing_percentage, model_settings)

    # Test set.
    set_size = audio_processor.set_size('testing')
    tf.logging.info('set_size=%d', set_size)
    test_fingerprints, test_ground_truth = audio_processor.get_data(
        set_size, 0, model_settings, 0.0, 0.0, 0, 'testing', sess,
        debugging=True,
        wav_path="speech_dataset\\up\\0a2b400e_nohash_0.wav")
    print(test_fingerprints)
def main(_):
    """Sets up the audio pipeline from FLAGS and prints the fingerprint size.

    Args:
      _: Unused positional argument.
    """
    # Show all logging messages.
    tf.logging.set_verbosity(tf.logging.INFO)

    # Start a new TensorFlow session.
    sess = tf.InteractiveSession()

    label_count = len(
        input_data.prepare_words_list(FLAGS.wanted_words.split(',')))
    model_settings = models.prepare_model_settings(
        label_count,
        FLAGS.sample_rate,
        FLAGS.clip_duration_ms,
        FLAGS.window_size_ms,
        FLAGS.window_stride_ms,
        FLAGS.dct_coefficient_count)

    # Constructing the processor is kept even though the object itself is not
    # used afterwards in this function.
    audio_processor = input_data.AudioProcessor(
        FLAGS.data_url,
        FLAGS.data_dir,
        FLAGS.silence_percentage,
        FLAGS.unknown_percentage,
        FLAGS.wanted_words.split(','),
        FLAGS.validation_percentage,
        FLAGS.testing_percentage,
        model_settings)

    print(model_settings['fingerprint_size'])
def main():
    """Quantizes the checkpointed model to TFLite and then tests the result.

    Configuration comes from FLAGS. The TFLite file is written to
    '<model_architecture>_quantized.tflite' and the same path is passed to
    `tflite_test` afterwards.
    """
    label_count = len(data.prepare_words_list(FLAGS.wanted_words.split(',')))
    model_settings = models.prepare_model_settings(
        label_count,
        FLAGS.sample_rate,
        FLAGS.clip_duration_ms,
        FLAGS.window_size_ms,
        FLAGS.window_stride_ms,
        FLAGS.dct_coefficient_count)

    processor_options = dict(
        data_url=FLAGS.data_url,
        data_dir=FLAGS.data_dir,
        silence_percentage=FLAGS.silence_percentage,
        unknown_percentage=FLAGS.unknown_percentage,
        wanted_words=FLAGS.wanted_words.split(','),
        validation_percentage=FLAGS.validation_percentage,
        testing_percentage=FLAGS.testing_percentage,
        model_settings=model_settings)
    audio_processor = data.AudioProcessor(**processor_options)

    tflite_path = '{}_quantized.tflite'.format(FLAGS.model_architecture)

    # Load floating point model from checkpoint and quantize it.
    quantize(model_settings, audio_processor, FLAGS.checkpoint, tflite_path)

    # Test the newly quantized model on the test set.
    tflite_test(model_settings, audio_processor, tflite_path)
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture,
                           model_size_info):
    """Creates the model ops needed for inference on precomputed features.

    The input is a float32 placeholder named 'fingerprint_4d' with shape
    [None, spectrogram_length, dct_coefficient_count, 1]; the output is a
    softmax op named 'labels_softmax'. The label count is fixed at 2 here.
    Nothing is returned.

    Args:
      wanted_words: Not used by this function's body.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: Length of audio to analyze for the audio pattern.
      clip_stride_ms: How often to run recognition. Passed to the model as the
        'clip_stride_ms' runtime setting.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
      model_size_info: Forwarded unchanged to `models.create_model`.
    """
    settings = models.prepare_model_settings(
        2, sample_rate, clip_duration_ms, window_size_ms, window_stride_ms,
        dct_coefficient_count)

    time_steps = settings['spectrogram_length']
    frequency_bins = settings['dct_coefficient_count']
    features_4d = tf.placeholder(
        tf.float32,
        [None, time_steps, frequency_bins, 1],
        name='fingerprint_4d')

    logits = models.create_model(
        features_4d,
        settings,
        model_architecture,
        model_size_info,
        is_training=False,
        runtime_settings={'clip_stride_ms': clip_stride_ms})

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
def get_dataset(num_of_samples):
    """Returns one batch of samples from the testing partition.

    Uses a fixed ten-word vocabulary, fixed audio settings and the
    speech_commands v0.02 archive URL with '/tmp/speech_dataset/' as the data
    directory. The TensorFlow session created for the fetch is closed before
    returning, so repeated calls do not pile up open sessions.

    Args:
      num_of_samples: Batch size requested from the audio processor.

    Returns:
      The (data, label) pair produced by `audio_processor.get_data`.
    """
    import input_data
    import models

    wanted_words = 'yes,no,up,down,left,right,on,off,stop,go'
    model_settings = models.prepare_model_settings(
        label_count=len(input_data.prepare_words_list(wanted_words.split(','))),
        sample_rate=16000,
        clip_duration_ms=1000,
        window_size_ms=40.0,
        window_stride_ms=20.0,
        dct_coefficient_count=10,
    )
    audio_processor = input_data.AudioProcessor(
        data_url='http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz',
        data_dir='/tmp/speech_dataset/',
        silence_percentage=10.0,
        unknown_percentage=10.0,
        wanted_words=wanted_words.split(','),
        validation_percentage=10,
        testing_percentage=10,
        model_settings=model_settings,
    )
    print(audio_processor)

    set_size = audio_processor.set_size('testing')
    sess = tf.InteractiveSession()
    try:
        tf.logging.info('set_size=%d', set_size)
        # Offset 0, no background noise and no time shift.
        data, label = audio_processor.get_data(
            num_of_samples, 0, model_settings, 0.0, 0.0, 0, 'testing', sess)
    finally:
        # Release the session even when fetching the batch fails.
        sess.close()
    return data, label
def wav_to_features(sample_rate, clip_duration_ms, window_size_ms,
                    window_stride_ms, feature_bin_count, quantize, preprocess,
                    input_wav, output_c_file):
    """Converts an audio file into its feature map and saves it as C source.

    The generated file starts with a comment recording the arguments, then
    defines `g_<name>_width`, `g_<name>_height` and a `g_<name>_data` array,
    where <name> is the lower-cased base name of `input_wav` without its
    extension.

    Args:
      sample_rate: Expected sample rate of the wavs.
      clip_duration_ms: Expected duration in milliseconds of the wavs.
      window_size_ms: How long each spectrogram timeslice is.
      window_stride_ms: How far to move in time between spectrogram timeslices.
      feature_bin_count: How many bins to use for the feature fingerprint.
      quantize: When true, the features are scaled to the 0..255 range and
        written as `unsigned char`; otherwise they are written as `float`.
      preprocess: Spectrogram processing mode; "mfcc", "average" or "micro".
      input_wav: Path to the audio WAV file to read.
      output_c_file: Where to save the generated C source file.
    """
    # Start a new TensorFlow session.
    sess = tf.compat.v1.InteractiveSession()

    model_settings = models.prepare_model_settings(
        0, sample_rate, clip_duration_ms, window_size_ms, window_stride_ms,
        feature_bin_count, preprocess)
    audio_processor = input_data.AudioProcessor(None, None, 0, 0, '', 0, 0,
                                                model_settings, None)

    results = audio_processor.get_features_for_wav(input_wav, model_settings,
                                                   sess)
    features = results[0]

    variable_base = os.path.splitext(os.path.basename(input_wav).lower())[0]

    # Save a C source file containing the feature data as an array.
    with gfile.GFile(output_c_file, 'w') as f:
        f.write('/* File automatically created by\n')
        f.write(
            ' * tensorflow/examples/speech_commands/wav_to_features.py \\\n')
        f.write(' * --sample_rate=%d \\\n' % sample_rate)
        f.write(' * --clip_duration_ms=%d \\\n' % clip_duration_ms)
        f.write(' * --window_size_ms=%d \\\n' % window_size_ms)
        f.write(' * --window_stride_ms=%d \\\n' % window_stride_ms)
        f.write(' * --feature_bin_count=%d \\\n' % feature_bin_count)
        if quantize:
            f.write(' * --quantize=1 \\\n')
        f.write(' * --preprocess="%s" \\\n' % preprocess)
        f.write(' * --input_wav="%s" \\\n' % input_wav)
        f.write(' * --output_c_file="%s" \\\n' % output_c_file)
        f.write(' */\n\n')

        f.write('const int g_%s_width = %d;\n' %
                (variable_base, model_settings['fingerprint_width']))
        f.write('const int g_%s_height = %d;\n' %
                (variable_base, model_settings['spectrogram_length']))

        if quantize:
            features_min, features_max = input_data.get_features_range(
                model_settings)
            f.write('const unsigned char g_%s_data[] = {' % variable_base)
            for index, value in enumerate(features.flatten()):
                quantized_value = int(
                    round((255 * (value - features_min)) /
                          (features_max - features_min)))
                # Clamp to the range of an unsigned char.
                quantized_value = max(0, min(255, quantized_value))
                # Start a new output line every ten values.
                if index % 10 == 0:
                    f.write('\n ')
                f.write('%d, ' % (quantized_value))
        else:
            f.write('const float g_%s_data[] = {\n' % variable_base)
            for index, value in enumerate(features.flatten()):
                if index % 10 == 0:
                    f.write('\n ')
                # The comma goes after the value: a leading comma before the
                # first element is not a valid C initializer, while a
                # trailing comma after the last one is.
                f.write('%f, ' % value)
        f.write('\n};\n')
def fold_batch_norm(wanted_words, sample_rate, clip_duration_ms, window_size_ms, window_stride_ms, dct_coefficient_count, model_architecture, model_size_info, checkpoint, include_silence=True, lower_frequency_limit=20, upper_frequency_limit=4000, filterbank_channel_count=40):
    """Folds batch-norm parameters into the preceding layer and saves a checkpoint.

    Rebuilds the model in a fresh default graph, restores its variables from
    `checkpoint`, and for every variable whose name contains 'moving_mean'
    rescales the matching weights and rewrites the matching biases using the
    moving mean, moving variance and beta of that batch-norm layer. The
    updated variables are saved to `checkpoint + '_bnfused'`. Nothing is
    returned; the default graph is reset and the session closed at the end.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: Length of audio to analyze for the audio pattern.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
      model_size_info: Forwarded unchanged to `models.create_model`.
      checkpoint: Checkpoint path to load; also the prefix of the saved output.
      include_silence: Forwarded to `input_data.prepare_words_list`.
      lower_frequency_limit: Forwarded to `models.prepare_model_settings`.
      upper_frequency_limit: Forwarded to `models.prepare_model_settings`.
      filterbank_channel_count: Forwarded to `models.prepare_model_settings`.
    """
    tf.reset_default_graph()
    tf.logging.set_verbosity(tf.logging.INFO)
    sess = tf.InteractiveSession()
    words_list = input_data.prepare_words_list(wanted_words.split(','), include_silence)
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms, window_stride_ms, dct_coefficient_count, lower_frequency_limit, upper_frequency_limit, filterbank_channel_count)
    fingerprint_input = tf.placeholder(
        tf.float32, [None, model_settings['fingerprint_size']], name='fingerprint_input')
    logits = models.create_model(fingerprint_input, model_settings, model_architecture, model_size_info, is_training=False)
    # NOTE(review): the evaluation ops below are created but never run in this
    # function.
    ground_truth_input = tf.placeholder(tf.float32, [None, model_settings['label_count']], name='groundtruth_input')
    predicted_indices = tf.argmax(logits, 1)
    expected_indices = tf.argmax(ground_truth_input, 1)
    correct_prediction = tf.equal(predicted_indices, expected_indices)
    confusion_matrix = tf.confusion_matrix(expected_indices, predicted_indices)
    evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    models.load_variables_from_checkpoint(sess, checkpoint)
    saver = tf.train.Saver(tf.global_variables())
    tf.logging.info(
        'Folding batch normalization layer parameters to preceding layer weights/biases'
    )
    # epsilon added to variance to avoid division by zero
    epsilon = 1e-3  # default epsilon for tf.slim.batch_norm
    # get batch_norm mean
    mean_variables = [
        v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) if 'moving_mean' in v.name
    ]
    for mean_var in mean_variables:
        mean_name = mean_var.name
        mean_values = sess.run(mean_var)
        # The sibling variables are located by rewriting the moving-mean name.
        variance_name = mean_name.replace('moving_mean', 'moving_variance')
        variance_var = [
            v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) if v.name == variance_name
        ][0]
        variance_values = sess.run(variance_var)
        beta_name = mean_name.replace('moving_mean', 'beta')
        beta_var = [
            v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) if v.name == beta_name
        ][0]
        beta_values = sess.run(beta_var)
        bias_name = mean_name.replace('batch_norm/moving_mean', 'biases')
        bias_var = [
            v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) if v.name == bias_name
        ][0]
        bias_values = sess.run(bias_var)
        # The weights are the first variable whose name contains both the
        # layer prefix and 'weights'.
        wt_name = mean_name.replace('batch_norm/moving_mean:0', '')
        wt_var = \
            [v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) if (wt_name in v.name and 'weights' in v.name)][0]
        wt_values = sess.run(wt_var)
        wt_name = wt_var.name
        # Update weights
        # The weights are indexed with four indices, so they must be 4-D; each
        # element is divided by sqrt(variance + epsilon) of its channel.
        tf.logging.info('Updating ' + wt_name)
        for l in range(wt_values.shape[3]):
            for k in range(wt_values.shape[2]):
                for j in range(wt_values.shape[1]):
                    for i in range(wt_values.shape[0]):
                        if "depthwise" in wt_name:  # depthwise batchnorm params are ordered differently
                            wt_values[i][j][k][l] *= 1.0 / np.sqrt(
                                variance_values[k] + epsilon)  # gamma (scale factor) is 1.0
                        else:
                            wt_values[i][j][k][l] *= 1.0 / np.sqrt(
                                variance_values[l] + epsilon)  # gamma (scale factor) is 1.0
        wt_values = sess.run(tf.assign(wt_var, wt_values))
        # Update biases
        # The channel axis is axis 2 for depthwise weights and axis 3 otherwise.
        tf.logging.info('Updating ' + bias_name)
        if "depthwise" in wt_name:
            depth_dim = wt_values.shape[2]
        else:
            depth_dim = wt_values.shape[3]
        for l in range(depth_dim):
            bias_values[l] = (1.0 * (bias_values[l] - mean_values[l]) / np.sqrt(variance_values[l] + epsilon)) + \
                beta_values[l]
        bias_values = sess.run(tf.assign(bias_var, bias_values))
    # Write fused weights to ckpt file
    tf.logging.info('Saving new checkpoint at ' + checkpoint + '_bnfused')
    saver.save(sess, checkpoint + '_bnfused')
    tf.reset_default_graph()
    sess.close()
def main(_): # We want to see all the logging messages for this tutorial. tf.logging.set_verbosity(tf.logging.INFO) # Start a new TensorFlow session. sess = tf.InteractiveSession() # Begin by making sure we have the training data we need. If you already have # training data of your own, use `--data_url= ` on the command line to avoid # downloading. model_settings = models.prepare_model_settings( len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))), FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms, FLAGS.window_stride_ms, FLAGS.dct_coefficient_count) audio_processor = input_data.AudioProcessor( FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage, FLAGS.unknown_percentage, FLAGS.wanted_words.split(','), FLAGS.validation_percentage, FLAGS.testing_percentage, model_settings) fingerprint_size = model_settings['fingerprint_size'] label_count = model_settings['label_count'] time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000) print(FLAGS.sample_rate) print(FLAGS.clip_duration_ms) print(FLAGS.window_size_ms) print(FLAGS.window_stride_ms) print(FLAGS.dct_coefficient_count) # get a set of decoded audio waves (in PCM format) from dataset train_fingerprints_unproc, train_ground_truth_unproc = audio_processor.get_unprocessed_data( 2, model_settings , 'training') print(train_fingerprints_unproc[1:2,:]) # f = open("wave.txt","w") # np.savetxt("wave.txt",train_fingerprints_unproc[1], delimiter=",") # f.close() # return # Figure out the learning rates for each training phase. Since it's often # effective to have high learning rates at the start of training, followed by # lower levels towards the end, the number of steps and learning rates can be # specified as comma-separated lists to define the rate at each stage. For # example --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001 # will run 13,000 training loops in total, with a rate of 0.001 for the first # 10,000, and 0.0001 for the final 3,000. 
training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(','))) learning_rates_list = list(map(float, FLAGS.learning_rate.split(','))) if len(training_steps_list) != len(learning_rates_list): raise Exception( '--how_many_training_steps and --learning_rate must be equal length ' 'lists, but are %d and %d long instead' % (len(training_steps_list), len(learning_rates_list))) fingerprint_input = tf.placeholder( tf.float32, [None, fingerprint_size], name='fingerprint_input') logits, dropout_prob , max_pool_value, first_conv_val, second_conv_val, first_bias_val,first_weights_val,second_bias_val,second_weights_val,final_fc_bias_val, final_fc_weights_val = models.create_model( fingerprint_input, model_settings, FLAGS.model_architecture, is_training=True) # Define loss and optimizer ground_truth_input = tf.placeholder( tf.float32, [None, label_count], name='groundtruth_input') # Optionally we can add runtime checks to spot when NaNs or other symptoms of # numerical errors start occurring during training. control_dependencies = [] if FLAGS.check_nans: checks = tf.add_check_numerics_ops() control_dependencies = [checks] # Create the back propagation and training evaluation machinery in the graph. 
with tf.name_scope('cross_entropy'): cross_entropy_mean = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits( labels=ground_truth_input, logits=logits)) tf.summary.scalar('cross_entropy', cross_entropy_mean) with tf.name_scope('train'), tf.control_dependencies(control_dependencies): learning_rate_input = tf.placeholder( tf.float32, [], name='learning_rate_input') train_step = tf.train.GradientDescentOptimizer( learning_rate_input).minimize(cross_entropy_mean) predicted_indices = tf.argmax(logits, 1) expected_indices = tf.argmax(ground_truth_input, 1) correct_prediction = tf.equal(predicted_indices, expected_indices) confusion_matrix = tf.confusion_matrix(expected_indices, predicted_indices) evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) tf.summary.scalar('accuracy', evaluation_step) global_step = tf.contrib.framework.get_or_create_global_step() increment_global_step = tf.assign(global_step, global_step + 1) saver = tf.train.Saver(tf.global_variables()) # Merge all the summaries and write them out to /tmp/retrain_logs (by default) merged_summaries = tf.summary.merge_all() train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train', sess.graph) validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/validation') tf.global_variables_initializer().run() start_step = 1 if FLAGS.start_checkpoint: models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint) start_step = global_step.eval(session=sess) tf.logging.info('Training from step: %d ', start_step) # Save graph.pbtxt. tf.train.write_graph(sess.graph.as_graph_def(add_shapes=True), FLAGS.train_dir, FLAGS.model_architecture + '.pbtxt') # Save list of words. with gfile.GFile( os.path.join(FLAGS.train_dir, FLAGS.model_architecture + '_labels.txt'), 'w') as f: f.write('\n'.join(audio_processor.words_list)) #Initialize the max of output conv2d tensors to zero max2_conv1=0 max2_conv2=0 maxF=open("max.log","w") # Training loop. 
training_steps_max = np.sum(training_steps_list) # !!!!!!!!!!!!! bypass training to generate layer output if FLAGS.save_layers : training_steps_max = -1 for training_step in xrange(start_step, training_steps_max + 1): # Figure out what the current learning rate is. training_steps_sum = 0 for i in range(len(training_steps_list)): training_steps_sum += training_steps_list[i] if training_step <= training_steps_sum: learning_rate_value = learning_rates_list[i] break # Pull the audio samples we'll use for training. train_fingerprints, train_ground_truth = audio_processor.get_data( FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency, FLAGS.background_volume, time_shift_samples, 'training', sess) # Run the graph with this batch of training data. train_summary, train_accuracy, cross_entropy_value, maxpool_summary,first_conv_max,second_conv_max, first_bias_max, first_weights_max, second_bias_max, second_weights_max, final_fc_bias_max, final_fc_weights_max , _, _ = sess.run( [ merged_summaries, evaluation_step, cross_entropy_mean, max_pool_value, first_conv_val, second_conv_val, first_bias_val, first_weights_val, second_bias_val, second_weights_val, final_fc_bias_val, final_fc_weights_val, train_step, increment_global_step ], feed_dict={ fingerprint_input: train_fingerprints, ground_truth_input: train_ground_truth, learning_rate_input: learning_rate_value, dropout_prob: 0.5 }) # if (training_step == start_step): # for i in range(0,39): # print ("********** printing file " + "maxpool" + str(i) + ".txt") # np.savetxt("maxpool" + str(i) + ".txt",maxpool_summary[0,i,:,:]) # print("*********************") # Just keep the max of max_conv1 and max_conv2 max2_conv1=max(max2_conv1,first_conv_max) max2_conv2=max(max2_conv2,second_conv_max) train_writer.add_summary(train_summary, training_step) tf.logging.info('Step #%d: rate %f, accuracy %.1f%%, cross entropy %f' % (training_step, learning_rate_value, train_accuracy * 100, cross_entropy_value)) is_last_step = 
(training_step == training_steps_max) if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step: set_size = audio_processor.set_size('validation') total_accuracy = 0 total_conf_matrix = None for i in xrange(0, set_size, FLAGS.batch_size): validation_fingerprints, validation_ground_truth = ( audio_processor.get_data(FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, 'validation', sess)) # Run a validation step and capture training summaries for TensorBoard # with the `merged` op. validation_summary, validation_accuracy, conf_matrix = sess.run( [merged_summaries, evaluation_step, confusion_matrix], feed_dict={ fingerprint_input: validation_fingerprints, ground_truth_input: validation_ground_truth, dropout_prob: 1.0 }) validation_writer.add_summary(validation_summary, training_step) batch_size = min(FLAGS.batch_size, set_size - i) total_accuracy += (validation_accuracy * batch_size) / set_size if total_conf_matrix is None: total_conf_matrix = conf_matrix else: total_conf_matrix += conf_matrix tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix)) tf.logging.info('Step %d: Validation accuracy = %.1f%% (N=%d)' % (training_step, total_accuracy * 100, set_size)) # Save the model checkpoint periodically. 
if (training_step % FLAGS.save_step_interval == 0 or training_step == training_steps_max): checkpoint_path = os.path.join(FLAGS.train_dir, FLAGS.model_architecture + '.ckpt') tf.logging.info('Saving to "%s-%d"', checkpoint_path, training_step) saver.save(sess, checkpoint_path, global_step=training_step) #Save the bias & weights tensors maximums in a file max_bias1=first_bias_max max_weights1=first_weights_max max_bias2=second_bias_max max_weights2=second_weights_max max_fc_bias=final_fc_bias_max max_fc_weights=final_fc_weights_max if (training_step == training_steps_max): maxF.write(str(max_bias1) + " \n") maxF.write(str(max_weights1) + " \n") maxF.write(str(max_bias2)+ " \n") maxF.write(str(max_weights2)+ " \n") maxF.write(str(max_fc_bias)+ " \n") maxF.write(str(max_fc_weights)+ " \n") # End of training loop #Now save the conv2d outputs tensors maxs and close the file maxF.write(str(max2_conv1) + " \n") maxF.write(str(max2_conv2)+ " \n") maxF.close() set_size = audio_processor.set_size('testing') tf.logging.info('set_size=%d', set_size) total_accuracy = 0 total_conf_matrix = None if FLAGS.save_layers : set_size = 1 print("set_size", set_size) for i in xrange(0, set_size, FLAGS.batch_size): test_fingerprints, test_ground_truth = audio_processor.get_data( FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess, FLAGS.save_layers) outfc,test_accuracy, conf_matrix = sess.run( [logits,evaluation_step, confusion_matrix], feed_dict={ fingerprint_input: test_fingerprints, ground_truth_input: test_ground_truth, dropout_prob: 1.0 }) batch_size = min(FLAGS.batch_size, set_size - i) total_accuracy += (test_accuracy * batch_size) / set_size if FLAGS.save_layers : np.savetxt(os.path.join("./data", "outFC_{}.txt".format(i)),outfc, delimiter=",") if total_conf_matrix is None: total_conf_matrix = conf_matrix else: total_conf_matrix += conf_matrix tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix)) tf.logging.info('Final test accuracy = %.1f%% (N=%d)' % 
(total_accuracy * 100, set_size))
def main(_):
    """Label pre-computed test fingerprints and write them to predictions.txt.

    Loads the fingerprint and filename arrays from the hard-coded ``.npy``
    paths, builds the model selected by ``FLAGS.model_architecture``,
    restores ``FLAGS.start_checkpoint`` when it is set, and writes one
    ``fname,label`` row per test clip.

    Args:
      _: Unused positional argument.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    test_fingerprints = np.load('../../speech_dataset/test/numpy/test_dataset_wsize50_wstride10_dct40_.npy')
    filenames = np.load('../../speech_dataset/test/numpy/filenames_wsize50_wstride10_dct40_.npy')
    assert len(test_fingerprints) == len(filenames)
    print('test_fingerprints: ', test_fingerprints.shape)

    # Start a new TensorFlow session.
    sess = tf.InteractiveSession()

    label_count = len(
        input_data_prediction.prepare_words_list(FLAGS.wanted_words.split(',')))
    model_settings = models.prepare_model_settings(
        label_count,
        FLAGS.sample_rate,
        FLAGS.clip_duration_ms,
        FLAGS.window_size_ms,
        FLAGS.window_stride_ms,
        FLAGS.dct_coefficient_count,
        FLAGS.num_layers,
        FLAGS.num_units,
        FLAGS.use_attn,
        FLAGS.attn_size,
        FLAGS)

    # The second axis of the loaded array is the per-clip fingerprint length.
    fingerprint_size = test_fingerprints.shape[1]
    fingerprint_input = tf.placeholder(
        tf.float32, [None, fingerprint_size], name='fingerprint_input')

    # The model is built with is_training=True; the dropout placeholder it
    # returns is fed 1.0 below when predicting.
    logits, dropout_prob = models.create_model(
        fingerprint_input,
        model_settings,
        FLAGS.model_architecture,
        is_training=True)

    predicted_indices = tf.argmax(logits, 1)

    print('\n\nFLAGS ===>', FLAGS)
    if FLAGS.start_checkpoint:
        models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)

    # Class index -> label string written to the output file.
    names = dict(enumerate([
        'silence',
        'unknown',
        'yes',
        'no',
        'up',
        'down',
        'left',
        'right',
        'on',
        'off',
        'stop',
        'go',
    ]))

    with open('predictions.txt', 'w') as out_file:
        out_file.write('fname,label\n')
        step = FLAGS.batch_size
        for start in range(0, len(test_fingerprints), step):
            if start % 10000 == 0:
                print('batch: ' + str(start))
            stop = min(start + step, len(test_fingerprints))
            batch_predictions = sess.run(
                predicted_indices,
                feed_dict={
                    fingerprint_input: test_fingerprints[start:stop, :],
                    dropout_prob: 1.0,
                })
            for fname, class_index in zip(filenames[start:stop],
                                          batch_predictions):
                out_file.write(fname + ',' + names[class_index] + '\n')
def main(_):
    """Train, validate and test a speech model with Adam and L2 regularization.

    Builds the model named by ``FLAGS.model_architecture``, trains it for
    ``FLAGS.how_many_training_steps`` steps at ``FLAGS.learning_rate``,
    periodically evaluates on the validation set and saves checkpoints, then
    reports accuracy and the confusion matrix on the testing set.

    Args:
      _: Unused positional argument.
    """
    # We want to see all the logging messages for this tutorial.
    tf.logging.set_verbosity(tf.logging.INFO)

    # Start a new TensorFlow session.
    sess = tf.InteractiveSession()

    model_settings = models.prepare_model_settings(
        len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
        FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
        FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
    audio_processor = input_data.AudioProcessor(
        FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
        FLAGS.unknown_percentage, FLAGS.wanted_words.split(','),
        FLAGS.validation_percentage, FLAGS.testing_percentage, model_settings)
    fingerprint_size = model_settings['fingerprint_size']
    label_count = model_settings['label_count']
    time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)

    training_steps = FLAGS.how_many_training_steps
    learning_rate = FLAGS.learning_rate

    # -----------------------------------------------------------------------
    # -----------------------------Placeholder-------------------------------
    # -----------------------------------------------------------------------
    fingerprint_input = tf.placeholder(
        tf.float32, [None, fingerprint_size], name='fingerprint_input')

    logits, dropout_prob, w_conv1, w_conv2 = models.create_model(
        fingerprint_input,
        model_settings,
        FLAGS.model_architecture,
        is_training=True)

    # Define loss and optimizer
    ground_truth_input = tf.placeholder(
        tf.int64, [None], name='groundtruth_input')

    # Optionally we can add runtime checks to spot when NaNs or other symptoms
    # of numerical errors start occurring during training.
    control_dependencies = []
    if FLAGS.check_nans:
        checks = tf.add_check_numerics_ops()
        control_dependencies = [checks]

    # -----------------------------------------------------------------------
    # -----------------Back propagation and training evaluation--------------
    # -----------------------------------------------------------------------
    reg_constant = 0.01

    # Create the back propagation and training evaluation machinery in the
    # graph.
    with tf.name_scope('cross_entropy'):
        # L2 regularization over the two convolution weight tensors.
        l2_reg = tf.reduce_sum(
            [tf.nn.l2_loss(w_conv1), tf.nn.l2_loss(w_conv2)])
        cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy(
            labels=ground_truth_input, logits=logits)
        loss = cross_entropy_mean + reg_constant * l2_reg
    tf.summary.scalar('cross_entropy', cross_entropy_mean)

    with tf.name_scope('train'), tf.control_dependencies(control_dependencies):
        # Adam optimizer. The regularized loss is what gets minimized so the
        # L2 penalty actually influences the weights; the plain cross entropy
        # is still the value that is summarized and logged.
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    predicted_indices = tf.argmax(logits, 1)
    correct_prediction = tf.equal(predicted_indices, ground_truth_input)
    confusion_matrix = tf.confusion_matrix(
        ground_truth_input, predicted_indices, num_classes=label_count)
    evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', evaluation_step)

    global_step = tf.train.get_or_create_global_step()
    increment_global_step = tf.assign(global_step, global_step + 1)

    saver = tf.train.Saver(tf.global_variables())

    # Merge all the summaries and write them out to /tmp/retrain_logs (by
    # default)
    merged_summaries = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
                                         sess.graph)
    validation_writer = tf.summary.FileWriter(
        FLAGS.summaries_dir + '/validation')

    tf.global_variables_initializer().run()

    start_step = 1

    if FLAGS.start_checkpoint:
        models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
        # Resume step numbering from the restored global step.
        start_step = global_step.eval(session=sess)

    tf.logging.info('Training from step: %d ', start_step)

    # Save graph.pbtxt.
    tf.train.write_graph(sess.graph_def, FLAGS.train_dir,
                         FLAGS.model_architecture + '.pbtxt')

    # Save list of words.
    with gfile.GFile(
            os.path.join(FLAGS.train_dir,
                         FLAGS.model_architecture + '_labels.txt'),
            'w') as f:
        f.write('\n'.join(audio_processor.words_list))

    # -----------------------------------------------------------------------
    # -----------------Training and validation-------------------------------
    # -----------------------------------------------------------------------
    # Training loop.
    training_steps_max = training_steps

    # Print the local time of beginning training
    beg_time = datetime.datetime.now()
    print("Beginning time : " + str(beg_time))

    for training_step in xrange(start_step, training_steps_max + 1):
        # Pull the audio samples we'll use for training.
        train_fingerprints, train_ground_truth = audio_processor.get_data(
            FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency,
            FLAGS.background_volume, time_shift_samples, 'training', sess)

        # Run the graph with this batch of training data.
        train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
            [
                merged_summaries, evaluation_step, cross_entropy_mean,
                train_step, increment_global_step
            ],
            feed_dict={
                fingerprint_input: train_fingerprints,
                ground_truth_input: train_ground_truth,
                dropout_prob: 0.5
            })
        train_writer.add_summary(train_summary, training_step)
        tf.logging.info(
            'Step #%d: rate %f, accuracy %.1f%%, cross entropy %f' %
            (training_step, learning_rate, train_accuracy * 100,
             cross_entropy_value))
        is_last_step = (training_step == training_steps_max)

        # Validation
        if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step:
            set_size = audio_processor.set_size('validation')
            total_accuracy = 0
            total_conf_matrix = None
            for i in xrange(0, set_size, FLAGS.batch_size):
                validation_fingerprints, validation_ground_truth = (
                    audio_processor.get_data(FLAGS.batch_size, i,
                                             model_settings, 0.0, 0.0, 0,
                                             'validation', sess))

                # Run a validation step and capture training summaries for
                # TensorBoard with the `merged` op.
                validation_summary, validation_accuracy, conf_matrix = sess.run(
                    [merged_summaries, evaluation_step, confusion_matrix],
                    feed_dict={
                        fingerprint_input: validation_fingerprints,
                        ground_truth_input: validation_ground_truth,
                        dropout_prob: 1.0
                    })
                validation_writer.add_summary(validation_summary,
                                              training_step)
                # The last batch may be short, so weight each batch by the
                # number of samples it contributes to the set.
                batch_size = min(FLAGS.batch_size, set_size - i)
                total_accuracy += (validation_accuracy * batch_size) / set_size
                if total_conf_matrix is None:
                    total_conf_matrix = conf_matrix
                else:
                    total_conf_matrix += conf_matrix
            tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
            tf.logging.info('Step %d: Validation accuracy = %.1f%% (N=%d)' %
                            (training_step, total_accuracy * 100, set_size))

        # Save the model checkpoint periodically.
        if (training_step % FLAGS.save_step_interval == 0 or
                training_step == training_steps_max):
            checkpoint_path = os.path.join(FLAGS.train_dir,
                                           FLAGS.model_architecture + '.ckpt')
            tf.logging.info('Saving to "%s-%d"', checkpoint_path,
                            training_step)
            saver.save(sess, checkpoint_path, global_step=training_step)

    # Print the local time of ending training (the start time is repeated so
    # both appear together at the end of the log).
    print("Beginning time : " + str(beg_time))
    print("Ending time : " + str(datetime.datetime.now()))

    # -----------------------------------------------------------------------
    # ------------------------------Test-------------------------------------
    # -----------------------------------------------------------------------
    set_size = audio_processor.set_size('testing')
    tf.logging.info('set_size=%d', set_size)
    total_accuracy = 0
    total_conf_matrix = None
    for i in xrange(0, set_size, FLAGS.batch_size):
        test_fingerprints, test_ground_truth = audio_processor.get_data(
            FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess)
        test_accuracy, conf_matrix = sess.run(
            [evaluation_step, confusion_matrix],
            feed_dict={
                fingerprint_input: test_fingerprints,
                ground_truth_input: test_ground_truth,
                dropout_prob: 1.0
            })
        batch_size = min(FLAGS.batch_size, set_size - i)
        total_accuracy += (test_accuracy * batch_size) / set_size
        if total_conf_matrix is None:
            total_conf_matrix = conf_matrix
        else:
            total_conf_matrix += conf_matrix
    tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
    tf.logging.info('Final test accuracy = %.1f%% (N=%d)' %
                    (total_accuracy * 100, set_size))
dataset = dataset.repeat(num_epochs) #dataset = dataset.batch(batch_size) dataset = dataset.padded_batch(batch_size, padded_shapes=get_padded_shapes(dataset)) iterator = dataset.make_one_shot_iterator() #iterator = dataset.make_initializable_iterator() return iterator.get_next() # Tell TensorFlow that the model will be built into the default Graph. with tf.Graph().as_default(): # Set parameters to convey to the model model_settings = models.prepare_model_settings(FLAGS.num_classes) # Input images and labels label_batch, feat2d_batch, shape_batch = inputs( TRAIN_FILE, batch_size=FLAGS.batch_size, num_epochs=FLAGS.num_epochs) # Build a Graph that computes predictions from the model logits, dropout_prob = models.create_model( feat2d_batch, shape_batch, model_settings, FLAGS.model_architecture, is_training=True) # Define loss with tf.name_scope('cross_entropy'): cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy(
def main(_):
    """Synthesize a long test recording and its label file for streaming tests.

    Background audio segments are mixed end to end into one track, test-set
    words are then mixed in at randomized offsets, and the result is written
    to ``FLAGS.output_audio_file`` with the label/time pairs going to
    ``FLAGS.output_labels_file``.

    Args:
      _: Unused positional argument.
    """
    words_list = input_data.prepare_words_list(FLAGS.wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list),
        FLAGS.sample_rate,
        FLAGS.clip_duration_ms,
        FLAGS.window_size_ms,
        FLAGS.window_stride_ms,
        FLAGS.feature_bin_count,
        'mfcc')
    audio_processor = input_data.AudioProcessor(
        '',
        FLAGS.data_dir,
        FLAGS.silence_percentage,
        10,
        FLAGS.wanted_words.split(','),
        FLAGS.validation_percentage,
        FLAGS.testing_percentage,
        model_settings,
        FLAGS.data_dir)

    sample_rate = FLAGS.sample_rate
    output_audio_sample_count = sample_rate * FLAGS.test_duration_seconds
    output_audio = np.zeros((output_audio_sample_count,), dtype=np.float32)

    # Background geometry: each segment outlasts the stride by the crossover
    # time, and half of the crossover is used as the ramp at each end.
    background_crossover_ms = 500
    segment_duration_ms = FLAGS.clip_duration_ms + background_crossover_ms
    segment_duration_samples = int((segment_duration_ms * sample_rate) / 1000)
    segment_stride_samples = int(
        (FLAGS.clip_duration_ms * sample_rate) / 1000)
    ramp_samples = int(((background_crossover_ms / 2) * sample_rate) / 1000)

    # Mix the background audio into the main track.
    background_count = int(
        math.ceil(output_audio_sample_count / segment_stride_samples))
    for segment in range(background_count):
        segment_offset = int(segment * segment_stride_samples)
        chosen = np.random.randint(len(audio_processor.background_data))
        noise = audio_processor.background_data[chosen]
        noise_offset = np.random.randint(
            0, len(noise) - model_settings['desired_samples'])
        noise_volume = np.random.uniform(0, FLAGS.background_volume)
        mix_in_audio_sample(output_audio, segment_offset, noise, noise_offset,
                            segment_duration_samples, noise_volume,
                            ramp_samples, ramp_samples)

    # Mix the words into the main track, noting their labels and positions.
    output_labels = []
    word_stride_ms = FLAGS.clip_duration_ms + FLAGS.word_gap_ms
    word_stride_samples = int((word_stride_ms * sample_rate) / 1000)
    clip_duration_samples = int((FLAGS.clip_duration_ms * sample_rate) / 1000)
    word_gap_samples = int((FLAGS.word_gap_ms * sample_rate) / 1000)
    word_count = int(
        math.floor(output_audio_sample_count / word_stride_samples))
    all_test_data, all_test_labels = audio_processor.get_unprocessed_data(
        -1, model_settings, 'testing')
    test_count = len(all_test_data)

    for word in range(word_count):
        # Jitter each word inside its slot by up to the configured gap.
        word_offset = (
            int(word * word_stride_samples) +
            np.random.randint(word_gap_samples))
        word_offset_ms = (word_offset * 1000) / sample_rate

        if np.random.randint(100) < FLAGS.unknown_percentage:
            wanted_label = input_data.UNKNOWN_WORD_LABEL
        else:
            # The first two entries of words_list are skipped when drawing.
            wanted_label = words_list[
                2 + np.random.randint(len(words_list) - 2)]

        # Scan the test set in a shuffled order from a random start and take
        # the first clip carrying the wanted label (None when there is none).
        scan_start = np.random.randint(test_count)
        shuffled = np.arange(test_count, dtype=np.int32)
        np.random.shuffle(shuffled)
        candidates = (shuffled[(scan_start + step) % test_count]
                      for step in range(test_count))
        found_sample_data = next(
            (all_test_data[idx] for idx in candidates
             if all_test_labels[idx] == wanted_label),
            None)

        mix_in_audio_sample(output_audio, word_offset, found_sample_data, 0,
                            clip_duration_samples, 1.0, 500, 500)
        output_labels.append({'label': wanted_label, 'time': word_offset_ms})

    input_data.save_wav_file(FLAGS.output_audio_file, output_audio,
                             FLAGS.sample_rate)
    tf.logging.info('Saved streaming test wav to %s', FLAGS.output_audio_file)

    with open(FLAGS.output_labels_file, 'w') as labels_file:
        for entry in output_labels:
            labels_file.write('%s, %f\n' % (entry['label'], entry['time']))
    tf.logging.info('Saved streaming test labels to %s',
                    FLAGS.output_labels_file)
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
    """Builds the inference graph: WAV bytes in, softmax over labels out.

    The graph decodes a WAV string fed to the ``wav_data`` placeholder,
    computes a spectrogram, turns it into features according to
    ``preprocess``, runs the selected model and exposes the result as the
    ``labels_softmax`` op. Nothing is returned; the ops are added to the
    default graph.

    Args:
      wanted_words: Comma-separated list of the words to recognize.
      sample_rate: Samples per second of the input audio.
      clip_duration_ms: Clip duration setting passed to the model settings.
      clip_stride_ms: How often recognition is run; passed to the model as
        the ``clip_stride_ms`` runtime setting.
      window_size_ms: Time slice duration used to estimate frequencies.
      window_stride_ms: Distance between consecutive time slices.
      feature_bin_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
      preprocess: Feature mode, either 'mfcc' or 'average'.

    Raises:
      Exception: If the preprocessing mode isn't recognized.
    """
    label_words = input_data.prepare_words_list(wanted_words.split(','))
    settings = models.prepare_model_settings(
        len(label_words), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, feature_bin_count, preprocess)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    # Input node: a scalar string holding the encoded WAV data.
    wav_input = tf.placeholder(tf.string, [], name='wav_data')
    decoded = contrib_audio.decode_wav(
        wav_input,
        desired_channels=1,
        desired_samples=settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded.audio,
        window_size=settings['window_size_samples'],
        stride=settings['window_stride_samples'],
        magnitude_squared=True)

    if preprocess == 'mfcc':
        features = contrib_audio.mfcc(
            spectrogram,
            sample_rate,
            dct_coefficient_count=settings['fingerprint_width'])
    elif preprocess == 'average':
        # Average-pool the spectrogram along its last axis in windows of
        # average_window_width.
        pool_width = settings['average_window_width']
        features = tf.nn.pool(
            tf.expand_dims(spectrogram, -1),
            window_shape=[1, pool_width],
            strides=[1, pool_width],
            pooling_type='AVG',
            padding='SAME')
    else:
        raise Exception('Unknown preprocess mode "%s" (should be "mfcc" or'
                        ' "average")' % (preprocess))

    flat_features = tf.reshape(features, [-1, settings['fingerprint_size']])

    logits = models.create_model(
        flat_features,
        settings,
        model_architecture,
        is_training=False,
        runtime_settings=runtime_settings)

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
def testPrepareModelSettings(self):
    """Checks that prepare_model_settings returns something for sample values."""
    settings = models.prepare_model_settings(10, 16000, 1000, 20, 10, 40)
    self.assertIsNotNone(settings)
def main():
    """Run one test batch through a restored model and dump data to disk.

    Parses the command line, rebuilds the model selected by
    ``args.model_architecture``, restores ``args.start_checkpoint`` and
    evaluates a single batch from the testing set. It then writes:

    * the quantized input features of every clip to ``./data`` as text,
    * when ``args.print_outputs`` is set, the layer outputs, weights and
      biases to ``./data``,
    * the raw features to ``./images`` plus 16-bit and 8-bit PGM images and
      an expected/predicted summary to ``args.directory``.
    """
    parser = create_parser()
    argcomplete.autocomplete(parser)
    args = parser.parse_args()
    print_outputs = args.print_outputs

    sess = tf.InteractiveSession()
    model_settings = models.prepare_model_settings(
        len(input_data.prepare_words_list(args.wanted_words.split(','))),
        args.sample_rate, args.clip_duration_ms, args.window_size_ms,
        args.window_stride_ms, args.dct_coefficient_count)

    # Build the audio processing graph + prepare data from dataset
    audio_processor = input_data.AudioProcessor(
        args.data_url, args.data_dir, args.silence_percentage,
        args.unknown_percentage, args.wanted_words.split(','),
        args.validation_percentage, args.testing_percentage, model_settings)

    label_count = model_settings['label_count']
    fingerprint_size = model_settings['fingerprint_size']

    fingerprint_input = tf.placeholder(
        tf.float32, [None, fingerprint_size], name='fingerprint_input')

    # Build the NN graph. With print_outputs the model also returns the
    # intermediate tensors that get dumped below.
    if print_outputs:
        (logits, first_conv_val, first_weights, first_bias, second_weights,
         second_bias, third_weights, second_conv_val) = models.create_model(
             fingerprint_input,
             model_settings,
             args.model_architecture,
             is_training=False,
             print_outputs=True)
    else:
        logits = models.create_model(
            fingerprint_input,
            model_settings,
            args.model_architecture,
            is_training=False,
            print_outputs=False)

    # load weights/biases from checkpoint
    models.load_variables_from_checkpoint(sess, args.start_checkpoint)

    # Evaluation ops
    ground_truth_input = tf.placeholder(
        tf.float32, [None, label_count], name='groundtruth_input')

    predicted_indices = tf.argmax(logits, 1)
    expected_indices = tf.argmax(ground_truth_input, 1)
    correct_prediction = tf.equal(predicted_indices, expected_indices)
    confusion_matrix = tf.confusion_matrix(expected_indices, predicted_indices)
    evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', evaluation_step)

    # generate test outputs
    set_size = audio_processor.set_size('testing')
    tf.logging.info('set_size=%d', set_size)
    batch_size = args.batch_size
    directory = args.directory

    test_fingerprints, test_ground_truth = audio_processor.get_data(
        batch_size, 0, model_settings, 0.0, 0.0, 0, 'testing', sess)

    # Run evaluation on a batch
    if print_outputs:
        (outfc, outconv1, weights1, bias1, weights2, bias2, weights3,
         outconv2, test_accuracy, expected, predicted) = sess.run(
             [
                 logits, first_conv_val, first_weights, first_bias,
                 second_weights, second_bias, third_weights, second_conv_val,
                 evaluation_step, expected_indices, predicted_indices
             ],
             feed_dict={
                 fingerprint_input: test_fingerprints,
                 ground_truth_input: test_ground_truth
             })
    else:
        outfc, test_accuracy, expected, predicted = sess.run(
            [logits, evaluation_step, expected_indices, predicted_indices],
            feed_dict={
                fingerprint_input: test_fingerprints,
                ground_truth_input: test_ground_truth
            })

    print("expected/predicted")
    print(expected)
    print(predicted)

    # Write each input as a comma-terminated list of integers (one value per
    # line) so it can be included from a .h file. Values are scaled by 64 and
    # rounded half up; each clip is treated as 98 * 40 values.
    for img_num in range(batch_size):
        in_feat = np.reshape(test_fingerprints[img_num] * 64 + 0.5, (98 * 40))
        in_feat_int = np.floor(in_feat).astype(int)
        np.savetxt(
            "./data/in_feat_{}_{}.txt".format(img_num, expected[img_num]),
            in_feat_int,
            delimiter=", ",
            newline=",\n",
            fmt='%d')

    if print_outputs:
        # The reshape dimensions below are hard-coded for one specific model
        # layout — NOTE(review): confirm they match the architecture in use.
        outconv1_2D = np.reshape(outconv1, (batch_size * 32, 39 * 16))
        first_weights_2D = np.reshape(weights1, (32, 8 * 20))
        weights2 = np.reshape(weights2, (10 * 4, 32, 32))
        second_weights_2D = np.reshape(weights2.transpose(), (32 * 32, 4 * 10))
        weights3 = np.reshape(weights3, (13 * 30, 32, 12))
        print("SHAPE WEIGHTS3")
        print(np.shape(weights3))
        third_weights_2D = np.reshape(weights3.transpose(),
                                      (12, 32 * 13 * 30))
        outconv2_2D = np.reshape(outconv2, (batch_size * 32, 30 * 13))
        print("SHAPE WEIGHTS2")
        print(np.shape(weights2))

        np.savetxt("./data/outFC.txt", outfc, delimiter=",")
        np.savetxt("./data/outConv1.txt", outconv1_2D, delimiter=",")
        np.savetxt("./data/weights1.txt", first_weights_2D, delimiter=",")
        np.savetxt("./data/bias1.txt", bias1, delimiter=",")
        np.savetxt("./data/weights2.txt", second_weights_2D * 1024 * 32,
                   delimiter=",")
        np.savetxt("./data/bias2.txt", bias2, delimiter=",")
        np.savetxt("./data/outConv2.txt", outconv2_2D, delimiter=",")
        # test_accuracy is a fraction, so scale it to match the %% in the
        # message (same convention as the final print below).
        tf.logging.info('test accuracy = %.1f%% (N=%d)' %
                        (test_accuracy * 100, batch_size))
        np.savetxt("./data/weights3.txt", third_weights_2D * 1024 * 32,
                   delimiter=",\n")

    # Dump every clip as raw floats and as 40x98 PGM images with 16-bit and
    # 8-bit pixels, and collect one summary line per clip.
    summary_lines = []
    for i in range(batch_size):
        s_16b = np.floor(test_fingerprints[i] * 64 + 0.5)  # Q10.6 found in nntool
        s_8b = np.floor(test_fingerprints[i] / 2.40380199 + 0.5)  # Scale found in nntool
        test_fingerprints[i].tofile("./images/features_float_{}.dat".format(
            str(i)))

        with open(
                os.path.join(
                    directory,
                    "features_q16_{}_{}_{}.pgm".format(expected[i],
                                                       predicted[i], i)),
                'wb') as f:
            hdr = 'P5' + '\n' + str(40) + ' ' + str(98) + ' ' + str(
                65535) + '\n'
            f.write(hdr.encode())
            np.int16(s_16b).tofile(f)

        with open(
                os.path.join(
                    directory,
                    "features_q8_{}_{}_{}.pgm".format(expected[i],
                                                      predicted[i], i)),
                'wb') as f:
            hdr = 'P5' + '\n' + str(40) + ' ' + str(98) + ' ' + str(
                255) + '\n'
            f.write(hdr.encode())
            np.int8(s_8b).tofile(f)

        summary_lines.append(
            'Input:\t./images/features_float_{}.dat\tExpected:\t{}\tPredicted:\t{}\t({})\n'.format(
                str(i), expected[i], predicted[i], outfc[i]))

    with open(os.path.join(directory, "output_expected_predicted.txt"),
              'w') as f:
        f.write(''.join(summary_lines))

    print("finished: test accuracy = %.1f%%" % (test_accuracy * 100))
def run_inference(wanted_words, sample_rate, clip_duration_ms, window_size_ms,
                  window_stride_ms, dct_coefficient_count, model_architecture,
                  model_size_info):
    """Evaluates a checkpoint on the training, validation and testing sets.

    Builds the requested model, restores its variables from
    ``FLAGS.checkpoint`` and, for each of the three data sets in turn, logs
    the set size, the accumulated confusion matrix and the overall accuracy.

    The model is configured from the arguments. The dataset location, the
    silence/unknown/validation/testing percentages, the batch size and the
    checkpoint path are read from ``FLAGS``.

    Args:
      wanted_words: Comma-separated list of the words we're trying to
        recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: Clip duration setting passed to the model settings.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
      model_size_info: Model dimensions; the expected length differs between
        architectures.
    """
    tf.logging.set_verbosity(tf.logging.INFO)
    sess = tf.InteractiveSession()

    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)

    # Use the same word list as the model settings so the label count and the
    # labels produced by the audio processor cannot disagree.
    audio_processor = input_data.AudioProcessor(
        FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
        FLAGS.unknown_percentage, wanted_words.split(','),
        FLAGS.validation_percentage, FLAGS.testing_percentage, model_settings)

    label_count = model_settings['label_count']
    fingerprint_size = model_settings['fingerprint_size']

    fingerprint_input = tf.placeholder(
        tf.float32, [None, fingerprint_size], name='fingerprint_input')

    logits = models.create_model(
        fingerprint_input,
        model_settings,
        model_architecture,
        model_size_info,
        is_training=False)

    ground_truth_input = tf.placeholder(
        tf.float32, [None, label_count], name='groundtruth_input')

    predicted_indices = tf.argmax(logits, 1)
    expected_indices = tf.argmax(ground_truth_input, 1)
    correct_prediction = tf.equal(predicted_indices, expected_indices)
    confusion_matrix = tf.confusion_matrix(
        expected_indices, predicted_indices, num_classes=label_count)
    evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    models.load_variables_from_checkpoint(sess, FLAGS.checkpoint)

    # (data set name understood by the audio processor, name used in the log)
    evaluation_sets = (
        ('training', 'Training'),
        ('validation', 'Validation'),
        ('testing', 'Test'),
    )
    for set_name, display_name in evaluation_sets:
        set_size = audio_processor.set_size(set_name)
        tf.logging.info('set_size=%d', set_size)
        total_accuracy = 0
        total_conf_matrix = None
        for i in xrange(0, set_size, FLAGS.batch_size):
            fingerprints, ground_truth = audio_processor.get_data(
                FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, set_name,
                sess)
            accuracy, conf_matrix = sess.run(
                [evaluation_step, confusion_matrix],
                feed_dict={
                    fingerprint_input: fingerprints,
                    ground_truth_input: ground_truth,
                })
            # The last batch may be short, so weight each batch by the number
            # of samples it contributes to the set.
            batch_size = min(FLAGS.batch_size, set_size - i)
            total_accuracy += (accuracy * batch_size) / set_size
            if total_conf_matrix is None:
                total_conf_matrix = conf_matrix
            else:
                total_conf_matrix += conf_matrix
        tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
        tf.logging.info('%s accuracy = %.2f%% (N=%d)' %
                        (display_name, total_accuracy * 100, set_size))
def main(_):
  """Builds the training graph, trains the model, and reports accuracy.

  All configuration comes from the module-level `FLAGS`. The function builds the
  model together with its loss, accuracy, and confusion-matrix ops, optionally
  restores variables from `FLAGS.start_checkpoint`, and then runs the training
  loop. Every `FLAGS.eval_step_interval` steps (and on the final step) it scores
  the validation partition; every `FLAGS.save_step_interval` steps (and on the
  final step) it writes a checkpoint. After training it scores the testing
  partition once.

  Args:
    _: Unused positional argument.

  Raises:
    Exception: If --how_many_training_steps and --learning_rate do not hold the
      same number of comma-separated entries.
  """
  # Log at INFO level so every progress message of this tutorial is visible.
  tf.logging.set_verbosity(tf.logging.INFO)

  # Every graph evaluation below runs through this session.
  sess = tf.InteractiveSession()

  # Make sure the training data is available. If you already have training
  # data of your own, use `--data_url= ` on the command line to avoid
  # downloading.
  label_total = len(
      input_data.prepare_words_list(FLAGS.wanted_words.split(',')))
  model_settings = models.prepare_model_settings(
      label_total, FLAGS.sample_rate, FLAGS.clip_duration_ms,
      FLAGS.window_size_ms, FLAGS.window_stride_ms, FLAGS.feature_bin_count,
      FLAGS.preprocess)
  audio_processor = input_data.AudioProcessor(
      FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
      FLAGS.unknown_percentage, FLAGS.wanted_words.split(','),
      FLAGS.validation_percentage, FLAGS.testing_percentage, model_settings,
      FLAGS.summaries_dir)
  fingerprint_size = model_settings['fingerprint_size']
  label_count = model_settings['label_count']
  time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)

  # The step counts and learning rates are comma-separated lists describing
  # consecutive training phases. For example
  # --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001
  # runs 13,000 steps in total: 10,000 at 0.001, then 3,000 at 0.0001.
  training_steps_list = [
      int(entry) for entry in FLAGS.how_many_training_steps.split(',')
  ]
  learning_rates_list = [
      float(entry) for entry in FLAGS.learning_rate.split(',')
  ]
  phase_count = len(training_steps_list)
  rate_count = len(learning_rates_list)
  if phase_count != rate_count:
    raise Exception(
        '--how_many_training_steps and --learning_rate must be equal length '
        'lists, but are %d and %d long instead' % (phase_count, rate_count))

  # Model input; when quantizing, route it through a fake-quantization op.
  input_placeholder = tf.placeholder(
      tf.float32, [None, fingerprint_size], name='fingerprint_input')
  fingerprint_input = input_placeholder
  if FLAGS.quantize:
    fingerprint_min, fingerprint_max = input_data.get_features_range(
        model_settings)
    fingerprint_input = tf.fake_quant_with_min_max_args(
        input_placeholder, fingerprint_min, fingerprint_max)

  logits, dropout_prob = models.create_model(
      fingerprint_input,
      model_settings,
      FLAGS.model_architecture,
      is_training=True)

  # Labels are fed as int64 class indices.
  ground_truth_input = tf.placeholder(
      tf.int64, [None], name='groundtruth_input')

  # Optional runtime checks that flag NaNs and similar numerical problems.
  control_dependencies = []
  if FLAGS.check_nans:
    control_dependencies = [tf.add_check_numerics_ops()]

  # Loss, optimizer, and evaluation ops.
  with tf.name_scope('cross_entropy'):
    cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy(
        labels=ground_truth_input, logits=logits)
  if FLAGS.quantize:
    tf.contrib.quantize.create_training_graph(quant_delay=0)
  with tf.name_scope('train'), tf.control_dependencies(control_dependencies):
    learning_rate_input = tf.placeholder(
        tf.float32, [], name='learning_rate_input')
    optimizer = tf.train.GradientDescentOptimizer(learning_rate_input)
    train_step = optimizer.minimize(cross_entropy_mean)
  predicted_indices = tf.argmax(logits, 1)
  correct_prediction = tf.equal(predicted_indices, ground_truth_input)
  confusion_matrix = tf.confusion_matrix(
      ground_truth_input, predicted_indices, num_classes=label_count)
  evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
  with tf.get_default_graph().name_scope('eval'):
    tf.summary.scalar('cross_entropy', cross_entropy_mean)
    tf.summary.scalar('accuracy', evaluation_step)

  global_step = tf.train.get_or_create_global_step()
  increment_global_step = tf.assign(global_step, global_step + 1)

  saver = tf.train.Saver(tf.global_variables())

  # Merge the 'eval' summaries and write them under FLAGS.summaries_dir.
  merged_summaries = tf.summary.merge_all(scope='eval')
  train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
                                       sess.graph)
  validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/validation')

  tf.global_variables_initializer().run()

  # Start at step 1, or at the restored global step when resuming.
  start_step = 1
  if FLAGS.start_checkpoint:
    models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
    start_step = global_step.eval(session=sess)

  tf.logging.info('Training from step: %d ', start_step)

  # Save graph.pbtxt.
  tf.train.write_graph(sess.graph_def, FLAGS.train_dir,
                       FLAGS.model_architecture + '.pbtxt')

  # Save the list of words, one per line.
  labels_path = os.path.join(FLAGS.train_dir,
                             FLAGS.model_architecture + '_labels.txt')
  with gfile.GFile(labels_path, 'w') as f:
    f.write('\n'.join(audio_processor.words_list))

  # Training loop.
  training_steps_max = np.sum(training_steps_list)
  for training_step in xrange(start_step, training_steps_max + 1):
    # Pick the learning rate of the phase that contains this step.
    steps_so_far = 0
    for phase_steps, phase_rate in zip(training_steps_list,
                                       learning_rates_list):
      steps_so_far += phase_steps
      if training_step <= steps_so_far:
        learning_rate_value = phase_rate
        break

    # Pull the audio samples used for this training step.
    train_fingerprints, train_ground_truth = audio_processor.get_data(
        FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency,
        FLAGS.background_volume, time_shift_samples, 'training', sess)

    # Run the graph with this batch of training data.
    train_fetches = [
        merged_summaries,
        evaluation_step,
        cross_entropy_mean,
        train_step,
        increment_global_step,
    ]
    train_feed = {
        fingerprint_input: train_fingerprints,
        ground_truth_input: train_ground_truth,
        learning_rate_input: learning_rate_value,
        dropout_prob: 0.5
    }
    train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
        train_fetches, feed_dict=train_feed)
    train_writer.add_summary(train_summary, training_step)
    tf.logging.info('Step #%d: rate %f, accuracy %.1f%%, cross entropy %f' %
                    (training_step, learning_rate_value, train_accuracy * 100,
                     cross_entropy_value))

    is_last_step = (training_step == training_steps_max)
    if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step:
      set_size = audio_processor.set_size('validation')
      total_accuracy = 0
      total_conf_matrix = None
      for offset in xrange(0, set_size, FLAGS.batch_size):
        validation_fingerprints, validation_ground_truth = (
            audio_processor.get_data(FLAGS.batch_size, offset, model_settings,
                                     0.0, 0.0, 0, 'validation', sess))
        # Run a validation batch and capture its summaries for TensorBoard.
        validation_feed = {
            fingerprint_input: validation_fingerprints,
            ground_truth_input: validation_ground_truth,
            dropout_prob: 1.0
        }
        validation_summary, validation_accuracy, conf_matrix = sess.run(
            [merged_summaries, evaluation_step, confusion_matrix],
            feed_dict=validation_feed)
        validation_writer.add_summary(validation_summary, training_step)
        # Weight each batch by its share of the whole partition.
        batch_size = min(FLAGS.batch_size, set_size - offset)
        total_accuracy += (validation_accuracy * batch_size) / set_size
        if total_conf_matrix is not None:
          total_conf_matrix += conf_matrix
        else:
          total_conf_matrix = conf_matrix
      tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
      tf.logging.info('Step %d: Validation accuracy = %.1f%% (N=%d)' %
                      (training_step, total_accuracy * 100, set_size))

    # Save the model checkpoint periodically.
    if training_step % FLAGS.save_step_interval == 0 or is_last_step:
      checkpoint_path = os.path.join(FLAGS.train_dir,
                                     FLAGS.model_architecture + '.ckpt')
      tf.logging.info('Saving to "%s-%d"', checkpoint_path, training_step)
      saver.save(sess, checkpoint_path, global_step=training_step)

  # Final pass over the testing partition.
  set_size = audio_processor.set_size('testing')
  tf.logging.info('set_size=%d', set_size)
  total_accuracy = 0
  total_conf_matrix = None
  for offset in xrange(0, set_size, FLAGS.batch_size):
    test_fingerprints, test_ground_truth = audio_processor.get_data(
        FLAGS.batch_size, offset, model_settings, 0.0, 0.0, 0, 'testing', sess)
    test_feed = {
        fingerprint_input: test_fingerprints,
        ground_truth_input: test_ground_truth,
        dropout_prob: 1.0
    }
    test_accuracy, conf_matrix = sess.run(
        [evaluation_step, confusion_matrix], feed_dict=test_feed)
    batch_size = min(FLAGS.batch_size, set_size - offset)
    total_accuracy += (test_accuracy * batch_size) / set_size
    if total_conf_matrix is not None:
      total_conf_matrix += conf_matrix
    else:
      total_conf_matrix = conf_matrix
  tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
  tf.logging.info('Final test accuracy = %.1f%% (N=%d)' % (total_accuracy * 100,
                                                           set_size))
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
  """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: Length in milliseconds of the audio clip to analyze.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    feature_bin_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    preprocess: How the spectrogram is processed to produce features, for
      example 'mfcc', 'average', 'micro', or 'rune'.

  Returns:
    Input and output tensor objects: the reshaped fingerprint tensor fed to the
    model, and the 'labels_softmax' output.

  Raises:
    Exception: If the preprocessing mode isn't recognized, or if 'micro' is
      requested while the micro frontend op is unavailable.
  """
  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, feature_bin_count, preprocess)
  runtime_settings = {'clip_stride_ms': clip_stride_ms}

  # Graph input: the raw bytes of a WAV file, decoded to mono samples.
  wav_data_placeholder = tf.compat.v1.placeholder(tf.string, [],
                                                  name='wav_data')
  decoded_sample_data = tf.audio.decode_wav(
      wav_data_placeholder,
      desired_channels=1,
      desired_samples=model_settings['desired_samples'],
      name='decoded_sample_data')
  spectrogram = audio_ops.audio_spectrogram(
      decoded_sample_data.audio,
      window_size=model_settings['window_size_samples'],
      stride=model_settings['window_stride_samples'],
      magnitude_squared=True)

  if preprocess == 'average':
    # Average-pool the spectrogram along its last axis.
    fingerprint_input = tf.nn.pool(
        input=tf.expand_dims(spectrogram, -1),
        window_shape=[1, model_settings['average_window_width']],
        strides=[1, model_settings['average_window_width']],
        pooling_type='AVG',
        padding='SAME')
  elif preprocess == 'mfcc':
    fingerprint_input = audio_ops.mfcc(
        spectrogram,
        sample_rate,
        dct_coefficient_count=model_settings['fingerprint_width'])
  elif preprocess == 'micro':
    if not frontend_op:
      raise Exception(
          'Micro frontend op is currently not available when running TensorFlow'
          ' directly from Python, you need to build and run through Bazel, for'
          ' example'
          ' `bazel run tensorflow/examples/speech_commands:freeze_graph`'
      )
    # The micro frontend takes its window parameters in milliseconds and its
    # audio as int16, so convert from the sample-based settings first.
    sample_rate = model_settings['sample_rate']
    window_size_ms = (model_settings['window_size_samples'] *
                      1000) / sample_rate
    window_step_ms = (model_settings['window_stride_samples'] *
                      1000) / sample_rate
    int16_input = tf.cast(
        tf.multiply(decoded_sample_data.audio, 32767), tf.int16)
    micro_frontend = frontend_op.audio_microfrontend(
        int16_input,
        sample_rate=sample_rate,
        window_size=window_size_ms,
        window_step=window_step_ms,
        num_channels=model_settings['fingerprint_width'],
        out_scale=1,
        out_type=tf.float32)
    fingerprint_input = tf.multiply(micro_frontend, (10.0 / 256.0))
  elif preprocess == 'rune':
    # No audio front end is applied in this mode: a random float32 vector in
    # [0, 26) stands in for the features. Size it from the model settings
    # (rather than a fixed 1960) so the reshape below always yields exactly one
    # fingerprint row.
    # NOTE(review): presumably real features are fed through the returned input
    # tensor at inference time -- confirm with the caller.
    fingerprint_input = np.random.uniform(
        0, 26, model_settings['fingerprint_size']).astype(np.float32)
  else:
    raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
                    ' "average", "micro", or "rune")' % (preprocess))

  fingerprint_size = model_settings['fingerprint_size']
  reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])

  logits = models.create_model(
      reshaped_input,
      model_settings,
      model_architecture,
      is_training=False,
      runtime_settings=runtime_settings)

  # Create an output to use for inference.
  softmax = tf.nn.softmax(logits, name='labels_softmax')

  return reshaped_input, softmax
def main(_):
  """Trains a stock-movement model and reports sign-prediction accuracy.

  All configuration comes from the module-level `FLAGS`. The function loads the
  trade data, builds a regression model trained with mean squared error, and
  measures accuracy as the fraction of predictions whose sign matches the sign
  of the target. It validates every `FLAGS.eval_step_interval` steps (and on
  the final step), checkpoints every `FLAGS.save_step_interval` steps (and on
  the final step), and finishes with a pass over the test data.

  Args:
    _: Unused positional argument.

  Raises:
    Exception: If --how_many_training_steps and --learning_rate do not hold the
      same number of comma-separated entries.
  """
  # We want to see all the logging messages for this tutorial.
  tf.logging.set_verbosity(tf.logging.INFO)

  # Start a new TensorFlow session.
  sess = tf.InteractiveSession()

  FLAGS.stock_codes = ts_stock_codes()
  FLAGS.stock_number = len(FLAGS.stock_codes)

  # Load trade price data.
  stock_codes = list(FLAGS.stock_codes)
  model_settings = models.prepare_model_settings(
      FLAGS.stock_number, FLAGS.data_out_number, FLAGS.proc_days,
      FLAGS.hidden1, FLAGS.hidden2)
  stock_data = input_data.StockTradeData(
      stock_codes, FLAGS.data_input_dir, FLAGS.data_output_dir,
      FLAGS.start_date, FLAGS.end_date, FLAGS.proc_days, FLAGS.verify_days,
      FLAGS.test_days)

  # Step counts and learning rates are parallel comma-separated lists, one
  # entry per training phase.
  training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
  learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
  if len(training_steps_list) != len(learning_rates_list):
    raise Exception(
        '--how_many_training_steps and --learning_rate must be equal length '
        'lists, but are %d and %d long instead' % (len(training_steps_list),
                                                   len(learning_rates_list)))

  # Scale the configured input width by the size of the data's third axis.
  model_settings['data_in_number'] = (
      model_settings['data_in_number'] * stock_data.stocks_test_data.shape[2])
  data_input_number = model_settings['data_in_number']
  stock_data_input = tf.placeholder(
      tf.float32, [None, data_input_number], name='data_input_number')

  logits, dropout_prob = models.create_model(
      stock_data_input,
      model_settings,
      FLAGS.model_architecture,
      is_training=True)

  # Define loss and optimizer
  stock_data_output = tf.placeholder(
      tf.float32, [None, 1], name='stock_data_output')

  # Optionally we can add runtime checks to spot when NaNs or other symptoms of
  # numerical errors start occurring during training.
  control_dependencies = []
  if FLAGS.check_nans:
    checks = tf.add_check_numerics_ops()
    control_dependencies = [checks]

  # Create the back propagation and training evaluation machinery in the graph.
  with tf.name_scope('mean_squared'):
    mean_squared_error = tf.losses.mean_squared_error(
        labels=stock_data_output, predictions=logits)
  tf.summary.scalar('mean_squared', mean_squared_error)
  with tf.name_scope('train'), tf.control_dependencies(control_dependencies):
    learning_rate_input = tf.placeholder(
        tf.float32, [], name='learning_rate_input')
    train_step = tf.train.GradientDescentOptimizer(
        learning_rate_input).minimize(mean_squared_error)
  # A prediction counts as correct when its sign matches the target's sign.
  stock_predicted_sign = tf.sign(logits)
  stock_truth_sign = tf.sign(stock_data_output)
  correct_prediction = tf.equal(stock_predicted_sign, stock_truth_sign)
  evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
  tf.summary.scalar('accuracy', evaluation_step)

  global_step = tf.train.get_or_create_global_step()
  increment_global_step = tf.assign(global_step, global_step + 1)

  saver = tf.train.Saver(tf.global_variables())

  # Merge all the summaries and write them out under FLAGS.summaries_dir.
  merged_summaries = tf.summary.merge_all()
  train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
                                       sess.graph)
  validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/validation')

  tf.global_variables_initializer().run()

  start_step = 1

  if FLAGS.start_checkpoint:
    models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
    start_step = global_step.eval(session=sess)

  tf.logging.info('Training from step: %d ', start_step)

  # Save graph.pbtxt.
  tf.train.write_graph(sess.graph_def, FLAGS.train_dir,
                       FLAGS.model_architecture + '.pbtxt')

  # Save the sign of each close price for the trained stock, one per line.
  with gfile.GFile(
      os.path.join(FLAGS.train_dir, FLAGS.model_architecture + '_labels.txt'),
      'w') as f:
    stock_close_price = stock_data.stocks_test_data.loc[:, :, 'close']
    close_signs = np.sign(stock_close_price.loc[:, FLAGS.train_stock].values)
    # Join the per-element strings; joining str(array) directly would put every
    # character of the array's repr on its own line.
    f.write('\n'.join(str(sign) for sign in close_signs))

  # Training loop.
  training_steps_max = np.sum(training_steps_list)
  for training_step in xrange(start_step, training_steps_max + 1):
    # Figure out what the current learning rate is.
    training_steps_sum = 0
    for phase_steps, phase_rate in zip(training_steps_list,
                                       learning_rates_list):
      training_steps_sum += phase_steps
      if training_step <= training_steps_sum:
        learning_rate_value = phase_rate
        break

    # Pull the stock samples we'll use for training.
    train_stock_truth, train_stock_input = stock_data.input_func(
        FLAGS.train_stock, FLAGS.batch_size, FLAGS.future_day,
        input_data.ProcessDataType.train)

    # Run the graph with this batch of training data.
    train_summary, train_accuracy, mean_squared_value, _, _ = sess.run(
        [
            merged_summaries, evaluation_step, mean_squared_error, train_step,
            increment_global_step
        ],
        feed_dict={
            stock_data_input: train_stock_input,
            stock_data_output: train_stock_truth,
            learning_rate_input: learning_rate_value,
            dropout_prob: 0.5
        })
    train_writer.add_summary(train_summary, training_step)
    tf.logging.info(
        'Step #%d: rate %f, accuracy %.1f%%, mean squared value %f' %
        (training_step, learning_rate_value, train_accuracy * 100,
         mean_squared_value))

    is_last_step = (training_step == training_steps_max)
    if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step:
      set_size = FLAGS.verify_days
      total_accuracy = 0
      for i in xrange(0, set_size, FLAGS.batch_size):
        verify_stock_truth, verify_stock_input = stock_data.input_func(
            FLAGS.train_stock, FLAGS.batch_size, FLAGS.future_day,
            input_data.ProcessDataType.verify)
        # Run a validation step and capture its summaries for TensorBoard.
        validation_summary, validation_accuracy, _ = sess.run(
            [merged_summaries, evaluation_step, mean_squared_error],
            feed_dict={
                stock_data_input: verify_stock_input,
                stock_data_output: verify_stock_truth,
                dropout_prob: 1.0
            })
        validation_writer.add_summary(validation_summary, training_step)
        # Weight each batch by its share of the verification window.
        batch_size = min(FLAGS.batch_size, set_size - i)
        total_accuracy += (validation_accuracy * batch_size) / set_size
      tf.logging.info('Step %d: Validation accuracy = %.1f%% (N=%d)' %
                      (training_step, total_accuracy * 100, set_size))

    # Save the model checkpoint periodically.
    if (training_step % FLAGS.save_step_interval == 0 or
        training_step == training_steps_max):
      checkpoint_path = os.path.join(FLAGS.train_dir,
                                     FLAGS.model_architecture + '.ckpt')
      tf.logging.info('Saving to "%s-%d"', checkpoint_path, training_step)
      saver.save(sess, checkpoint_path, global_step=training_step)

  # Final pass over the test data.
  set_size = FLAGS.test_days
  tf.logging.info('set_size=%d', set_size)
  total_accuracy = 0
  for i in xrange(0, set_size, FLAGS.batch_size):
    test_stock_truth, test_stock_input = stock_data.input_func(
        FLAGS.train_stock, FLAGS.batch_size, FLAGS.future_day,
        input_data.ProcessDataType.test)
    test_accuracy, _ = sess.run(
        [evaluation_step, mean_squared_error],
        feed_dict={
            stock_data_input: test_stock_input,
            stock_data_output: test_stock_truth,
            dropout_prob: 1.0
        })
    batch_size = min(FLAGS.batch_size, set_size - i)
    total_accuracy += (test_accuracy * batch_size) / set_size
  tf.logging.info('Final test accuracy = %.1f%% (N=%d)' % (total_accuracy * 100,
                                                           set_size))
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
  """Builds the inference version of the audio model in the default graph.

  The graph takes the contents of a WAV file through the 'wav_data'
  placeholder, turns it into a feature fingerprint using the requested
  preprocessing mode, runs the model, and exposes the class probabilities as
  the 'labels_softmax' node. Nothing is returned.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    feature_bin_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    preprocess: How the spectrogram is processed to produce features, for
      example 'mfcc', 'average', or 'micro'.

  Raises:
    Exception: If the preprocessing mode isn't recognized, or if 'micro' is
      requested while the micro frontend op is unavailable.
  """
  label_names = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(label_names), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, feature_bin_count, preprocess)
  runtime_settings = {'clip_stride_ms': clip_stride_ms}

  # Input node: raw WAV bytes, decoded to a single channel.
  wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
  decoded_sample_data = contrib_audio.decode_wav(
      wav_data_placeholder,
      desired_channels=1,
      desired_samples=model_settings['desired_samples'],
      name='decoded_sample_data')
  clip_audio = decoded_sample_data.audio
  spectrogram = contrib_audio.audio_spectrogram(
      clip_audio,
      window_size=model_settings['window_size_samples'],
      stride=model_settings['window_stride_samples'],
      magnitude_squared=True)

  if preprocess == 'average':
    # Average-pool the spectrogram along its last axis.
    pool_width = model_settings['average_window_width']
    expanded_spectrogram = tf.expand_dims(spectrogram, -1)
    fingerprint_input = tf.nn.pool(
        expanded_spectrogram,
        window_shape=[1, pool_width],
        strides=[1, pool_width],
        pooling_type='AVG',
        padding='SAME')
  elif preprocess == 'mfcc':
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        sample_rate,
        dct_coefficient_count=model_settings['fingerprint_width'])
  elif preprocess == 'micro':
    if not frontend_op:
      raise Exception(
          'Micro frontend op is currently not available when running TensorFlow'
          ' directly from Python, you need to build and run through Bazel, for'
          ' example'
          ' `bazel run tensorflow/examples/speech_commands:freeze_graph`'
      )
    # The micro frontend wants window sizes in milliseconds and int16 audio.
    settings_sample_rate = model_settings['sample_rate']
    frontend_window_ms = (model_settings['window_size_samples'] *
                          1000) / settings_sample_rate
    frontend_step_ms = (model_settings['window_stride_samples'] *
                        1000) / settings_sample_rate
    scaled_audio = tf.multiply(clip_audio, 32767)
    int16_input = tf.cast(scaled_audio, tf.int16)
    micro_frontend = frontend_op.audio_microfrontend(
        int16_input,
        sample_rate=settings_sample_rate,
        window_size=frontend_window_ms,
        window_step=frontend_step_ms,
        num_channels=model_settings['fingerprint_width'],
        out_scale=1,
        out_type=tf.float32)
    fingerprint_input = tf.multiply(micro_frontend, (10.0 / 256.0))
  else:
    raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
                    ' "average", or "micro")' % (preprocess))

  # Flatten the features to one fingerprint per row before feeding the model.
  flat_shape = [-1, model_settings['fingerprint_size']]
  reshaped_input = tf.reshape(fingerprint_input, flat_shape)

  logits = models.create_model(
      reshaped_input,
      model_settings,
      model_architecture,
      is_training=False,
      runtime_settings=runtime_settings)

  # Output node used for inference.
  tf.nn.softmax(logits, name='labels_softmax')
def wav_to_features(sample_rate, clip_duration_ms, window_size_ms,
                    window_stride_ms, feature_bin_count, quantize, preprocess,
                    input_wav, output_c_file):
  """Converts an audio file into its corresponding feature map.

  The features are written to `output_c_file` as a C source file holding the
  width and height constants and a data array named after the input file's
  lower-cased base name. With `quantize` set the array is `unsigned char` with
  values scaled into 0..255; otherwise it is a `float` array.

  Args:
    sample_rate: Expected sample rate of the wavs.
    clip_duration_ms: Expected duration in milliseconds of the wavs.
    window_size_ms: How long each spectrogram timeslice is.
    window_stride_ms: How far to move in time between spectrogram timeslices.
    feature_bin_count: How many bins to use for the feature fingerprint.
    quantize: Whether to train the model for eight-bit deployment.
    preprocess: Spectrogram processing mode; "mfcc", "average" or "micro".
    input_wav: Path to the audio WAV file to read.
    output_c_file: Where to save the generated C source file.
  """
  # Start a new TensorFlow session.
  sess = tf.InteractiveSession()

  model_settings = models.prepare_model_settings(
      0, sample_rate, clip_duration_ms, window_size_ms, window_stride_ms,
      feature_bin_count, preprocess)
  audio_processor = input_data.AudioProcessor(None, None, 0, 0, '', 0, 0,
                                              model_settings, None)

  results = audio_processor.get_features_for_wav(input_wav, model_settings,
                                                 sess)
  features = results[0]

  # C identifiers are derived from the input file name without its extension.
  variable_base = os.path.splitext(os.path.basename(input_wav).lower())[0]

  # Save a C source file containing the feature data as an array.
  with gfile.GFile(output_c_file, 'w') as f:
    # Header comment recording the command line that reproduces this file.
    f.write('/* File automatically created by\n')
    f.write(' * tensorflow/examples/speech_commands/wav_to_features.py \\\n')
    f.write(' * --sample_rate=%d \\\n' % sample_rate)
    f.write(' * --clip_duration_ms=%d \\\n' % clip_duration_ms)
    f.write(' * --window_size_ms=%d \\\n' % window_size_ms)
    f.write(' * --window_stride_ms=%d \\\n' % window_stride_ms)
    f.write(' * --feature_bin_count=%d \\\n' % feature_bin_count)
    if quantize:
      f.write(' * --quantize=1 \\\n')
    f.write(' * --preprocess="%s" \\\n' % preprocess)
    f.write(' * --input_wav="%s" \\\n' % input_wav)
    f.write(' * --output_c_file="%s" \\\n' % output_c_file)
    f.write(' */\n\n')
    f.write('const int g_%s_width = %d;\n' %
            (variable_base, model_settings['fingerprint_width']))
    f.write('const int g_%s_height = %d;\n' %
            (variable_base, model_settings['spectrogram_length']))
    if quantize:
      features_min, features_max = input_data.get_features_range(model_settings)
      features_span = features_max - features_min
      f.write('const unsigned char g_%s_data[] = {' % variable_base)
      i = 0
      for value in features.flatten():
        # Scale into 0..255 and clamp anything outside the expected range.
        quantized_value = int(
            round((255 * (value - features_min)) / features_span))
        quantized_value = max(0, min(255, quantized_value))
        # Start a new row every ten values.
        if i == 0:
          f.write('\n ')
        f.write('%d, ' % (quantized_value))
        i = (i + 1) % 10
    else:
      f.write('const float g_%s_data[] = {\n' % variable_base)
      i = 0
      for value in features.flatten():
        # Start a new row every ten values.
        if i == 0:
          f.write('\n ')
        # Put the separator after each value: a leading comma in front of the
        # first element makes the initializer list invalid C, whereas a
        # trailing comma after the last one is allowed.
        f.write('%f, ' % value)
        i = (i + 1) % 10
    f.write('\n};\n')
def main(_):
  """Builds the training graph, trains the model, and reports accuracy.

  All configuration comes from the module-level `FLAGS`. Labels are fed as
  float32 rows of width `label_count` and the loss is softmax cross entropy.
  The function optionally restores variables from `FLAGS.start_checkpoint`,
  runs the training loop with periodic validation and checkpointing, and ends
  with one pass over the testing partition.

  Args:
    _: Unused positional argument.

  Raises:
    Exception: If --how_many_training_steps and --learning_rate do not hold the
      same number of comma-separated entries.
  """
  # Log at INFO level so every progress message of this tutorial is visible.
  tf.logging.set_verbosity(tf.logging.INFO)

  # Every graph evaluation below runs through this session.
  sess = tf.InteractiveSession()

  # Make sure the training data is available. If you already have training
  # data of your own, use `--data_url= ` on the command line to avoid
  # downloading.
  label_total = len(
      input_data.prepare_words_list(FLAGS.wanted_words.split(',')))
  model_settings = models.prepare_model_settings(
      label_total, FLAGS.sample_rate, FLAGS.clip_duration_ms,
      FLAGS.window_size_ms, FLAGS.window_stride_ms,
      FLAGS.dct_coefficient_count)
  audio_processor = input_data.AudioProcessor(
      FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
      FLAGS.unknown_percentage, FLAGS.wanted_words.split(','),
      FLAGS.validation_percentage, FLAGS.testing_percentage, model_settings)
  fingerprint_size = model_settings['fingerprint_size']
  label_count = model_settings['label_count']
  time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)

  # The step counts and learning rates are comma-separated lists describing
  # consecutive training phases. For example
  # --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001
  # runs 13,000 steps in total: 10,000 at 0.001, then 3,000 at 0.0001.
  training_steps_list = [
      int(entry) for entry in FLAGS.how_many_training_steps.split(',')
  ]
  learning_rates_list = [
      float(entry) for entry in FLAGS.learning_rate.split(',')
  ]
  phase_count = len(training_steps_list)
  rate_count = len(learning_rates_list)
  if phase_count != rate_count:
    raise Exception(
        '--how_many_training_steps and --learning_rate must be equal length '
        'lists, but are %d and %d long instead' % (phase_count, rate_count))

  fingerprint_input = tf.placeholder(
      tf.float32, [None, fingerprint_size], name='fingerprint_input')

  logits, dropout_prob = models.create_model(
      fingerprint_input,
      model_settings,
      FLAGS.model_architecture,
      is_training=True)

  # Labels are fed as float32 rows with one column per label.
  ground_truth_input = tf.placeholder(
      tf.float32, [None, label_count], name='groundtruth_input')

  # Optional runtime checks that flag NaNs and similar numerical problems.
  control_dependencies = []
  if FLAGS.check_nans:
    control_dependencies = [tf.add_check_numerics_ops()]

  # Loss, optimizer, and evaluation ops.
  with tf.name_scope('cross_entropy'):
    per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
        labels=ground_truth_input, logits=logits)
    cross_entropy_mean = tf.reduce_mean(per_example_loss)
  tf.summary.scalar('cross_entropy', cross_entropy_mean)
  with tf.name_scope('train'), tf.control_dependencies(control_dependencies):
    learning_rate_input = tf.placeholder(
        tf.float32, [], name='learning_rate_input')
    optimizer = tf.train.GradientDescentOptimizer(learning_rate_input)
    train_step = optimizer.minimize(cross_entropy_mean)
  predicted_indices = tf.argmax(logits, 1)
  expected_indices = tf.argmax(ground_truth_input, 1)
  correct_prediction = tf.equal(predicted_indices, expected_indices)
  confusion_matrix = tf.confusion_matrix(
      expected_indices, predicted_indices, num_classes=label_count)
  evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
  tf.summary.scalar('accuracy', evaluation_step)

  global_step = tf.train.get_or_create_global_step()
  increment_global_step = tf.assign(global_step, global_step + 1)

  saver = tf.train.Saver(tf.global_variables())

  # Merge all the summaries and write them under FLAGS.summaries_dir.
  merged_summaries = tf.summary.merge_all()
  train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
                                       sess.graph)
  validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/validation')

  tf.global_variables_initializer().run()

  # Start at step 1, or at the restored global step when resuming.
  start_step = 1
  if FLAGS.start_checkpoint:
    models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
    start_step = global_step.eval(session=sess)

  tf.logging.info('Training from step: %d ', start_step)

  # Save graph.pbtxt.
  tf.train.write_graph(sess.graph_def, FLAGS.train_dir,
                       FLAGS.model_architecture + '.pbtxt')

  # Save the list of words, one per line.
  labels_path = os.path.join(FLAGS.train_dir,
                             FLAGS.model_architecture + '_labels.txt')
  with gfile.GFile(labels_path, 'w') as f:
    f.write('\n'.join(audio_processor.words_list))

  # Training loop.
  training_steps_max = np.sum(training_steps_list)
  for training_step in xrange(start_step, training_steps_max + 1):
    # Pick the learning rate of the phase that contains this step.
    steps_so_far = 0
    for phase_steps, phase_rate in zip(training_steps_list,
                                       learning_rates_list):
      steps_so_far += phase_steps
      if training_step <= steps_so_far:
        learning_rate_value = phase_rate
        break

    # Pull the audio samples used for this training step.
    train_fingerprints, train_ground_truth = audio_processor.get_data(
        FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency,
        FLAGS.background_volume, time_shift_samples, 'training', sess)

    # Run the graph with this batch of training data.
    train_fetches = [
        merged_summaries, evaluation_step, cross_entropy_mean, train_step,
        increment_global_step
    ]
    train_feed = {
        fingerprint_input: train_fingerprints,
        ground_truth_input: train_ground_truth,
        learning_rate_input: learning_rate_value,
        dropout_prob: 0.5
    }
    train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
        train_fetches, feed_dict=train_feed)
    train_writer.add_summary(train_summary, training_step)
    tf.logging.info('Step #%d: rate %f, accuracy %.1f%%, cross entropy %f' %
                    (training_step, learning_rate_value, train_accuracy * 100,
                     cross_entropy_value))

    is_last_step = (training_step == training_steps_max)
    if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step:
      set_size = audio_processor.set_size('validation')
      total_accuracy = 0
      total_conf_matrix = None
      for offset in xrange(0, set_size, FLAGS.batch_size):
        validation_fingerprints, validation_ground_truth = (
            audio_processor.get_data(FLAGS.batch_size, offset, model_settings,
                                     0.0, 0.0, 0, 'validation', sess))
        # Run a validation batch and capture its summaries for TensorBoard.
        validation_feed = {
            fingerprint_input: validation_fingerprints,
            ground_truth_input: validation_ground_truth,
            dropout_prob: 1.0
        }
        validation_summary, validation_accuracy, conf_matrix = sess.run(
            [merged_summaries, evaluation_step, confusion_matrix],
            feed_dict=validation_feed)
        validation_writer.add_summary(validation_summary, training_step)
        # Weight each batch by its share of the whole partition.
        batch_size = min(FLAGS.batch_size, set_size - offset)
        total_accuracy += (validation_accuracy * batch_size) / set_size
        if total_conf_matrix is not None:
          total_conf_matrix += conf_matrix
        else:
          total_conf_matrix = conf_matrix
      tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
      tf.logging.info('Step %d: Validation accuracy = %.1f%% (N=%d)' %
                      (training_step, total_accuracy * 100, set_size))

    # Save the model checkpoint periodically.
    if training_step % FLAGS.save_step_interval == 0 or is_last_step:
      checkpoint_path = os.path.join(FLAGS.train_dir,
                                     FLAGS.model_architecture + '.ckpt')
      tf.logging.info('Saving to "%s-%d"', checkpoint_path, training_step)
      saver.save(sess, checkpoint_path, global_step=training_step)

  # Final pass over the testing partition.
  set_size = audio_processor.set_size('testing')
  tf.logging.info('set_size=%d', set_size)
  total_accuracy = 0
  total_conf_matrix = None
  for offset in xrange(0, set_size, FLAGS.batch_size):
    test_fingerprints, test_ground_truth = audio_processor.get_data(
        FLAGS.batch_size, offset, model_settings, 0.0, 0.0, 0, 'testing', sess)
    test_feed = {
        fingerprint_input: test_fingerprints,
        ground_truth_input: test_ground_truth,
        dropout_prob: 1.0
    }
    test_accuracy, conf_matrix = sess.run(
        [evaluation_step, confusion_matrix], feed_dict=test_feed)
    batch_size = min(FLAGS.batch_size, set_size - offset)
    total_accuracy += (test_accuracy * batch_size) / set_size
    if total_conf_matrix is not None:
      total_conf_matrix += conf_matrix
    else:
      total_conf_matrix = conf_matrix
  tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
  tf.logging.info('Final test accuracy = %.1f%% (N=%d)' % (total_accuracy * 100,
                                                           set_size))
def main(_):
  """Trains a speech-command model, validating and checkpointing as it goes.

  All settings are read from the module-level `FLAGS`. The function builds the
  training graph (cross-entropy loss, gradient-descent optimizer, accuracy and
  confusion-matrix ops), optionally resumes from `FLAGS.start_checkpoint`,
  runs the training loop with periodic validation and checkpoint saving, and
  finally reports accuracy and the confusion matrix on the testing set.

  Args:
    _: Ignored; the body never reads it.

  Raises:
    Exception: If `--how_many_training_steps` and `--learning_rate` do not
      hold the same number of comma-separated entries.
  """
  tf.logging.set_verbosity(tf.logging.INFO)
  sess = tf.InteractiveSession()

  # Begin by making sure we have the training data we need. If you already have
  # training data of your own, use `--data_url= ` on the command line to avoid
  # downloading.
  model_settings = models.prepare_model_settings(
      len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
      FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
      FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
  audio_processor = input_data.AudioProcessor(
      FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
      FLAGS.unknown_percentage, FLAGS.wanted_words.split(','),
      FLAGS.validation_percentage, FLAGS.testing_percentage, model_settings)

  fingerprint_size = model_settings['fingerprint_size']
  label_count = model_settings['label_count']
  time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)

  # The two flags are parallel comma-separated lists: entry i of the step list
  # says how many steps are run with entry i of the rate list. They are
  # materialised as lists because they are measured with len() and iterated
  # on every training step; a bare map() is a one-shot iterator on Python 3.
  training_steps_list = list(
      map(int, FLAGS.how_many_training_steps.split(',')))
  learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
  if len(training_steps_list) != len(learning_rates_list):
    raise Exception(
        '--how_many_training_steps and --learning_rate must be equal length '
        'lists, but are %d and %d long instead' % (len(training_steps_list),
                                                   len(learning_rates_list)))

  fingerprint_input = tf.placeholder(
      tf.float32, [None, fingerprint_size], name='fingerprint_input')
  logits, dropout_prob = models.create_model(
      fingerprint_input,
      model_settings,
      FLAGS.model_architecture,
      is_training=True)

  # Define loss and optimizer.
  ground_truth_input = tf.placeholder(
      tf.float32, [None, label_count], name='groundtruth_input')

  # Optionally we can add runtime checks to spot when NaNs or other symptoms of
  # numerical errors start occurring during training.
  control_dependencies = []
  if FLAGS.check_nans:
    checks = tf.add_check_numerics_ops()
    control_dependencies = [checks]

  # Back-propagation and training-evaluation machinery.
  with tf.name_scope('cross_entropy'):
    cross_entropy_mean = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            labels=ground_truth_input, logits=logits))
  tf.summary.scalar('cross_entropy', cross_entropy_mean)
  with tf.name_scope('train'), tf.control_dependencies(control_dependencies):
    learning_rate_input = tf.placeholder(
        tf.float32, [], name='learning_rate_input')
    train_step = tf.train.GradientDescentOptimizer(
        learning_rate_input).minimize(cross_entropy_mean)
  predicted_indices = tf.argmax(logits, 1)
  expected_indices = tf.argmax(ground_truth_input, 1)
  correct_prediction = tf.equal(predicted_indices, expected_indices)
  # num_classes pins the matrix shape; without it the shape follows the
  # largest label seen in the batch, and the per-batch matrices summed below
  # could then disagree in shape.
  confusion_matrix = tf.confusion_matrix(
      expected_indices, predicted_indices, num_classes=label_count)
  evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
  tf.summary.scalar('accuracy', evaluation_step)

  global_step = tf.contrib.framework.get_or_create_global_step()
  increment_global_step = tf.assign(global_step, global_step + 1)

  saver = tf.train.Saver(tf.global_variables())

  # Merge all the summaries and write them out under FLAGS.summaries_dir.
  merged_summaries = tf.summary.merge_all()
  train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
                                       sess.graph)
  validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/validation')

  tf.global_variables_initializer().run()

  start_step = 1

  if FLAGS.start_checkpoint:
    models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
    start_step = global_step.eval(session=sess)

  tf.logging.info('step number: %d ', start_step)

  # Save graph.pbtxt.
  tf.train.write_graph(sess.graph_def, FLAGS.train_dir,
                       FLAGS.model_architecture + '.pbtxt')

  # Save list of words.
  with gfile.GFile(
      os.path.join(FLAGS.train_dir, FLAGS.model_architecture + '_labels.txt'),
      'w') as f:
    f.write('\n'.join(audio_processor.words_list))

  # Training loop.
  training_steps_max = np.sum(training_steps_list)
  for training_step in xrange(start_step, training_steps_max + 1):
    # Figure out what the current learning rate is: walk the phases until the
    # cumulative step count reaches the current step.
    training_steps_sum = 0
    for phase_steps, phase_rate in zip(training_steps_list,
                                       learning_rates_list):
      training_steps_sum += phase_steps
      if training_step <= training_steps_sum:
        learning_rate_value = phase_rate
        break
    # Pull the audio samples we'll use for training.
    train_fingerprints, train_ground_truth = audio_processor.get_data(
        FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency,
        FLAGS.background_volume, time_shift_samples, 'training', sess)
    # Run the graph with this batch of training data.
    train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
        [
            merged_summaries, evaluation_step, cross_entropy_mean, train_step,
            increment_global_step
        ],
        feed_dict={
            fingerprint_input: train_fingerprints,
            ground_truth_input: train_ground_truth,
            learning_rate_input: learning_rate_value,
            dropout_prob: 0.5
        })
    train_writer.add_summary(train_summary, training_step)
    tf.logging.info(
        'Step number #%d: learning rate %f, model accuracy %.1f%%, model cross entropy %f'
        % (training_step, learning_rate_value, train_accuracy * 100,
           cross_entropy_value))
    is_last_step = (training_step == training_steps_max)
    if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step:
      set_size = audio_processor.set_size('validation')
      total_accuracy = 0
      total_conf_matrix = None
      for i in xrange(0, set_size, FLAGS.batch_size):
        validation_fingerprints, validation_ground_truth = (
            audio_processor.get_data(FLAGS.batch_size, i, model_settings, 0.0,
                                     0.0, 0, 'validation', sess))
        # Run a validation step and capture the summaries for TensorBoard.
        validation_summary, validation_accuracy, conf_matrix = sess.run(
            [merged_summaries, evaluation_step, confusion_matrix],
            feed_dict={
                fingerprint_input: validation_fingerprints,
                ground_truth_input: validation_ground_truth,
                dropout_prob: 1.0
            })
        validation_writer.add_summary(validation_summary, training_step)
        # The last batch may be short, so weight each batch by its real size.
        batch_size = min(FLAGS.batch_size, set_size - i)
        total_accuracy += (validation_accuracy * batch_size) / set_size
        if total_conf_matrix is None:
          total_conf_matrix = conf_matrix
        else:
          total_conf_matrix += conf_matrix
      tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
      tf.logging.info(
          'Step number %d: Validation Model accuracy = %.1f%% (N=%d)' %
          (training_step, total_accuracy * 100, set_size))

    # Save the model checkpoint periodically.
    if (training_step % FLAGS.save_step_interval == 0 or
        training_step == training_steps_max):
      checkpoint_path = os.path.join(FLAGS.train_dir,
                                     FLAGS.model_architecture + '.ckpt')
      tf.logging.info('Saving to "%s-%d"', checkpoint_path, training_step)
      saver.save(sess, checkpoint_path, global_step=training_step)

  # Final evaluation on the held-out testing set.
  set_size = audio_processor.set_size('testing')
  tf.logging.info('set_size=%d', set_size)
  total_accuracy = 0
  total_conf_matrix = None
  for i in xrange(0, set_size, FLAGS.batch_size):
    test_fingerprints, test_ground_truth = audio_processor.get_data(
        FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess)
    test_accuracy, conf_matrix = sess.run(
        [evaluation_step, confusion_matrix],
        feed_dict={
            fingerprint_input: test_fingerprints,
            ground_truth_input: test_ground_truth,
            dropout_prob: 1.0
        })
    batch_size = min(FLAGS.batch_size, set_size - i)
    total_accuracy += (test_accuracy * batch_size) / set_size
    if total_conf_matrix is None:
      total_conf_matrix = conf_matrix
    else:
      total_conf_matrix += conf_matrix
  tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
  tf.logging.info('Final model accuracy accuracy = %.1f%% (N=%d)' %
                  (total_accuracy * 100, set_size))
def main(_):
  """Trains a speech-command model with Adam and tracks the best test accuracy.

  All settings are read from the module-level `FLAGS`. The model is built with
  an `istrain` boolean placeholder so the same graph can be run in training or
  inference mode. On every evaluation step the validation set is scored batch
  by batch, then the whole testing set is scored twice (once with
  `istrain=False`, once with `istrain=True`) and the best accuracy of each
  kind is remembered. On the final step the best results are written to
  `<train_dir>/<model_architecture>/details.txt`.

  Args:
    _: Ignored; the body never reads it.

  Raises:
    Exception: If `--how_many_training_steps` and `--learning_rate` do not
      hold the same number of comma-separated entries.
  """
  best_acc = 0
  best_step = 0
  best_acc_istrain = 0
  best_step_istrain = 0
  # We want to see all the logging messages for this tutorial.
  tf.logging.set_verbosity(tf.logging.INFO)

  # Start a new TensorFlow session.
  sess = tf.InteractiveSession()

  # Begin by making sure we have the training data we need. If you already have
  # training data of your own, use `--data_url= ` on the command line to avoid
  # downloading.
  model_settings = models.prepare_model_settings(
      len(input_data_filler.prepare_words_list_my(
          FLAGS.wanted_words.split(','))),
      FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
      FLAGS.window_stride_ms, FLAGS.dct_coefficient_count)
  audio_processor = input_data_filler.AudioProcessor(
      FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
      FLAGS.unknown_percentage,
      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings)
  fingerprint_size = model_settings['fingerprint_size']
  label_count = model_settings['label_count']
  time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)

  # The step and rate flags are parallel comma-separated lists. In this
  # function only the SUM of the step list (total number of steps) and the
  # FIRST learning rate (base of the decay schedule below) are actually used.
  training_steps_list = list(
      map(int, FLAGS.how_many_training_steps.split(',')))
  learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
  if len(training_steps_list) != len(learning_rates_list):
    raise Exception(
        '--how_many_training_steps and --learning_rate must be equal length '
        'lists, but are %d and %d long instead' % (len(training_steps_list),
                                                   len(learning_rates_list)))

  # ---- TensorFlow graph ----
  fingerprint_input = tf.placeholder(
      tf.float32, [None, fingerprint_size], name='fingerprint_input')

  # Model creation: `istrain` is fed at run time to select the mode.
  istrain = tf.placeholder(tf.bool, name='istrain')
  logits = models.create_model(
      fingerprint_input,
      model_settings,
      FLAGS.model_architecture,
      is_training=istrain)

  # Ground-truth labels.
  ground_truth_input = tf.placeholder(
      tf.float32, [None, label_count], name='groundtruth_input')

  # Optionally we can add runtime checks to spot when NaNs or other symptoms of
  # numerical errors start occurring during training.
  control_dependencies = []
  if FLAGS.check_nans:
    checks = tf.add_check_numerics_ops()
    control_dependencies = [checks]

  # Cross-entropy loss.
  with tf.name_scope('cross_entropy'):
    cross_entropy_mean = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(
            labels=ground_truth_input, logits=logits))
  tf.summary.scalar('cross_entropy', cross_entropy_mean)

  # Learning rate, accuracy and confusion matrix:
  #   learning_rate_input    learning-rate placeholder
  #   train_step             optimizer step
  #   predicted_indices      predicted class indices
  #   expected_indices       ground-truth class indices
  #   correct_prediction     element-wise "prediction was right" tensor
  #   confusion_matrix       confusion matrix for the fed batch
  #   evaluation_step        fraction of the batch classified correctly
  #   global_step            global training step counter
  #   increment_global_step  op that advances the counter by one
  learning_rate_input = tf.placeholder(
      tf.float32, [], name='learning_rate_input')
  update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
  # The train op must wait for the collected UPDATE_OPS and, when
  # --check_nans is set, for the numeric checks; otherwise the flag would
  # build the check ops without ever running them.
  with tf.control_dependencies(update_ops + control_dependencies):
    train_step = tf.train.AdamOptimizer(
        learning_rate_input).minimize(cross_entropy_mean)
  predicted_indices = tf.argmax(logits, 1)
  expected_indices = tf.argmax(ground_truth_input, 1)
  correct_prediction = tf.equal(predicted_indices, expected_indices)
  confusion_matrix = tf.confusion_matrix(
      expected_indices, predicted_indices, num_classes=label_count)
  evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
  acc = tf.summary.scalar('accuracy', evaluation_step)

  global_step = tf.train.get_or_create_global_step()
  increment_global_step = tf.assign(global_step, global_step + 1)

  # max_to_keep=None keeps every checkpoint instead of the default last 5.
  saver = tf.train.Saver(tf.global_variables(), max_to_keep=None)

  # Merge all the summaries and write them out under FLAGS.summaries_dir.
  merged_summaries = tf.summary.merge_all()
  validation_merged_summaries = tf.summary.merge(
      [tf.get_collection(tf.GraphKeys.SUMMARIES, 'accuracy'),
       tf.get_collection(tf.GraphKeys.SUMMARIES, 'cross_entropy')])
  test_summaries = tf.summary.merge([acc])
  test_summaries_istrain = tf.summary.merge([acc])
  train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
                                       sess.graph)
  validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/validation')
  test_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/test')
  test_istrain_writer = tf.summary.FileWriter(
      FLAGS.summaries_dir + '/test_istrain')
  tf.global_variables_initializer().run()

  start_step = 1

  if FLAGS.start_checkpoint:
    models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
    start_step = global_step.eval(session=sess)

  tf.logging.info('Training from step: %d ', start_step)

  # Save graph.pbtxt.
  tf.train.write_graph(sess.graph_def, FLAGS.train_dir,
                       FLAGS.model_architecture + '.pbtxt')

  # Save list of words.
  with gfile.GFile(
      os.path.join(FLAGS.train_dir, FLAGS.model_architecture + '_labels.txt'),
      'w') as f:
    f.write('\n'.join(audio_processor.words_list))

  # Author's notes on the model variants:
  #   model1: fc
  #   model2: conv, about 940k parameters
  #   model3: low_latency_conv, roughly the same as model1
  #   model4: about 750k parameters

  # ---- Main training loop ----
  training_steps_max = np.sum(training_steps_list)
  for training_step in xrange(start_step, training_steps_max + 1):
    # Learning-rate schedule: exponential decay from the first configured rate
    # down to 2% of it over the first 12000 steps, then held constant.
    # Dividing by a float keeps the exponent fractional even where `/` on two
    # ints would truncate.
    if training_step < 12000 + 1:
      learning_rate_value = (
          learning_rates_list[0] * 0.02**(training_step / 12000.0))
    else:
      learning_rate_value = learning_rates_list[0] * 0.02

    # Pull the audio samples we'll use for training. Arguments follow
    # get_data(how_many, offset, model_settings, background_frequency,
    #          background_volume_range, time_shift, mode, sess).
    train_fingerprints, train_ground_truth = audio_processor.get_data_my(
        FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency,
        FLAGS.background_volume, time_shift_samples, 'training', sess)
    # Quantise the features to integers and clamp them to [-100, 100].
    train_fingerprints = np.round(train_fingerprints)
    train_fingerprints = np.clip(train_fingerprints, -100, 100)
    # Run the graph with this batch of training data.
    train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
        [
            merged_summaries, evaluation_step, cross_entropy_mean, train_step,
            increment_global_step
        ],
        feed_dict={
            fingerprint_input: train_fingerprints,
            ground_truth_input: train_ground_truth,
            learning_rate_input: learning_rate_value,
            istrain: True
        })
    train_writer.add_summary(train_summary, training_step)
    tf.logging.info('Step #%d: rate %f, accuracy %.1f%%, cross entropy %f' %
                    (training_step, learning_rate_value, train_accuracy * 100,
                     cross_entropy_value))
    is_last_step = (training_step == training_steps_max)
    if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step:
      # Validation set: accumulate accuracy and confusion matrix per batch.
      set_size = audio_processor.set_size('validation')
      total_accuracy = 0
      total_conf_matrix = None
      for i in xrange(0, set_size, FLAGS.batch_size):
        validation_fingerprints, validation_ground_truth = (
            audio_processor.get_data_my(FLAGS.batch_size, i, model_settings,
                                        0.0, 0.0, 0, 'validation', sess))
        validation_fingerprints = np.round(validation_fingerprints)
        validation_fingerprints = np.clip(validation_fingerprints, -100, 100)
        # NOTE(review): validation is fed istrain=True, i.e. the model runs in
        # training mode here. Kept as written; confirm this is intentional.
        validation_summaries, validation_accuracy, conf_matrix = sess.run(
            [validation_merged_summaries, evaluation_step, confusion_matrix],
            feed_dict={
                fingerprint_input: validation_fingerprints,
                ground_truth_input: validation_ground_truth,
                istrain: True
            })
        validation_writer.add_summary(validation_summaries, training_step)
        # The last batch may be short, so weight each batch by its real size.
        batch_size = min(FLAGS.batch_size, set_size - i)
        total_accuracy += (validation_accuracy * batch_size) / set_size
        if total_conf_matrix is None:
          total_conf_matrix = conf_matrix
        else:
          total_conf_matrix += conf_matrix
      tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
      tf.logging.info('Step %d: Validation accuracy = %.1f%% (N=%d)' %
                      (training_step, total_accuracy * 100, set_size))

      # Testing set: fetched in a single call (how_many=-1) and scored in both
      # inference mode and training mode.
      set_size = audio_processor.set_size('testing')
      tf.logging.info('set_size=%d', set_size)
      test_fingerprints, test_ground_truth = audio_processor.get_data_my(
          -1, 0, model_settings, 0.0, 0.0, 0, 'testing', sess)
      test_fingerprints = np.round(test_fingerprints)
      test_fingerprints = np.clip(test_fingerprints, -100, 100)
      final_summary, test_accuracy, conf_matrix = sess.run(
          [test_summaries, evaluation_step, confusion_matrix],
          feed_dict={
              fingerprint_input: test_fingerprints,
              ground_truth_input: test_ground_truth,
              istrain: False
          })
      final_summary_istrain, test_accuracy_istrain = sess.run(
          [test_summaries_istrain, evaluation_step],
          feed_dict={
              fingerprint_input: test_fingerprints,
              ground_truth_input: test_ground_truth,
              istrain: True
          })
      if test_accuracy > best_acc:
        best_acc = test_accuracy
        best_step = training_step
      if test_accuracy_istrain > best_acc_istrain:
        best_acc_istrain = test_accuracy_istrain
        best_step_istrain = training_step
      test_writer.add_summary(final_summary, training_step)
      test_istrain_writer.add_summary(final_summary_istrain, training_step)
      tf.logging.info('Confusion Matrix:\n %s' % (conf_matrix))
      # N is reported from the measured testing-set size rather than a
      # hard-coded count, so the logs stay right when the dataset changes.
      tf.logging.info('test accuracy = %.1f%% (N=%d)' %
                      (test_accuracy * 100, set_size))
      tf.logging.info('test_istrain accuracy = %.1f%% (N=%d)' %
                      (test_accuracy_istrain * 100, set_size))
      tf.logging.info('Best test accuracy before now = %.1f%% (N=%d)' %
                      (best_acc * 100, set_size) +
                      ' at step of ' + str(best_step))
      tf.logging.info('Best test_istrain accuracy before now = %.1f%% (N=%d)' %
                      (best_acc_istrain * 100, set_size) +
                      ' at step of ' + str(best_step_istrain))

    # Save the model checkpoint periodically.
    if (training_step % FLAGS.save_step_interval == 0 or
        training_step == training_steps_max):
      checkpoint_path = os.path.join(
          FLAGS.train_dir + '/' + FLAGS.model_architecture,
          FLAGS.model_architecture + '.ckpt')
      tf.logging.info('Saving to "%s-%d"', checkpoint_path, training_step)
      saver.save(sess, checkpoint_path, global_step=training_step)

    # On the final step, persist the best results next to the checkpoints.
    # The final step always runs the evaluation above, so `set_size` holds the
    # testing-set size here.
    if training_step == training_steps_max:
      print_line = (
          'Best test accuracy before now = %.1f%% (N=%d)' %
          (best_acc * 100, set_size) + ' at step of ' + str(best_step) +
          '\n' +
          'Best test_istrain accuracy before now = %.1f%% (N=%d)' %
          (best_acc_istrain * 100, set_size) + ' at step of ' +
          str(best_step_istrain))
      with open(FLAGS.train_dir + '/' + FLAGS.model_architecture +
                '/details.txt', 'w') as f:
        f.write(print_line)
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
  """Builds the inference graph: WAV bytes in, label probabilities out.

  The nodes are added to the current default graph. The input is a scalar
  string placeholder named 'wav_data'; the output is a softmax node named
  'labels_softmax'. An identity copy of the decoded audio is also exposed
  under the node name 'dao'. Nothing is returned.

  Args:
    wanted_words: Comma-separated string of the words to recognize.
    sample_rate: Sample rate forwarded to the model settings and to the MFCC
      op.
    clip_duration_ms: Clip length, forwarded to the model settings.
    clip_stride_ms: Recognition stride, passed to the model as the
      'clip_stride_ms' runtime setting.
    window_size_ms: Spectrogram window size, forwarded to the model settings.
    window_stride_ms: Spectrogram window stride, forwarded to the model
      settings.
    feature_bin_count: Feature-bin count, forwarded to the model settings.
    model_architecture: Name of the model architecture to create.
    preprocess: Feature extraction mode, either 'mfcc' or 'average'.

  Raises:
    Exception: If `preprocess` is neither 'mfcc' nor 'average'.
  """
  label_names = input_data.prepare_words_list(wanted_words.split(','))
  settings = models.prepare_model_settings(
      len(label_names), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, feature_bin_count, preprocess)

  # Input side: raw WAV file contents -> mono samples -> power spectrogram.
  wav_bytes = tf.placeholder(tf.string, [], name='wav_data')
  decoded = contrib_audio.decode_wav(
      wav_bytes,
      desired_channels=1,
      desired_samples=settings['desired_samples'],
      name='decoded_sample_data')
  # Created only for its node name; the Python handle is not needed.
  tf.identity(decoded.audio, name='dao')
  power_spectrogram = contrib_audio.audio_spectrogram(
      decoded.audio,
      window_size=settings['window_size_samples'],
      stride=settings['window_stride_samples'],
      magnitude_squared=True)

  # Feature extraction.
  if preprocess == 'mfcc':
    features = contrib_audio.mfcc(
        power_spectrogram,
        sample_rate,
        dct_coefficient_count=settings['fingerprint_width'])
  elif preprocess == 'average':
    pool_width = settings['average_window_width']
    features = tf.nn.pool(
        tf.expand_dims(power_spectrogram, -1),
        window_shape=[1, pool_width],
        strides=[1, pool_width],
        pooling_type='AVG',
        padding='SAME')
  else:
    raise Exception('Unknown preprocess mode "%s" (should be "mfcc" or'
                    ' "average")' % (preprocess))

  # Flatten each clip's features into one row and run the model on it.
  flat_features = tf.reshape(features, [-1, settings['fingerprint_size']])
  logits = models.create_model(
      flat_features,
      settings,
      model_architecture,
      is_training=False,
      runtime_settings={'clip_stride_ms': clip_stride_ms})

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
def main(_):
  """Generates a long test WAV of background noise with spoken words mixed in.

  All settings are read from the module-level `FLAGS`. A silent track of
  `FLAGS.test_duration_seconds` is filled with overlapping, randomly chosen
  background segments, then one randomly chosen testing clip is mixed in per
  word slot. The track is saved to `FLAGS.output_audio_file` and the recorded
  labels to `FLAGS.output_labels_file`.

  Args:
    _: Ignored; the body never reads it.

  Raises:
    Exception: If no testing clip carries the label picked for a word slot.
  """
  unknown_count = 0
  words_list = input_data_filler.prepare_words_list_my(
      FLAGS.wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), FLAGS.sample_rate, FLAGS.clip_duration_ms,
      FLAGS.window_size_ms, FLAGS.window_stride_ms,
      FLAGS.dct_coefficient_count)
  audio_processor = input_data_filler.AudioProcessor(
      '', FLAGS.data_dir, FLAGS.silence_percentage,
      10,  # Unknown percentage, fixed here rather than taken from FLAGS.
      FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings)

  # The whole output track, initially silent.
  output_audio_sample_count = FLAGS.sample_rate * FLAGS.test_duration_seconds
  output_audio = np.zeros((output_audio_sample_count,), dtype=np.float32)

  # Set up background audio. Each segment is one clip long plus a crossover
  # region, while the stride is exactly one clip, so consecutive segments
  # overlap by the crossover; each ramps in/out over half of it.
  background_crossover_ms = 500
  background_segment_duration_ms = (
      FLAGS.clip_duration_ms + background_crossover_ms)
  background_segment_duration_samples = int(
      (background_segment_duration_ms * FLAGS.sample_rate) / 1000)
  background_segment_stride_samples = int(
      (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
  background_ramp_samples = int(
      ((background_crossover_ms / 2) * FLAGS.sample_rate) / 1000)

  # Mix the background audio into the main track.
  # mix_in_audio_sample arguments are, in order: track_data, track_offset,
  # sample_data, sample_offset, clip_duration, sample_volume, ramp_in,
  # ramp_out.
  how_many_backgrounds = int(
      math.ceil(output_audio_sample_count / background_segment_stride_samples))
  for i in range(how_many_backgrounds):
    output_offset = int(i * background_segment_stride_samples)
    background_index = np.random.randint(len(audio_processor.background_data))
    background_samples = audio_processor.background_data[background_index]
    background_offset = np.random.randint(
        0, len(background_samples) - model_settings['desired_samples'])
    background_volume = np.random.uniform(0, FLAGS.background_volume)
    mix_in_audio_sample(output_audio, output_offset, background_samples,
                        background_offset, background_segment_duration_samples,
                        background_volume, background_ramp_samples,
                        background_ramp_samples)

  # Mix the words into the main track, noting their labels and positions.
  output_labels = []
  word_stride_ms = FLAGS.clip_duration_ms + FLAGS.word_gap_ms
  word_stride_samples = int((word_stride_ms * FLAGS.sample_rate) / 1000)
  clip_duration_samples = int(
      (FLAGS.clip_duration_ms * FLAGS.sample_rate) / 1000)
  word_gap_samples = int((FLAGS.word_gap_ms * FLAGS.sample_rate) / 1000)
  how_many_words = int(
      math.floor(output_audio_sample_count / word_stride_samples))
  all_test_data, all_test_labels = audio_processor.get_unprocessed_data(
      -1, model_settings, 'testing')
  for i in range(how_many_words):
    # Jitter each word's start by a random offset inside the gap.
    output_offset = (
        int(i * word_stride_samples) + np.random.randint(word_gap_samples))
    # Only needed by the label recording that is currently disabled (see the
    # review note below).
    output_offset_ms = (output_offset * 1000) / FLAGS.sample_rate
    is_unknown = np.random.randint(100) < FLAGS.unknown_percentage
    if is_unknown:
      wanted_label = input_data_filler.UNKNOWN_WORD_LABEL
      unknown_count = unknown_count + 1
      print("is unknown " + str(unknown_count))
    else:
      # Index 0 of words_list is never picked.
      wanted_label = words_list[1 + np.random.randint(len(words_list) - 1)]
    # Scan the testing clips in a random order, from a random start, until
    # one with the wanted label turns up.
    test_data_start = np.random.randint(len(all_test_data))
    found_sample_data = None
    index_lookup = np.arange(len(all_test_data), dtype=np.int32)
    np.random.shuffle(index_lookup)
    for test_data_offset in range(len(all_test_data)):
      test_data_index = index_lookup[(
          test_data_start + test_data_offset) % len(all_test_data)]
      current_label = all_test_labels[test_data_index]
      if current_label == wanted_label:
        found_sample_data = all_test_data[test_data_index]
        break
    # Fail with a clear message instead of handing None to the mixer.
    if found_sample_data is None:
      raise Exception(
          'No testing sample found with label "%s"' % (wanted_label))
    mix_in_audio_sample(output_audio, output_offset, found_sample_data, 0,
                        clip_duration_samples, 1.0, 325, 325)
    # NOTE(review): nothing is appended to output_labels here, so the labels
    # file written below is always empty. The disabled code was:
    #   if not is_unknown:
    #     output_labels.append({'label': wanted_label,
    #                           'time': output_offset_ms})
    # Confirm whether label output should be restored.

  input_data_filler.save_wav_file(FLAGS.output_audio_file, output_audio,
                                  FLAGS.sample_rate)
  tf.logging.info('Saved streaming test wav to %s', FLAGS.output_audio_file)

  with open(FLAGS.output_labels_file, 'w') as f:
    for output_label in output_labels:
      f.write('%s, %f\n' % (output_label['label'], output_label['time']))
  tf.logging.info('Saved streaming test labels to %s',
                  FLAGS.output_labels_file)
def run_inference(wanted_words, sample_rate, clip_duration_ms,
                  window_size_ms, window_stride_ms, dct_coefficient_count,
                  model_architecture, model_size_info, use_mfcc, csv_writer):
  """Classifies every testing clip and writes one CSV row per clip.

  Builds the model in inference mode, restores its variables from
  `FLAGS.checkpoint`, runs the testing set through it in batches of
  `FLAGS.batch_size`, and writes `[file name, predicted label]` rows to
  `csv_writer`. Dataset location and split percentages are still read from
  `FLAGS`. The predicted index is mapped to a label through `class_labels`,
  a name this function expects to find at module scope.

  Args:
    wanted_words: Comma-separated string of the words to recognize.
    sample_rate: Sample rate, forwarded to the model settings.
    clip_duration_ms: Clip length, forwarded to the model settings.
    window_size_ms: Spectrogram window size, forwarded to the model settings.
    window_stride_ms: Spectrogram window stride, forwarded to the model
      settings.
    dct_coefficient_count: Feature count, forwarded to the model settings.
    model_architecture: Name of the model architecture to create.
    model_size_info: Model dimensions, passed through to the model factory.
    use_mfcc: Forwarded to the model settings as the preprocessing selector.
    csv_writer: Object with a `writerow(list)` method that receives the
      result rows.
  """
  tf.logging.set_verbosity(tf.logging.INFO)
  sess = tf.InteractiveSession()

  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, dct_coefficient_count, use_mfcc)

  # The word list, architecture and size info come from the arguments, so the
  # data pipeline and the model always agree with the settings built above.
  audio_processor = AudioProcessor(
      FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
      FLAGS.unknown_percentage,
      wanted_words.split(','), FLAGS.validation_percentage,
      FLAGS.testing_percentage, model_settings)

  fingerprint_size = model_settings['fingerprint_size']
  fingerprint_input = tf.placeholder(
      tf.float32, [None, fingerprint_size], name='fingerprint_input')

  logits = models.create_model(
      fingerprint_input,
      model_settings,
      model_architecture,
      model_size_info,
      is_training=False)
  predicted_indices = tf.argmax(logits, 1)

  models.load_variables_from_checkpoint(sess, FLAGS.checkpoint)

  # Testing set. This AudioProcessor's get_data is unpacked as
  # (fingerprints, file names); no ground truth is used here.
  set_size = audio_processor.set_size('testing')
  tf.logging.info('set_size=%d', set_size)
  for i in xrange(0, set_size, FLAGS.batch_size):
    test_fingerprints, test_fnames = audio_processor.get_data(
        FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess)
    predicted_classes = sess.run(
        predicted_indices,
        feed_dict={
            fingerprint_input: test_fingerprints,
        })
    # The last batch may be short; only emit rows for real samples.
    for j in range(min(FLAGS.batch_size, set_size - i)):
      csv_writer.writerow(
          [test_fnames[j], class_labels[predicted_classes[j]]])