Пример #1
0
def main(_):
    """Freeze the checkpointed model into a binary GraphDef file.

    Builds the inference graph from the command-line ``FLAGS``, restores the
    variables from ``FLAGS.checkpoint``, converts them to constants and writes
    the resulting graph to ``FLAGS.output_file``.

    Args:
      _: Ignored positional argument.
    """
    session = tf.InteractiveSession()

    # Build the inference graph, then restore the trained weights into it.
    create_inference_graph(
        FLAGS.wanted_words,
        FLAGS.sample_rate,
        FLAGS.clip_duration_ms,
        FLAGS.clip_stride_ms,
        FLAGS.window_size_ms,
        FLAGS.window_stride_ms,
        FLAGS.dct_coefficient_count,
        FLAGS.model_architecture,
        FLAGS.model_size_info,
    )
    quant_models.load_variables_from_checkpoint(session, FLAGS.checkpoint)

    # Inline every variable as a constant, keeping what 'labels_softmax' needs.
    output_node_names = ['labels_softmax']
    frozen = graph_util.convert_variables_to_constants(
        session, session.graph_def, output_node_names)

    # Split the destination into directory and file name for write_graph.
    target_dir = os.path.dirname(FLAGS.output_file)
    target_name = os.path.basename(FLAGS.output_file)
    tf.train.write_graph(frozen, target_dir, target_name, as_text=False)
    tf.logging.info('Saved frozen graph to %s', FLAGS.output_file)
Пример #2
0
def run_quant_inference(wanted_words, sample_rate, clip_duration_ms,
                        window_size_ms, window_stride_ms,
                        dct_coefficient_count, model_architecture,
                        model_size_info):
    """Restores a checkpoint, optionally exports/retrains it, and evaluates it.

    Steps performed, in order:
      1. Build the model in the default graph and restore ``FLAGS.checkpoint``.
      2. For every trainable variable, report its value range and the number
         of fractional bits an 8-bit fixed-point representation would use.
         When ``FLAGS.update_weights`` is set, also dump the variable as a C
         header below ``weights/``.
      3. When ``FLAGS.if_retrain`` is set, fine-tune for
         ``FLAGS.retrain_steps`` steps, validating every 200 steps and on the
         final step, and checkpointing whenever validation accuracy improves.
      4. Log accuracy and confusion matrix for the validation and testing
         splits.

    Only the arguments listed under "Args" as used feed the model settings.
    The audio processor, the model itself and all training options are read
    from the global ``FLAGS`` object.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model_architecture: Unused here; ``FLAGS.model_architecture`` is used.
      model_size_info: Unused here; ``FLAGS.model_size_info`` is used.
    """

    tf.logging.set_verbosity(tf.logging.INFO)
    sess = tf.InteractiveSession()
    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)

    audio_processor = input_data.AudioProcessor(FLAGS.data_url, FLAGS.data_dir,
                                                FLAGS.silence_percentage,
                                                FLAGS.unknown_percentage,
                                                FLAGS.wanted_words.split(','),
                                                FLAGS.validation_percentage,
                                                FLAGS.testing_percentage,
                                                model_settings)

    label_count = model_settings['label_count']
    fingerprint_size = model_settings['fingerprint_size']
    # 100 ms expressed in samples; handed to get_data for training batches.
    time_shift_samples = int((100.0 * FLAGS.sample_rate) / 1000)

    fingerprint_input = tf.placeholder(tf.float32, [None, fingerprint_size],
                                       name='fingerprint_input')

    logits = models.create_model(fingerprint_input,
                                 model_settings,
                                 FLAGS.model_architecture,
                                 FLAGS.model_size_info,
                                 FLAGS.act_max,
                                 is_training=False)
    ground_truth_input = tf.placeholder(tf.float32, [None, label_count],
                                        name='groundtruth_input')

    if FLAGS.if_retrain:
        with tf.name_scope('cross_entropy'):
            cross_entropy_mean = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(
                    labels=ground_truth_input, logits=logits))
        tf.summary.scalar('cross_entropy', cross_entropy_mean)
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.name_scope('train'), tf.control_dependencies(update_ops):
            train_op = tf.train.AdamOptimizer(learning_rate=0.0001)
            train_step = tf.contrib.slim.learning.create_train_op(
                cross_entropy_mean, train_op)

    saver = tf.train.Saver(tf.global_variables())
    merged = tf.summary.merge_all()
    test_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/test',
                                        sess.graph)
    train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train')
    validation_writer = tf.summary.FileWriter(FLAGS.summaries_dir +
                                              '/validation')
    tf.global_variables_initializer().run()

    predicted_indices = tf.argmax(logits, 1)
    expected_indices = tf.argmax(ground_truth_input, 1)
    correct_prediction = tf.equal(predicted_indices, expected_indices)
    confusion_matrix = tf.confusion_matrix(expected_indices,
                                           predicted_indices,
                                           num_classes=label_count)
    evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    # Restore after the initializer so checkpoint values are not overwritten.
    models.load_variables_from_checkpoint(sess, FLAGS.checkpoint)

    if FLAGS.update_weights:
        # Imported once, and only when the export is actually requested.
        from save_data import prepare_header, prepare_lstm_headers

    for v in tf.trainable_variables():
        var_name = str(v.name)
        var_values = sess.run(v)
        min_value = var_values.min()
        max_value = var_values.max()
        abs_max = max(abs(min_value), abs(max_value))
        # log2(0) is -inf and int(-inf) raises OverflowError, so an all-zero
        # tensor is treated as needing no integer bits.
        int_bits = int(np.ceil(np.log2(abs_max))) if abs_max > 0 else 0
        # ap_fixed<8,1> uses 7 decimal bits and 1 bit for sign
        dec_bits = 7 - int_bits
        if FLAGS.update_weights:
            var_name_split = var_name.split(':')
            if var_name_split[0].startswith('W_o'):
                os.makedirs('weights/fc', exist_ok=True)
                c_var_name = 'Wy[' + str(var_values.shape[1]) + '][' + str(
                    var_values.shape[0]) + ']'  # transposed
                np.savetxt('weights/fc/Wy.h',
                           np.transpose(var_values),
                           delimiter=',',
                           newline=',\n')
                prepare_header('weights/fc/Wy.h', 'Wy_t ' + c_var_name)
            elif var_name_split[0].startswith('b_o'):
                # The bias may be visited before the weights, so make sure the
                # target directory exists here as well.
                os.makedirs('weights/fc', exist_ok=True)
                c_var_name = 'by[' + str(var_values.shape[0]) + ']'
                np.savetxt('weights/fc/by.h', var_values[None], delimiter=',')
                prepare_header('weights/fc/by.h', 'by_t ' + c_var_name)

            elif var_name_split[0].startswith('lstm'):
                # NOTE(review): assumes at least two '/'-separated components,
                # with the per-layer scope ('lstm0', ...) at index 1 - confirm
                # against the model's variable naming.
                lstm_name = var_name_split[0].split('/')
                param_name = lstm_name[-1]

                # for lstmp
                if (lstm_name[-2] == 'projection'):
                    param_name = 'projection'
                if (lstm_name[1] == 'lstm0'):
                    # First layer is sized by the feature count.
                    prepare_lstm_headers(
                        'weights/' + lstm_name[1],
                        var_values,
                        input_size=FLAGS.dct_coefficient_count,
                        param_name=param_name)
                else:
                    state_size = FLAGS.model_size_info[0]  # TODO
                    prepare_lstm_headers('weights/' + lstm_name[1],
                                         var_values,
                                         input_size=state_size,
                                         param_name=param_name)

        # update the weights in tensorflow graph for quantizing the activations
        var_values = sess.run(tf.assign(v, var_values))
        print(var_name + ' number of wts/bias: ' + str(var_values.shape) +
              ' dec bits: ' + str(dec_bits) +
              ' max: (' + str(var_values.max()) + ',' + str(max_value) + ')' +
              ' min: (' + str(var_values.min()) + ',' + str(min_value) + ')')

    if FLAGS.if_retrain:
        best_accuracy = 0
        for training_step in range(FLAGS.retrain_steps):
            # Pull the audio samples we'll use for training.
            train_fingerprints, train_ground_truth = audio_processor.get_data(
                FLAGS.batch_size, 0, model_settings, 0.8, 0.1,
                time_shift_samples, 'training', sess)
            # Run the graph with this batch of training data.
            train_summary, train_accuracy, cross_entropy_value, _ = sess.run(
                [merged, evaluation_step, cross_entropy_mean, train_step],
                feed_dict={
                    fingerprint_input: train_fingerprints,
                    ground_truth_input: train_ground_truth
                })
            train_writer.add_summary(train_summary, training_step)
            tf.logging.info(
                'Step #%d: accuracy %.2f%%, cross entropy %f' %
                (training_step, train_accuracy * 100, cross_entropy_value))
            # range() stops at retrain_steps - 1, so that is the final step.
            is_last_step = (training_step == FLAGS.retrain_steps - 1)
            if (training_step % 200) == 0 or is_last_step:
                set_size = audio_processor.set_size('validation')
                total_accuracy = 0
                total_conf_matrix = None
                for i in range(0, set_size, FLAGS.batch_size):
                    validation_fingerprints, validation_ground_truth = (
                        audio_processor.get_data(FLAGS.batch_size, i,
                                                 model_settings, 0.0, 0.0, 0,
                                                 'validation', sess))

                    # Run a validation step and capture training summaries for TensorBoard
                    # with the `merged` op.
                    validation_summary, validation_accuracy, conf_matrix = sess.run(
                        [merged, evaluation_step, confusion_matrix],
                        feed_dict={
                            fingerprint_input: validation_fingerprints,
                            ground_truth_input: validation_ground_truth
                        })
                    validation_writer.add_summary(validation_summary,
                                                  training_step)
                    # The final batch may be short; weight it accordingly.
                    batch_size = min(FLAGS.batch_size, set_size - i)
                    total_accuracy += (validation_accuracy *
                                       batch_size) / set_size
                    if total_conf_matrix is None:
                        total_conf_matrix = conf_matrix
                    else:
                        total_conf_matrix += conf_matrix
                tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
                tf.logging.info(
                    'Step %d: Validation accuracy = %.2f%% (N=%d)' %
                    (training_step, total_accuracy * 100, set_size))

                # Save the model checkpoint when validation accuracy improves
                if total_accuracy > best_accuracy:
                    best_accuracy = total_accuracy
                    checkpoint_path = os.path.join(
                        FLAGS.new_checkpoint, FLAGS.model_architecture + '_' +
                        str(int(best_accuracy * 10000)) + '.ckpt')
                    tf.logging.info('Saving best model to "%s-%d"',
                                    checkpoint_path, training_step)
                    saver.save(sess,
                               checkpoint_path,
                               global_step=training_step)
                tf.logging.info(
                    'So far the best validation accuracy is %.2f%%' %
                    (best_accuracy * 100))

    def _evaluate_wav_split(split, label):
        """Logs the confusion matrix and accuracy of one data split."""
        set_size = audio_processor.set_size(split)
        tf.logging.info('set_size=%d', set_size)
        total_accuracy = 0
        total_conf_matrix = None
        for i in range(0, set_size, FLAGS.batch_size):
            fingerprints, ground_truth = audio_processor.get_wav_files(
                FLAGS.batch_size, i, model_settings, split)

            accuracy, conf_matrix = sess.run(
                [evaluation_step, confusion_matrix],
                feed_dict={
                    fingerprint_input: fingerprints,
                    ground_truth_input: ground_truth,
                })
            # The final batch may be short; weight it accordingly.
            batch_size = min(FLAGS.batch_size, set_size - i)
            total_accuracy += (accuracy * batch_size) / set_size
            if total_conf_matrix is None:
                total_conf_matrix = conf_matrix
            else:
                total_conf_matrix += conf_matrix
        tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
        tf.logging.info('%s accuracy = %.2f%% (N=%d)' %
                        (label, total_accuracy * 100, set_size))

    # validation set
    _evaluate_wav_split('validation', 'Validation')

    # test set
    _evaluate_wav_split('testing', 'Test')
Пример #3
0
def run_quant_inference(wanted_words, sample_rate, clip_duration_ms,
                        window_size_ms, window_stride_ms,
                        dct_coefficient_count, model_architecture,
                        model_size_info, act_max, data_url, data_dir,
                        silence_percentage, unknown_percentage, checkpoint,
                        batch_size, include_silence, lower_frequency_limit,
                        upper_frequency_limit, filterbank_channel_count,
                        is_bg_volume_constant, feature_extraction):
    """Evaluates a checkpoint on the testing split with 8-bit quantized weights.

    Builds the model in a fresh default graph, restores ``checkpoint``, rounds
    every trainable variable to an 8-bit fixed-point grid (written back as
    floats), and measures accuracy over the ``'testing'`` split. The session
    is closed and the default graph reset before returning, also on error.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
      model_size_info: Model dimensions : different lengths for different models
      act_max: Forwarded unchanged to ``models.create_model``.
      data_url: Forwarded to ``input_data.AudioProcessor``.
      data_dir: Forwarded to ``input_data.AudioProcessor``.
      silence_percentage: Forwarded to ``input_data.AudioProcessor``; a
        non-zero value is also passed as the second argument of
        ``input_data.prepare_words_list``.
      unknown_percentage: Forwarded to ``input_data.AudioProcessor``.
      checkpoint: Checkpoint path to restore the variables from.
      batch_size: Number of clips requested per evaluation batch.
      include_silence: Unused; kept for interface compatibility.
      lower_frequency_limit: Forwarded to ``models.prepare_model_settings``.
      upper_frequency_limit: Forwarded to ``models.prepare_model_settings``.
      filterbank_channel_count: Forwarded to
        ``models.prepare_model_settings``.
      is_bg_volume_constant: Forwarded to ``audio_processor.get_data``.
      feature_extraction: Forwarded to ``audio_processor.get_data``.

    Returns:
      Accuracy over the testing split as a fraction, with each batch weighted
      by its size; 0 when the split is empty.
    """
    tf.reset_default_graph()
    tf.logging.set_verbosity(tf.logging.INFO)
    sess = tf.InteractiveSession()
    try:
        words_list = input_data.prepare_words_list(wanted_words.split(','),
                                                   silence_percentage != 0)
        model_settings = models.prepare_model_settings(
            len(words_list), sample_rate, clip_duration_ms, window_size_ms,
            window_stride_ms, dct_coefficient_count, lower_frequency_limit,
            upper_frequency_limit, filterbank_channel_count)

        # NOTE(review): 0 and 100 presumably are the validation and testing
        # percentages, i.e. every clip lands in the testing split - confirm
        # against AudioProcessor's signature.
        audio_processor = input_data.AudioProcessor(data_url, data_dir,
                                                    silence_percentage,
                                                    unknown_percentage,
                                                    wanted_words.split(','), 0,
                                                    100, model_settings)

        label_count = model_settings['label_count']
        fingerprint_size = model_settings['fingerprint_size']

        fingerprint_input = tf.placeholder(tf.float32,
                                           [None, fingerprint_size],
                                           name='fingerprint_input')

        logits = models.create_model(fingerprint_input,
                                     model_settings,
                                     model_architecture,
                                     model_size_info,
                                     act_max,
                                     is_training=False)

        ground_truth_input = tf.placeholder(tf.float32, [None, label_count],
                                            name='groundtruth_input')

        predicted_indices = tf.argmax(logits, 1)
        expected_indices = tf.argmax(ground_truth_input, 1)
        correct_prediction = tf.equal(predicted_indices, expected_indices)
        evaluation_step = tf.reduce_mean(
            tf.cast(correct_prediction, tf.float32))
        models.load_variables_from_checkpoint(sess, checkpoint)

        for v in tf.trainable_variables():
            var_values = sess.run(v)
            abs_max = max(abs(var_values.min()), abs(var_values.max()))
            # log2(0) is -inf and int(-inf) raises OverflowError, so an
            # all-zero tensor is treated as needing no integer bits.
            int_bits = int(np.ceil(np.log2(abs_max))) if abs_max > 0 else 0
            dec_bits = 7 - int_bits
            # Snap to the 8-bit grid, then scale back to the original range.
            # NOTE(review): no clipping is applied, so a value of exactly
            # +2**int_bits rounds to +128 - confirm this is acceptable.
            var_values = np.round(var_values * 2**dec_bits)
            var_values = var_values / (2**dec_bits)
            # update the weights in tensorflow graph for quantizing the
            # activations
            sess.run(tf.assign(v, var_values))

        # test set
        set_size = audio_processor.set_size('testing')
        total_accuracy = 0
        for i in range(0, set_size, batch_size):
            test_fingerprints, test_ground_truth = audio_processor.get_data(
                batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess,
                is_bg_volume_constant, feature_extraction)
            test_accuracy = sess.run(
                evaluation_step,
                feed_dict={
                    fingerprint_input: test_fingerprints,
                    ground_truth_input: test_ground_truth,
                })
            # The final batch may be short; weight it by its real size
            # without clobbering the ``batch_size`` argument.
            current_batch_size = min(batch_size, set_size - i)
            total_accuracy += (test_accuracy * current_batch_size) / set_size
    finally:
        # Close the session before resetting the graph it is bound to.
        sess.close()
        tf.reset_default_graph()
    return total_accuracy
Пример #4
0
def run_full_quant_inference(
        wanted_words, sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count, model_architecture,
        model_size_info, act_max, data_url, data_dir, silence_percentage,
        unknown_percentage, validation_percentage, testing_percentage,
        checkpoint, batch_size, lower_frequency_limit, upper_frequency_limit,
        filterbank_channel_count, is_bg_volume_constant, feature_extraction):
    """Quantizes a DS-CNN checkpoint, emits C sources, and tests the result.

    Builds the model, restores ``checkpoint`` and, in the current working
    directory, (re)creates:
      * ``ds_cnn.cpp`` via ``helper.write_ds_cnn_cpp_file``;
      * ``ds_cnn.h`` with the helper-written preamble, one
        ``*_BIAS_LSHIFT`` / ``*_OUT_RSHIFT`` pair per layer, the
        ``AVG_POOL_OUT_LSHIFT`` define and the helper-written epilogue;
      * ``ds_cnn_weights.h`` with one ``#define`` per trainable variable
        holding its 8-bit quantized values.
    The de-quantized weights are assigned back into the graph, and accuracy
    plus confusion matrix on the ``'testing'`` split are logged. The session
    is closed before returning, also on error.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
      model_size_info: Model dimensions; element 0 is used as the number of
        convolution layers.
      act_max: Per-layer activation maxima, indexed by layer number; its
        length is taken as the total layer count. Shifts are derived from
        ``log2`` of its entries.
      data_url: Forwarded to ``input_data.AudioProcessor``.
      data_dir: Forwarded to ``input_data.AudioProcessor``.
      silence_percentage: Forwarded to ``input_data.AudioProcessor``; a
        non-zero value is also passed as the second argument of
        ``input_data.prepare_words_list``.
      unknown_percentage: Forwarded to ``input_data.AudioProcessor``.
      validation_percentage: Forwarded to ``input_data.AudioProcessor``.
      testing_percentage: Forwarded to ``input_data.AudioProcessor``.
      checkpoint: Checkpoint path to restore the variables from.
      batch_size: Number of clips requested per evaluation batch.
      lower_frequency_limit: Forwarded to ``models.prepare_model_settings``.
      upper_frequency_limit: Forwarded to ``models.prepare_model_settings``.
      filterbank_channel_count: Forwarded to
        ``models.prepare_model_settings``.
      is_bg_volume_constant: Forwarded to ``audio_processor.get_data``.
      feature_extraction: Forwarded to ``audio_processor.get_data``.
    """

    tf.logging.set_verbosity(tf.logging.INFO)
    sess = tf.InteractiveSession()
    try:
        words_list = input_data.prepare_words_list(wanted_words.split(','),
                                                   silence_percentage != 0)
        model_settings = models.prepare_model_settings(
            len(words_list), sample_rate, clip_duration_ms, window_size_ms,
            window_stride_ms, dct_coefficient_count, lower_frequency_limit,
            upper_frequency_limit, filterbank_channel_count)

        audio_processor = input_data.AudioProcessor(
            data_url, data_dir, silence_percentage, unknown_percentage,
            wanted_words.split(','), validation_percentage,
            testing_percentage, model_settings)

        label_count = model_settings['label_count']
        fingerprint_size = model_settings['fingerprint_size']

        fingerprint_input = tf.placeholder(tf.float32,
                                           [None, fingerprint_size],
                                           name='fingerprint_input')

        logits = models.create_model(fingerprint_input,
                                     model_settings,
                                     model_architecture,
                                     model_size_info,
                                     act_max,
                                     is_training=False)

        ground_truth_input = tf.placeholder(tf.float32, [None, label_count],
                                            name='groundtruth_input')

        predicted_indices = tf.argmax(logits, 1)
        expected_indices = tf.argmax(ground_truth_input, 1)
        correct_prediction = tf.equal(predicted_indices, expected_indices)
        confusion_matrix = tf.confusion_matrix(expected_indices,
                                               predicted_indices,
                                               num_classes=label_count)
        evaluation_step = tf.reduce_mean(
            tf.cast(correct_prediction, tf.float32))
        models.load_variables_from_checkpoint(sess, checkpoint)

        num_layers = model_size_info[0]
        helper.write_ds_cnn_cpp_file('ds_cnn.cpp', num_layers)

        ds_cnn_h_fname = "ds_cnn.h"
        weights_h_fname = "ds_cnn_weights.h"

        # 'w' discards any previous header before the preamble is written.
        with open(ds_cnn_h_fname, 'w') as f:
            helper.write_ds_cnn_h_beginning(f, wanted_words, sample_rate,
                                            clip_duration_ms, window_size_ms,
                                            window_stride_ms,
                                            dct_coefficient_count,
                                            model_size_info, act_max)

        # Start from an empty weights file; the loop below appends to it.
        with open(weights_h_fname, 'w'):
            pass

        # Quantize weights to 8-bits using (min,max) and write to file
        total_layers = len(act_max)
        layer_no = 1
        weights_dec_bits = 0
        for v in tf.trainable_variables():
            var_name = str(v.name)
            var_values = sess.run(v)
            min_value = var_values.min()
            max_value = var_values.max()
            abs_max = max(abs(min_value), abs(max_value))
            # log2(0) is -inf and int(-inf) raises OverflowError, so an
            # all-zero tensor is treated as needing no integer bits.
            int_bits = int(np.ceil(np.log2(abs_max))) if abs_max > 0 else 0
            dec_bits = 7 - int_bits
            # convert to [-128,128) or int8
            # NOTE(review): no clipping is applied, so a value of exactly
            # +2**int_bits rounds to +128 - confirm this is acceptable.
            quantized = np.round(var_values * 2**dec_bits)
            var_name = var_name.replace('/', '_')
            var_name = var_name.replace(':', '_')

            if (len(quantized.shape) > 2):  # convolution layer weights
                # Moves the last axis to the front; assumes a rank-4 kernel.
                transposed_wts = np.transpose(quantized, (3, 0, 1, 2))
            else:  # fully connected layer weights or biases of any layer
                transposed_wts = np.transpose(quantized)

            # convert back original range but quantized to 8-bits or 256 levels
            var_values = quantized / (2**dec_bits)
            # update the weights in tensorflow graph for quantizing the
            # activations
            var_values = sess.run(tf.assign(v, var_values))
            print(var_name + ' number of wts/bias: ' + str(var_values.shape) +
                  ' dec bits: ' + str(dec_bits) +
                  ' max: (' + str(var_values.max()) + ',' + str(max_value) +
                  ')' +
                  ' min: (' + str(var_values.min()) + ',' + str(min_value) +
                  ')')

            conv_layer_no = layer_no // 2 + 1

            wt_or_bias = 'BIAS'
            if 'weights' in var_name:
                wt_or_bias = 'WT'

            with open(weights_h_fname, 'a') as f:
                if conv_layer_no == 1:
                    f.write('#define CONV1_{} {{'.format(wt_or_bias))
                elif conv_layer_no <= num_layers:
                    # Even layer numbers are emitted as DS, odd ones as PW.
                    if layer_no % 2 == 0:
                        f.write('#define CONV{}_DS_{} {{'.format(
                            conv_layer_no, wt_or_bias))
                    else:
                        f.write('#define CONV{}_PW_{} {{'.format(
                            conv_layer_no, wt_or_bias))
                else:
                    f.write('#define FINAL_FC_{} {{'.format(wt_or_bias))

                transposed_wts.tofile(f, sep=", ", format="%d")
                f.write('}\n')

            if 'weights' in var_name:
                # Remembered for the bias of the same layer, visited next.
                weights_dec_bits = dec_bits

            if 'biases' in var_name:
                if layer_no == total_layers - 2:  # if average pool layer, go to the next one
                    layer_no += 1
                input_dec_bits = 7 - np.log2(act_max[layer_no - 1])
                output_dec_bits = 7 - np.log2(act_max[layer_no])
                weights_x_input_dec_bits = input_dec_bits + weights_dec_bits
                bias_lshift = int(weights_x_input_dec_bits - dec_bits)
                output_rshift = int(weights_x_input_dec_bits -
                                    output_dec_bits)
                print("Layer no: {} | Bias Lshift: {} | Output Rshift: {}\n".
                      format(layer_no, bias_lshift, output_rshift))
                with open(ds_cnn_h_fname, 'a') as f:
                    if conv_layer_no == 1:
                        f.write("#define CONV1_BIAS_LSHIFT {}\n".format(
                            bias_lshift))
                        f.write("#define CONV1_OUT_RSHIFT {}\n".format(
                            output_rshift))
                    elif conv_layer_no <= num_layers:
                        if layer_no % 2 == 0:
                            f.write(
                                "#define CONV{}_DS_BIAS_LSHIFT {}\n".format(
                                    conv_layer_no, bias_lshift))
                            f.write(
                                "#define CONV{}_DS_OUT_RSHIFT {}\n".format(
                                    conv_layer_no, output_rshift))

                        else:
                            f.write(
                                "#define CONV{}_PW_BIAS_LSHIFT {}\n".format(
                                    conv_layer_no, bias_lshift))
                            f.write(
                                "#define CONV{}_PW_OUT_RSHIFT {}\n".format(
                                    conv_layer_no, output_rshift))
                    else:
                        f.write("#define FINAL_FC_BIAS_LSHIFT {}\n".format(
                            bias_lshift))
                        f.write("#define FINAL_FC_OUT_RSHIFT {}\n".format(
                            output_rshift))

                # A layer is complete once its bias has been handled.
                layer_no += 1

        # Shift between the activations before and after average pooling.
        input_dec_bits = 7 - np.log2(act_max[len(act_max) - 3])
        output_dec_bits = 7 - np.log2(act_max[len(act_max) - 2])
        with open(ds_cnn_h_fname, 'a') as f:
            f.write("#define AVG_POOL_OUT_LSHIFT {}\n\n".format(
                int(output_dec_bits - input_dec_bits)))
            helper.write_ds_cnn_h_end(f, num_layers)

        # Evaluate result after quantization on testing set
        set_size = audio_processor.set_size('testing')
        tf.logging.info('set_size=%d', set_size)
        total_accuracy = 0
        total_conf_matrix = None

        for i in range(0, set_size, batch_size):
            test_fingerprints, test_ground_truth = audio_processor.get_data(
                batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess,
                is_bg_volume_constant, feature_extraction)
            test_accuracy, conf_matrix = sess.run(
                [evaluation_step, confusion_matrix],
                feed_dict={
                    fingerprint_input: test_fingerprints,
                    ground_truth_input: test_ground_truth,
                })

            # The final batch may be short; weight it by its real size
            # without clobbering the ``batch_size`` argument.
            current_batch_size = min(batch_size, set_size - i)
            total_accuracy += (test_accuracy * current_batch_size) / set_size
            if total_conf_matrix is None:
                total_conf_matrix = conf_matrix
            else:
                total_conf_matrix += conf_matrix

        tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
        tf.logging.info('Test accuracy = %.2f%% (N=%d)' %
                        (total_accuracy * 100, set_size))
    finally:
        sess.close()
Пример #5
0
def run_quant_inference(wanted_words, sample_rate, clip_duration_ms,
                        window_size_ms, window_stride_ms,
                        dct_coefficient_count, model_architecture,
                        model_size_info):
  """Quantizes the checkpoint weights to 8 bits and logs the resulting accuracy.

  Builds the model, restores FLAGS.checkpoint, rounds every trainable
  variable onto a power-of-two fixed-point grid, appends the integer values
  to 'weights.h' as one '#define' per variable, stores the de-quantized
  values back into the graph, and finally logs the confusion matrix and the
  accuracy on the training, validation and testing splits.

  Dataset location and split percentages, the activation ranges
  (FLAGS.act_max), the checkpoint path and the batch size are read from
  FLAGS; everything else comes from the arguments.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    model_size_info: Model dimensions : different lengths for different models
  """
  tf.logging.set_verbosity(tf.logging.INFO)
  sess = tf.InteractiveSession()
  try:
    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)

    audio_processor = input_data.AudioProcessor(
        FLAGS.data_url, FLAGS.data_dir, FLAGS.silence_percentage,
        FLAGS.unknown_percentage,
        wanted_words.split(','), FLAGS.validation_percentage,
        FLAGS.testing_percentage, model_settings)

    label_count = model_settings['label_count']
    fingerprint_size = model_settings['fingerprint_size']

    fingerprint_input = tf.placeholder(
        tf.float32, [None, fingerprint_size], name='fingerprint_input')

    # Use the caller-supplied architecture and size instead of silently
    # falling back to FLAGS, so the arguments actually take effect.
    logits = models.create_model(
        fingerprint_input,
        model_settings,
        model_architecture,
        model_size_info,
        FLAGS.act_max,
        is_training=False)

    ground_truth_input = tf.placeholder(
        tf.float32, [None, label_count], name='groundtruth_input')

    predicted_indices = tf.argmax(logits, 1)
    expected_indices = tf.argmax(ground_truth_input, 1)
    correct_prediction = tf.equal(predicted_indices, expected_indices)
    confusion_matrix = tf.confusion_matrix(
        expected_indices, predicted_indices, num_classes=label_count)
    evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    models.load_variables_from_checkpoint(sess, FLAGS.checkpoint)

    def log_accuracy(split, label):
      """Logs the confusion matrix and accuracy of the model on one split."""
      set_size = audio_processor.set_size(split)
      tf.logging.info('set_size=%d', set_size)
      total_accuracy = 0
      total_conf_matrix = None
      for offset in range(0, set_size, FLAGS.batch_size):
        fingerprints, ground_truth = audio_processor.get_data(
            FLAGS.batch_size, offset, model_settings, 0.0, 0.0, 0, split,
            sess)
        accuracy, conf_matrix = sess.run(
            [evaluation_step, confusion_matrix],
            feed_dict={
                fingerprint_input: fingerprints,
                ground_truth_input: ground_truth,
            })
        # Weight each batch by the number of samples still left in the
        # split, so the final (possibly partial) batch is not over-counted.
        batch_size = min(FLAGS.batch_size, set_size - offset)
        total_accuracy += (accuracy * batch_size) / set_size
        if total_conf_matrix is None:
          total_conf_matrix = conf_matrix
        else:
          total_conf_matrix += conf_matrix
      tf.logging.info('Confusion Matrix:\n %s', total_conf_matrix)
      tf.logging.info('%s accuracy = %.2f%% (N=%d)', label,
                      total_accuracy * 100, set_size)

    # Quantize weights to 8-bits using (min,max) and write to file.
    # Start from an empty header; every variable is appended below.
    with open('weights.h', 'w'):
      pass

    for v in tf.trainable_variables():
      var_values = sess.run(v)
      min_value = var_values.min()
      max_value = var_values.max()
      max_abs = max(abs(min_value), abs(max_value))
      # log2(0) is -inf, which int() cannot convert; an all-zero tensor
      # needs no integer bits, so it gets 7 fractional bits.
      int_bits = 0 if max_abs == 0 else int(np.ceil(np.log2(max_abs)))
      dec_bits = 7 - int_bits
      # Scale onto the integer grid, nominally [-128,128).
      # NOTE(review): a max_abs that is an exact power of two scales to
      # +128, which does not fit a signed 8-bit value - confirm how the
      # consumer of weights.h handles that.
      var_values = np.round(var_values * 2**dec_bits)
      var_name = str(v.name).replace('/', '_').replace(':', '_')
      if len(var_values.shape) > 2:
        # Convolution layer weights: move the last axis to the front.
        transposed_wts = np.transpose(var_values, (3, 0, 1, 2))
      else:
        # Fully connected layer weights or biases of any layer.
        transposed_wts = np.transpose(var_values)
      with open('weights.h', 'a') as f:
        f.write('#define ' + var_name + ' {')
        transposed_wts.tofile(f, sep=", ", format="%d")
        f.write('}\n')
      # Convert back to the original range, now limited to the 8-bit grid,
      # and store it in the graph so the accuracy below reflects the
      # quantized weights.
      var_values = var_values / (2**dec_bits)
      var_values = sess.run(tf.assign(v, var_values))
      print(var_name + ' number of wts/bias: ' + str(var_values.shape) +
            ' dec bits: ' + str(dec_bits) +
            ' max: (' + str(var_values.max()) + ',' + str(max_value) + ')' +
            ' min: (' + str(var_values.min()) + ',' + str(min_value) + ')')

    log_accuracy('training', 'Training')
    log_accuracy('validation', 'Validation')
    log_accuracy('testing', 'Test')
  finally:
    # Release the session even when evaluation fails part-way through.
    sess.close()
Пример #6
0
def run_quant_inference(act_max):
    """Measures validation accuracy of the 8-bit quantized model for `act_max`.

    All model and data settings (wanted words, audio front-end parameters,
    architecture, size info, checkpoint, batch size, data locations) are read
    from FLAGS. The default graph is reset before the model is built and again
    before returning, and the session created here is always closed.

    If FLAGS.validation_dir is None it is set to FLAGS.data_dir (this
    assignment persists on FLAGS after the call).

    Args:
      act_max: Sequence of per-layer activation maxima. It is passed to
        models.create_model and indexed by layer number to derive the
        fixed-point shifts; the shift check for a layer is skipped when
        either of the two entries it needs is 0.

    Returns:
      The accuracy on the 'validation' split as a fraction (batch accuracies
      weighted by batch size), or -1 when some layer would require a negative
      bias left-shift or output right-shift.
    """
    wanted_words = FLAGS.wanted_words
    sample_rate = FLAGS.sample_rate
    clip_duration_ms = FLAGS.clip_duration_ms
    window_size_ms = FLAGS.window_size_ms
    window_stride_ms = FLAGS.window_stride_ms
    dct_coefficient_count = FLAGS.dct_coefficient_count
    model_architecture = FLAGS.model_architecture
    model_size_info = FLAGS.model_size_info

    total_layers = len(act_max)
    layer_no = 1
    weights_dec_bits = 0

    tf.reset_default_graph()
    tf.logging.set_verbosity(tf.logging.INFO)
    sess = tf.InteractiveSession()
    try:
        words_list = input_data.prepare_words_list(wanted_words.split(','))
        model_settings = models.prepare_model_settings(
            len(words_list), sample_rate, clip_duration_ms, window_size_ms,
            window_stride_ms, dct_coefficient_count)

        if FLAGS.validation_dir is None:
            FLAGS.validation_dir = FLAGS.data_dir
        # Percentages 100 / 0 are passed as the validation / testing shares.
        validation_audio_processor = input_data.AudioProcessor(
            FLAGS.data_url, FLAGS.validation_dir,
            FLAGS.silence_percentage, FLAGS.unknown_percentage,
            wanted_words.split(','), 100, 0, model_settings)

        label_count = model_settings['label_count']
        fingerprint_size = model_settings['fingerprint_size']

        fingerprint_input = tf.placeholder(tf.float32,
                                           [None, fingerprint_size],
                                           name='fingerprint_input')

        logits = models.create_model(fingerprint_input,
                                     model_settings,
                                     model_architecture,
                                     model_size_info,
                                     act_max,
                                     is_training=False)

        ground_truth_input = tf.placeholder(tf.float32, [None, label_count],
                                            name='groundtruth_input')

        predicted_indices = tf.argmax(logits, 1)
        expected_indices = tf.argmax(ground_truth_input, 1)
        correct_prediction = tf.equal(predicted_indices, expected_indices)
        confusion_matrix = tf.confusion_matrix(expected_indices,
                                               predicted_indices,
                                               num_classes=label_count)
        evaluation_step = tf.reduce_mean(
            tf.cast(correct_prediction, tf.float32))
        models.load_variables_from_checkpoint(sess, FLAGS.checkpoint)

        # Quantize weights to 8-bits using (min,max)
        for v in tf.trainable_variables():
            var_name = str(v.name)
            var_values = sess.run(v)
            max_abs = max(abs(var_values.min()), abs(var_values.max()))
            # log2(0) is -inf, which int() cannot convert; an all-zero
            # tensor needs no integer bits, so it gets 7 fractional bits.
            int_bits = 0 if max_abs == 0 else int(np.ceil(np.log2(max_abs)))
            dec_bits = 7 - int_bits
            # Round onto the 8-bit grid, scale back to the original range,
            # and store the result in the graph so the accuracy measured
            # below reflects the quantized weights.
            quantized = np.round(var_values * 2**dec_bits) / (2**dec_bits)
            sess.run(tf.assign(v, quantized))

            if 'weights' in var_name:
                # Remembered for the 'biases' variable handled next.
                weights_dec_bits = dec_bits

            if 'biases' in var_name:
                # if average pool layer, go to the next one
                if layer_no == total_layers - 2:
                    layer_no += 1

                if act_max[layer_no] != 0 and act_max[layer_no - 1] != 0:
                    input_dec_bits = 7 - np.log2(act_max[layer_no - 1])
                    output_dec_bits = 7 - np.log2(act_max[layer_no])
                    weights_x_input_dec_bits = (input_dec_bits +
                                                weights_dec_bits)
                    bias_lshift = int(weights_x_input_dec_bits - dec_bits)
                    output_rshift = int(weights_x_input_dec_bits -
                                        output_dec_bits)
                    if bias_lshift < 0 or output_rshift < 0:
                        print("CMSIS-5 NN doesn't support negative shift now!")
                        return -1

                layer_no += 1

        # validation set
        set_size = validation_audio_processor.set_size('validation')
        tf.logging.info('set_size=%d', set_size)
        total_accuracy = 0
        total_conf_matrix = None
        for i in range(0, set_size, FLAGS.batch_size):
            validation_fingerprints, validation_ground_truth = (
                validation_audio_processor.get_data(FLAGS.batch_size, i,
                                                    model_settings, 0.0, 0.0,
                                                    0, 'validation', sess))
            validation_accuracy, conf_matrix = sess.run(
                [evaluation_step, confusion_matrix],
                feed_dict={
                    fingerprint_input: validation_fingerprints,
                    ground_truth_input: validation_ground_truth,
                })
            # Weight each batch by the number of samples still left in the
            # split, so the final (possibly partial) batch is not
            # over-counted.
            batch_size = min(FLAGS.batch_size, set_size - i)
            total_accuracy += (validation_accuracy * batch_size) / set_size
            if total_conf_matrix is None:
                total_conf_matrix = conf_matrix
            else:
                total_conf_matrix += conf_matrix
        tf.logging.info('Confusion Matrix:\n %s', total_conf_matrix)
        tf.logging.info('Validation accuracy = %.2f%% (N=%d)',
                        total_accuracy * 100, set_size)

        return total_accuracy
    finally:
        # Single cleanup path for the success, negative-shift and error
        # exits, in the same order the original used.
        tf.reset_default_graph()
        sess.close()
Пример #7
0
def run_quant_inference(wanted_words, sample_rate, clip_duration_ms,
                        window_size_ms, window_stride_ms,
                        dct_coefficient_count, model_architecture,
                        model_size_info):
    """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    model_size_info: Model dimensions : different lengths for different models
  """

    act_max = FLAGS.act_max
    for maxium in act_max:
        if maxium == 0:
            print('Calling quant_act_max.py to get best act_max')
            quant_act_max.FLAGS = FLAGS
            act_max = quant_act_max.get_best_act_max(act_max)

    tf.logging.set_verbosity(tf.logging.INFO)
    sess = tf.InteractiveSession()
    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)

    label_count = model_settings['label_count']
    fingerprint_size = model_settings['fingerprint_size']

    fingerprint_input = tf.placeholder(tf.float32, [None, fingerprint_size],
                                       name='fingerprint_input')

    logits = models.create_model(fingerprint_input,
                                 model_settings,
                                 model_architecture,
                                 model_size_info,
                                 act_max,
                                 is_training=False)

    ground_truth_input = tf.placeholder(tf.float32, [None, label_count],
                                        name='groundtruth_input')

    predicted_indices = tf.argmax(logits, 1)
    expected_indices = tf.argmax(ground_truth_input, 1)
    correct_prediction = tf.equal(predicted_indices, expected_indices)
    confusion_matrix = tf.confusion_matrix(expected_indices,
                                           predicted_indices,
                                           num_classes=label_count)
    evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    models.load_variables_from_checkpoint(sess, FLAGS.checkpoint)

    # Quantize weights to 8-bits using (min,max) and write to file
    f = open('weights.h', 'wb')
    f.close()

    if model_architecture == "ds_cnn":
        num_layers = model_size_info[0]
        helper.write_ds_cnn_c_file('ds_cnn.c', num_layers)

        ds_cnn_h_fname = "ds_cnn.h"
        weights_h_fname = "ds_cnn_weights.h"

        f = open(ds_cnn_h_fname, 'wb')
        f.close()

        with open(ds_cnn_h_fname, 'a') as f:
            helper.write_ds_cnn_h_beginning(f, wanted_words, sample_rate,
                                            clip_duration_ms, window_size_ms,
                                            window_stride_ms,
                                            dct_coefficient_count,
                                            model_size_info, act_max)

        # Quantize weights to 8-bits using (min,max) and write to file
        f = open(weights_h_fname, 'wb')
        f.close()

        total_layers = len(act_max)
        layer_no = 1
        weights_dec_bits = 0

    for v in tf.trainable_variables():
        var_name = str(v.name)
        var_values = sess.run(v)
        min_value = var_values.min()
        max_value = var_values.max()
        int_bits = int(np.ceil(np.log2(max(abs(min_value), abs(max_value)))))
        dec_bits = 7 - int_bits
        # convert to [-128,128) or int8
        var_values = np.round(var_values * 2**dec_bits)
        var_name = var_name.replace('/', '_')
        var_name = var_name.replace(':', '_')
        with open('weights.h', 'a') as f:
            f.write('#define ' + var_name + ' {')
        if (len(var_values.shape) > 2):  #convolution layer weights
            transposed_wts = np.transpose(var_values, (3, 0, 1, 2))
        else:  #fully connected layer weights or biases of any layer
            transposed_wts = np.transpose(var_values)
        with open('weights.h', 'a') as f:
            transposed_wts.tofile(f, sep=", ", format="%d")
            f.write('}\n')
        # convert back original range but quantized to 8-bits or 256 levels
        var_values = var_values / (2**dec_bits)
        # update the weights in tensorflow graph for quantizing the activations
        var_values = sess.run(tf.assign(v, var_values))
        print(var_name+' number of wts/bias: '+str(var_values.shape)+\
                ' dec bits: '+str(dec_bits)+\
                ' max: ('+str(var_values.max())+','+str(max_value)+')'+\
                ' min: ('+str(var_values.min())+','+str(min_value)+')')

        if model_architecture == "ds_cnn":
            conv_layer_no = layer_no // 2 + 1

            wt_or_bias = 'BIAS'
            if 'weights' in var_name:
                wt_or_bias = 'WT'

            with open(weights_h_fname, 'a') as f:
                if conv_layer_no == 1:
                    f.write('#define CONV1_{} {{'.format(wt_or_bias))
                elif conv_layer_no <= num_layers:
                    if layer_no % 2 == 0:
                        f.write('#define CONV{}_DS_{} {{'.format(
                            conv_layer_no, wt_or_bias))
                    else:
                        f.write('#define CONV{}_PW_{} {{'.format(
                            conv_layer_no, wt_or_bias))
                else:
                    f.write('#define FINAL_FC_{} {{'.format(wt_or_bias))

                transposed_wts.tofile(f, sep=", ", format="%d")
                f.write('}\n')

            if 'weights' in var_name:
                weights_dec_bits = dec_bits

            if 'biases' in var_name:
                # if averege pool layer, go to the next one
                if layer_no == total_layers - 2:
                    layer_no += 1

                input_dec_bits = 7 - np.log2(act_max[layer_no - 1])
                output_dec_bits = 7 - np.log2(act_max[layer_no])
                weights_x_input_dec_bits = input_dec_bits + weights_dec_bits
                bias_lshift = int(weights_x_input_dec_bits - dec_bits)
                output_rshift = int(weights_x_input_dec_bits - output_dec_bits)
                print("Layer no: {} | Bias Lshift: {} | Output Rshift: {}\n".
                      format(layer_no, bias_lshift, output_rshift))
                with open('ds_cnn.h', 'a') as f:
                    if conv_layer_no == 1:
                        f.write("#define CONV1_BIAS_LSHIFT {}\n".format(
                            bias_lshift))
                        f.write("#define CONV1_OUT_RSHIFT {}\n".format(
                            output_rshift))
                    elif conv_layer_no <= num_layers:
                        if layer_no % 2 == 0:
                            f.write(
                                "#define CONV{}_DS_BIAS_LSHIFT {}\n".format(
                                    conv_layer_no, bias_lshift))
                            f.write("#define CONV{}_DS_OUT_RSHIFT {}\n".format(
                                conv_layer_no, output_rshift))
                        else:
                            f.write(
                                "#define CONV{}_PW_BIAS_LSHIFT {}\n".format(
                                    conv_layer_no, bias_lshift))
                            f.write("#define CONV{}_PW_OUT_RSHIFT {}\n".format(
                                conv_layer_no, output_rshift))
                    else:
                        f.write("#define FINAL_FC_BIAS_LSHIFT {}\n".format(
                            bias_lshift))
                        f.write("#define FINAL_FC_OUT_RSHIFT {}\n".format(
                            output_rshift))

                layer_no += 1

    if model_architecture == "ds_cnn":
        input_dec_bits = 7 - np.log2(act_max[len(act_max) - 3])
        output_dec_bits = 7 - np.log2(act_max[len(act_max) - 2])

        if input_dec_bits > output_dec_bits:
            output_dec_bits = input_dec_bits

        with open(ds_cnn_h_fname, 'a') as f:
            f.write("#define AVG_POOL_OUT_LSHIFT {}\n\n".format(
                int(output_dec_bits - input_dec_bits)))
            helper.write_ds_cnn_h_end(f, num_layers)