def testPreprocessStreamInferenceModeTFandTFLite(self,
                                                 preprocess,
                                                 feature_type,
                                                 model_name='gru'):
  """Checks streaming conversion (TF and TFLite) for one preprocessing setup.

  Args:
    preprocess: value written to `preprocess` on the model params.
    feature_type: value written to `feature_type` on the model params.
    model_name: key used to look up `model_params.HOTWORD_MODEL_PARAMS`.
  """
  # Configure the parameters under test.
  flags = model_params.HOTWORD_MODEL_PARAMS[model_name]
  flags.preprocess = preprocess
  flags.feature_type = feature_type
  flags = model_flags.update_flags(flags)

  # Build the non streaming model that every conversion starts from.
  non_stream_model = models.MODELS[flags.model_name](flags)

  external_state = modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE
  internal_state = modes.Modes.STREAM_INTERNAL_STATE_INFERENCE

  # TF non streaming -> TFLite streaming inference with external states.
  tflite_external = utils.model_to_tflite(self.sess, non_stream_model, flags,
                                          external_state)
  self.assertTrue(tflite_external)

  # TF non streaming -> TF streaming with external states.
  tf_external = utils.to_streaming_inference(non_stream_model, flags,
                                             external_state)
  self.assertTrue(tf_external)

  # TF non streaming -> TF streaming with internal states.
  tf_internal = utils.to_streaming_inference(non_stream_model, flags,
                                             internal_state)
  self.assertTrue(tf_internal)
def _RunGetDataTest(self, preprocess, window_size_ms):
  """Writes a temporary wav data set and checks `get_data` output lengths.

  Args:
    preprocess: value written to `flags.preprocess`.
    window_size_ms: value written to `flags.window_size_ms`.
  """
  train_root = self.get_temp_dir()

  # Labelled wav folders live under <tmp>/wavs.
  data_root = os.path.join(train_root, "wavs")
  os.mkdir(data_root)
  self._SaveWavFolders(data_root, ["a", "b", "c"], 100)

  # Ten background files, all written from the same wav data.
  noise_root = os.path.join(data_root, "_background_noise_")
  os.mkdir(noise_root)
  noise_wav = self._GetWavData()
  for index in range(10):
    self._SaveTestWavFile(
        os.path.join(noise_root, "background_audio_%d.wav" % index),
        noise_wav)

  flags = self._GetDefaultFlags()
  flags.window_size_ms = window_size_ms
  flags.preprocess = preprocess
  flags.train_dir = train_root
  flags.data_dir = data_root
  flags = model_flags.update_flags(flags)

  with self.cached_session() as sess:
    processor = input_data.AudioProcessor(flags)
    fetched = processor.get_data(10, 0, flags, 0.3, 0.1, 100, "training",
                                 0.0, 0.0, sess)
    result_data, result_labels = fetched
    self.assertLen(result_data, 10)
    self.assertLen(result_labels, 10)
def test_model_to_saved(self, model_name='dnn'):
  """SavedModel supports both stateless and stateful graphs.

  Args:
    model_name: key used to look up `model_params.HOTWORD_MODEL_PARAMS`.
  """
  flags = model_flags.update_flags(
      model_params.HOTWORD_MODEL_PARAMS[model_name])

  # Build the model, then export it into the test temp directory.
  build_fn = models.MODELS[flags.model_name]
  built_model = build_fn(flags)
  utils.model_to_saved(built_model, flags, FLAGS.test_tmpdir)
def ds_tc_resnet_model_params(use_tf_fft=False):
  """Builds a small parameter set for the 'ds_tc_resnet' model.

  Args:
    use_tf_fft: stored as `use_tf_fft`; `mel_non_zero_only` is set to its
      negation.

  Returns:
    The result of `model_flags.update_flags` on the overridden params, with
    `data_stride`, `data_shape`, `desired_samples` and `batch_size` set
    afterwards.
  """
  flags = model_params.HOTWORD_MODEL_PARAMS['ds_tc_resnet']

  overrides = (
      ('causal_data_frame_padding', 1),  # causal padding on DataFrame
      ('clip_duration_ms', 160),
      ('use_tf_fft', use_tf_fft),
      ('mel_non_zero_only', not use_tf_fft),
      ('feature_type', 'mfcc_tf'),
      ('window_size_ms', 5.0),
      ('window_stride_ms', 2.0),
      ('wanted_words', 'a,b,c'),
      ('ds_padding', "'causal','causal','causal','causal'"),
      ('ds_filters', '4,4,4,2'),
      ('ds_repeat', '1,1,1,1'),
      ('ds_residual', '0,1,1,1'),  # no residuals on strided layers
      ('ds_kernel_size', '3,3,3,1'),
      ('ds_dilation', '1,1,1,1'),
      ('ds_stride', '2,1,1,1'),  # streaming conv with stride
      ('ds_pool', '1,2,1,1'),  # streaming conv with pool
      ('ds_filter_separable', '1,1,1,1'),
  )
  for field, value in overrides:
    setattr(flags, field, value)

  # convert ms to samples and compute labels count
  flags = model_flags.update_flags(flags)

  # Total stride is the product of every pool/stride factor greater than one.
  pool_factors = model_utils.parse(flags.ds_pool)
  stride_factors = model_utils.parse(flags.ds_stride)
  factors = [1]
  factors.extend(p for p in pool_factors if p > 1)
  factors.extend(s for s in stride_factors if s > 1)
  total_stride = np.prod(factors)

  # override input data shape for streaming model with stride/pool
  flags.data_stride = total_stride
  flags.data_shape = (total_stride * flags.window_stride_samples,)

  # Desired frame count (16), rounded down to a multiple of the total stride.
  frames_number = (16 // total_stride) * total_stride

  # number of input audio samples required to produce one output frame
  window_overlap = max(
      0, flags.window_size_samples - flags.window_stride_samples)
  framing_stride = max(flags.window_stride_samples, window_overlap)

  # desired number of samples in the input data to train non streaming model
  flags.desired_samples = framing_stride * frames_number
  flags.batch_size = 1
  return flags
def testTrain(self, split_data):
  """Runs training and checks that the expected files exist in train_dir.

  Args:
    split_data: forwarded to `self._GetDefaultFlags`.
  """
  flags = model_flags.update_flags(self._GetDefaultFlags(split_data))
  train.train(flags)

  # Files expected in the training directory once training has run.
  for artifact in ('graph.pbtxt', 'labels.txt', 'accuracy_last.txt'):
    artifact_path = os.path.join(flags.train_dir, artifact)
    self.assertTrue(tf.io.gfile.exists(artifact_path))
def get_model_with_default_params(model_name, mode=None):
  """Creates a model with the params specified in HOTWORD_MODEL_PARAMS.

  Args:
    model_name: key into `model_params.HOTWORD_MODEL_PARAMS`.
    mode: optional mode; when not None the model is passed through
      `to_streaming_inference` with this mode before being returned.

  Returns:
    The model built by `kws_models.MODELS[params.model_name](params)`, or the
    result of `to_streaming_inference` on it when `mode` is given.

  Raises:
    KeyError: if `model_name` is not a key of HOTWORD_MODEL_PARAMS.
  """
  if model_name not in model_params.HOTWORD_MODEL_PARAMS:
    # `.keys` has to be called; interpolating the bare attribute prints the
    # bound-method repr instead of the list of valid model names.
    valid_names = list(model_params.HOTWORD_MODEL_PARAMS.keys())
    raise KeyError(
        "Expected 'model_name' to be one of "
        f"{valid_names} but got '{model_name}'.")

  params = model_params.HOTWORD_MODEL_PARAMS[model_name]
  params = model_flags.update_flags(params)
  model = kws_models.MODELS[params.model_name](params)
  if mode is not None:
    model = to_streaming_inference(model, flags=params, mode=mode)
  return model
def _GetDefaultFlags(self, split_data):
  """Builds dnn params pointing at dummy data and per-split output folders.

  Args:
    split_data: when equal to 1 the data comes from
      `_PrepareDummyTrainingData`, otherwise from
      `_PrepareDummyTrainingDataSplit`; also used as a suffix for the
      summaries/train directory names.

  Returns:
    The result of `model_flags.update_flags` on the configured params.
  """
  flags = model_params.dnn_params()

  if split_data == 1:
    flags.data_dir = self._PrepareDummyTrainingData()
  else:
    flags.data_dir = self._PrepareDummyTrainingDataSplit()

  flags.wanted_words = 'a,b,c'
  flags.split_data = split_data

  suffix = str(split_data)
  flags.summaries_dir = self._PrepareDummyDir('summaries' + suffix)
  flags.train_dir = self._PrepareDummyDir('train' + suffix)

  flags.how_many_training_steps = '2'
  flags.learning_rate = '0.01'
  flags.eval_step_interval = 1
  flags.save_step_interval = 1
  flags.clip_duration_ms = 100
  flags.batch_size = 1

  updated_flags = model_flags.update_flags(flags)
  return updated_flags
def _testTFLite(self,
                preprocess='raw',
                feature_type='mfcc_op',
                model_name='svdf'):
  """Checks conversion of a model to non streaming TFLite inference.

  Args:
    preprocess: value written to `preprocess` on the model params.
    feature_type: value written to `feature_type` on the model params.
    model_name: key used to look up `model_params.HOTWORD_MODEL_PARAMS`.
  """
  flags = model_params.HOTWORD_MODEL_PARAMS[model_name]
  flags.clip_duration_ms = 100  # make it shorter for testing

  # Parameters under test.
  flags.preprocess = preprocess
  flags.feature_type = feature_type
  flags = model_flags.update_flags(flags)

  non_stream_model = models.MODELS[flags.model_name](flags)

  # TF non streaming model -> TFLite non streaming inference.
  converted = utils.model_to_tflite(self.sess, non_stream_model, flags,
                                    modes.Modes.NON_STREAM_INFERENCE)
  self.assertTrue(converted)
def main(_):
  """Optionally trains a model, then converts and evaluates it.

  Steps, in order:
    1. Update flags from the global FLAGS.
    2. If `flags.train` is set, create `train_dir`/`summaries_dir` and train;
       otherwise require that `train_dir` already exists.
    3. Dump all flags to `<train_dir>/flags.json`.
    4. Convert to SavedModel (non streaming, and streaming with internal
       state; failures of the latter are logged, not raised).
    5. Evaluate non streaming accuracy with TF, with and without time shift.
    6. For each TFLite optimization option: convert and evaluate the non
       streaming TFLite model and, when the model is streamable, the
       streaming TF/TFLite variants.

  Args:
    _: unused positional argument.

  Raises:
    ValueError: if `flags.train` is false and `flags.train_dir` is not a
      directory.
  """
  # NOTE(review): block nesting below was reconstructed from a
  # whitespace-collapsed source - confirm against the original layout.
  # Update flags
  flags = model_flags.update_flags(FLAGS)

  if flags.train:
    # Create model folders where logs and model will be stored
    os.makedirs(flags.train_dir)
    os.mkdir(flags.summaries_dir)

    # Model training
    train.train(flags)
  else:
    if not os.path.isdir(flags.train_dir):
      raise ValueError(
          'model is not trained set "--train 1" and retrain it')

  # write all flags settings into json
  with open(os.path.join(flags.train_dir, 'flags.json'), 'wt') as f:
    json.dump(flags.__dict__, f)

  # convert to SavedModel
  test.convert_model_saved(flags, 'non_stream',
                           modes.Modes.NON_STREAM_INFERENCE)
  try:
    test.convert_model_saved(flags, 'stream_state_internal',
                             modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
  except (ValueError, IndexError) as e:
    # Streaming conversion is best effort: log the failure and continue.
    logging.info('FAILED to run TF streaming: %s', e)

  logging.info('run TF non streaming model accuracy evaluation')
  # with TF
  folder_name = 'tf'
  test.tf_non_stream_model_accuracy(flags, folder_name)

  # with TF.
  # We can apply non stream model on stream data, by running inference
  # every 200ms (for example), so that total latency will be similar with
  # streaming model which is executed every 20ms.
  # To measure the impact of sampling on model accuracy,
  # we introduce time_shift_ms during accuracy evaluation.
  # Convert milliseconds to samples:
  time_shift_samples = int(
      (flags.time_shift_ms * flags.sample_rate) / model_flags.MS_PER_SECOND)

  test.tf_non_stream_model_accuracy(
      flags,
      folder_name,
      time_shift_samples,
      accuracy_name='tf_non_stream_model_sampling_stream_accuracy.txt')

  # Maps an output-folder prefix to the TFLite optimizations passed to
  # `test.convert_model_tflite`; the empty prefix means no optimization.
  name2opt = {
      '': None,
      'quantize_opt_for_size_': [tf.lite.Optimize.DEFAULT],
  }

  for opt_name, optimizations in name2opt.items():

    if (opt_name and flags.feature_type == 'mfcc_tf' and
        flags.preprocess == 'raw'):
      logging.info(
          'feature type mfcc_tf needs quantization aware training '
          'for quantization - it is not implemented')
      continue

    folder_name = opt_name + 'tflite_non_stream'
    file_name = 'non_stream.tflite'
    mode = modes.Modes.NON_STREAM_INFERENCE
    test.convert_model_tflite(flags, folder_name, mode, file_name,
                              optimizations=optimizations)
    test.tflite_non_stream_model_accuracy(flags, folder_name, file_name)

    # these models are using bi-rnn, so they are non streamable by default
    # also models using striding or pooling are not supported for streaming now
    non_streamable_models = {'att_mh_rnn', 'att_rnn', 'tc_resnet'}

    model_is_streamable = True
    if flags.model_name in non_streamable_models:
      model_is_streamable = False
    # below models can use striding in time dimension,
    # but this is currently unsupported
    elif flags.model_name == 'cnn':
      for strides in model_utils.parse(flags.cnn_strides):
        if strides[0] > 1:
          model_is_streamable = False
          break
    elif flags.model_name == 'ds_cnn':
      if model_utils.parse(flags.cnn1_strides)[0] > 1:
        model_is_streamable = False
      for strides in model_utils.parse(flags.dw2_strides):
        if strides[0] > 1:
          model_is_streamable = False
          break

    # set input data shape for testing inference in streaming mode
    flags.data_shape = modes.get_input_data_shape(
        flags, modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE)

    # if model can be streamed, then run conversion/evaluation in streaming mode
    if model_is_streamable:
      # ---------------- TF streaming model accuracy evaluation ----------------
      # Streaming model with external state evaluation using TF with state reset
      if not opt_name:
        logging.info(
            'run TF evalution only without optimization/quantization')
        try:
          folder_name = 'tf'
          test.tf_stream_state_external_model_accuracy(
              flags,
              folder_name,
              accuracy_name=
              'stream_state_external_model_accuracy_sub_set_reset1.txt',
              reset_state=True
          )  # with state reset between test sequences

          # Streaming (with external state) evaluation using TF no state reset
          test.tf_stream_state_external_model_accuracy(
              flags,
              folder_name,
              accuracy_name=
              'stream_state_external_model_accuracy_sub_set_reset0.txt',
              reset_state=False)  # without state reset

          # Streaming (with internal state) evaluation using TF no state reset
          test.tf_stream_state_internal_model_accuracy(
              flags, folder_name)
        except (ValueError, IndexError) as e:
          logging.info('FAILED to run TF streaming: %s', e)

      logging.info('run TFlite streaming model accuracy evaluation')
      try:
        # convert model to TFlite
        folder_name = opt_name + 'tflite_stream_state_external'
        file_name = 'stream_state_external.tflite'
        mode = modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE
        test.convert_model_tflite(flags, folder_name, mode, file_name,
                                  optimizations=optimizations)

        # Streaming model accuracy evaluation with TFLite with state reset
        test.tflite_stream_state_external_model_accuracy(
            flags,
            folder_name,
            file_name,
            accuracy_name=
            'tflite_stream_state_external_model_accuracy_reset1.txt',
            reset_state=True)

        # Streaming model accuracy evaluation with TFLite without state reset
        test.tflite_stream_state_external_model_accuracy(
            flags,
            folder_name,
            file_name,
            accuracy_name=
            'tflite_stream_state_external_model_accuracy_reset0.txt',
            reset_state=False)
      except (ValueError, IndexError) as e:
        logging.info('FAILED to run TFLite streaming: %s', e)
def __init__(self, batch_size=512, version=1, preprocess="raw"):
  """Configures flags, a TF1 session and the audio processor.

  The data is read from `data/google-speech-commands/data<version>`.
  The archives can be downloaded from
    https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.01.tar.gz
    https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz
    https://docs.google.com/uc?export=download&id=1OAN3h4uffi5HS7eb7goklWeI2XPm1jCS
  and should be extracted in the google-speech-commands directory.

  Args:
    batch_size: stored as `self.flags.batch_size`.
    version: selects the `data<version>` sub folder; version 3 additionally
      sets the wanted words for the V2_35 dataset.
    preprocess: stored as `preprocess` on the flags.
  """
  data_path = os.path.join("data", "google-speech-commands",
                           f"data{version}")

  params = model_params.Params()
  params.data_dir = data_path
  params.verbosity = logging.ERROR

  # set wanted words for V2_35 dataset
  if version == 3:
    params.wanted_words = (
        'visual,wow,learn,backward,dog,two,left,happy,nine,go,up,bed,stop,'
        'one,zero,tree,seven,on,four,bird,right,eight,no,six,forward,house,'
        'marvin,sheila,five,off,three,down,cat,follow,yes')

  params.split_data = 0

  use_tf_fft = True
  settings = {
      # speech feature extractor properties
      'mel_upper_edge_hertz': 7600,
      'window_size_ms': 30.0,
      'window_stride_ms': 10.0,
      'mel_num_bins': 80,
      'dct_num_features': 40,
      'feature_type': 'mfcc_tf',
      'preprocess': preprocess,
      # for numerical correctness of streaming and non streaming models
      # set it to 1 but for real use case streaming set it to 0
      'causal_data_frame_padding': 0,
      'use_tf_fft': use_tf_fft,
      'mel_non_zero_only': not use_tf_fft,
      # data augmentation parameters
      'resample': 0.15,
      'time_shift_ms': 100,
      'use_spec_augment': 1,
      'time_masks_number': 2,
      'time_mask_max_size': 25,
      'frequency_masks_number': 2,
      'frequency_mask_max_size': 7,
      'pick_deterministically': 1,
  }
  for field, value in settings.items():
    setattr(params, field, value)

  self.flags = model_flags.update_flags(params)

  import absl
  absl.logging.set_verbosity(self.flags.verbosity)

  self.flags.batch_size = batch_size

  # Time shift expressed in samples instead of milliseconds.
  shift_ms_times_rate = self.flags.time_shift_ms * self.flags.sample_rate
  self.time_shift_samples = int(shift_ms_times_rate / 1000)

  tf1.disable_eager_execution()
  # Session config with the GPU device count set to 0.
  session_config = tf1.ConfigProto(device_count={'GPU': 0})
  self.sess = tf1.Session(config=session_config)
  # NOTE: the session is not registered with the Keras backend here.
  self.audio_processor = input_data.AudioProcessor(self.flags)
def setUp(self):
  """Creates a session, a small ds_tc_resnet model and reference outputs.

  After this runs the test has `self.params`, `self.model`,
  `self.input_data` (random input) and `self.non_stream_out` (the model
  prediction for that input in non streaming mode).
  """
  super(DsTcResnetTest, self).setUp()

  session_config = tf1.ConfigProto()
  session_config.gpu_options.allow_growth = True
  self.sess = tf1.Session(config=session_config)
  tf1.keras.backend.set_session(self.sess)
  test_utils.set_seed(123)
  tf.keras.backend.set_learning_phase(0)

  # model parameters
  self.params = model_params.HOTWORD_MODEL_PARAMS['ds_tc_resnet']
  flags = self.params
  overrides = (
      ('clip_duration_ms', 160),
      ('window_size_ms', 4.0),
      ('window_stride_ms', 2.0),
      ('wanted_words', 'a,b,c'),
      ('ds_padding', "'causal','causal','causal'"),
      ('ds_filters', '8,8,4'),
      ('ds_repeat', '1,1,1'),
      ('ds_residual', '0,1,1'),  # residual can not be applied with stride
      ('ds_kernel_size', '3,3,3'),
      ('ds_stride', '2,1,1'),  # streaming conv with stride
      ('ds_dilation', '1,1,1'),
      ('ds_pool', '1,2,1'),  # streaming conv with pool
      ('ds_filter_separable', '1,1,1'),
  )
  for field, value in overrides:
    setattr(flags, field, value)

  # convert ms to samples and compute labels count
  self.params = flags = model_flags.update_flags(flags)

  # Total stride is the product of every pool/stride factor greater than one.
  pool_factors = utils.parse(flags.ds_pool)
  stride_factors = utils.parse(flags.ds_stride)
  factors = [1]
  factors.extend(p for p in pool_factors if p > 1)
  factors.extend(s for s in stride_factors if s > 1)
  total_stride = np.prod(factors)

  # override input data shape for streaming model with stride/pool
  flags.data_stride = total_stride
  flags.data_frame_padding = 'causal'

  # Desired frame count (16), rounded down to a multiple of the total stride.
  frames_number = (16 // total_stride) * total_stride

  # number of input audio samples required to produce one output frame
  window_overlap = max(
      0, flags.window_size_samples - flags.window_stride_samples)
  framing_stride = max(flags.window_stride_samples, window_overlap)

  # desired number of samples in the input data to train non streaming model
  flags.desired_samples = framing_stride * frames_number
  flags.batch_size = 1

  self.model = ds_tc_resnet.model(flags)
  self.model.summary()

  self.input_data = np.random.rand(flags.batch_size, flags.desired_samples)

  # run non streaming inference
  self.non_stream_out = self.model.predict(self.input_data)