def test_compute_embedding_dofn(self, average_over_time, sample_rate_key,
                                sample_rate):
  """Checks the key and output shape produced by `ComputeEmbeddingMapFn`."""
  audio_key = 'audio_key'
  input_key = 'oldkey'

  # Test input: a tf.train.Example with 2000 zero-valued float samples and,
  # when a sample-rate key is supplied, an int64 sample-rate value of 8000.
  example = tf.train.Example()
  feature_map = example.features.feature
  feature_map[audio_key].float_list.value.extend(np.zeros(2000, np.float32))
  if sample_rate_key:
    feature_map[sample_rate_key].int64_list.value.append(8000)

  embedding_fn = beam_dofns.ComputeEmbeddingMapFn(
      name='module_name',
      module='@loc',
      output_key='output_key',
      audio_key=audio_key,
      sample_rate_key=sample_rate_key,
      sample_rate=sample_rate,
      average_over_time=average_over_time,
      setup_fn=lambda _: FakeMod())
  embedding_fn.setup()
  output_key, embedding = next(embedding_fn.process((input_key, example)))

  # The key must pass through unchanged.
  self.assertEqual(output_key, input_key)

  # With time-averaging the expected shape is (1, BASE_SHAPE_[1]); otherwise
  # it is BASE_SHAPE_ itself.
  if average_over_time:
    expected_shape = (1, BASE_SHAPE_[1])
  else:
    expected_shape = BASE_SHAPE_
  self.assertEqual(embedding.shape, expected_shape)
def test_compute_embedding_map_fn_tflite(self, average_over_time,
                                         sample_rate_key, sample_rate):
  """Checks `ComputeEmbeddingMapFn` with a TFLite-style module and frontend."""
  audio_key = 'audio_key'
  input_key = 'oldkey'

  # Test input: a tf.train.Example with 2000 zero-valued float samples and,
  # when a sample-rate key is supplied, an int64 sample-rate value of 8000.
  example = tf.train.Example()
  feature_map = example.features.feature
  feature_map[audio_key].float_list.value.extend(np.zeros(2000, np.float32))
  if sample_rate_key:
    feature_map[sample_rate_key].int64_list.value.append(8000)

  def _feature_fn(x, s):
    """Computes frontend features and appends a trailing axis (float32)."""
    frontend_features = tf_frontend.compute_frontend_features(
        x, s, frame_hop=17)
    with_channel_axis = tf.expand_dims(frontend_features, axis=-1)
    return with_channel_axis.numpy().astype(np.float32)

  embedding_fn = beam_dofns.ComputeEmbeddingMapFn(
      name='module_name',
      module='file.tflite',
      output_key=0,
      audio_key=audio_key,
      sample_rate_key=sample_rate_key,
      sample_rate=sample_rate,
      average_over_time=average_over_time,
      feature_fn=_feature_fn,
      module_call_fn=_s2e,
      setup_fn=build_tflite_interpreter_dummy)
  embedding_fn.setup()
  output_key, embedding = next(embedding_fn.process((input_key, example)))

  # The key must pass through unchanged.
  self.assertEqual(output_key, input_key)

  # With time-averaging the expected shape is (1, BASE_SHAPE_[1]); otherwise
  # it is BASE_SHAPE_ itself.
  if average_over_time:
    expected_shape = (1, BASE_SHAPE_[1])
  else:
    expected_shape = BASE_SHAPE_
  self.assertEqual(embedding.shape, expected_shape)
def test_pipeline_padding(self, process_fn, chunk_len):
  """Check that the model input is of sufficient length."""
  key = 'key'
  example = make_tfexample(100)
  dofn_kwargs = {
      'name': 'name',
      'module': None,
      'output_key': ['output_key'],
      'audio_key': 'audio',
      'sample_rate_key': 'sample_rate',
      'sample_rate': None,
      'average_over_time': True,
      'model_input_min_length': 400,
      'setup_fn': lambda _: FakeMod(),
  }

  # Build the DoFn under test. The two chunking DoFns take the same extra
  # arguments on top of the shared ones.
  if process_fn == 'ComputeEmbeddingMapFn':
    dofn = beam_dofns.ComputeEmbeddingMapFn(**dofn_kwargs)
  elif process_fn in ('ComputeMultipleEmbeddings',
                      'ChunkAudioAndComputeEmbeddings'):
    chunked_kwargs = dict(
        dofn_kwargs, embedding_names=['em1'], chunk_len=chunk_len)
    if process_fn == 'ComputeMultipleEmbeddings':
      dofn = beam_dofns.ComputeMultipleEmbeddingsFromSingleModel(
          **chunked_kwargs)
    else:
      dofn = beam_dofns.ChunkAudioAndComputeEmbeddings(**chunked_kwargs)
  else:
    assert process_fn == 'ComputeBatchedChunkedSingleEmbeddings'
    dofn = beam_dofns.ComputeBatchedChunkedSingleEmbeddings(**dofn_kwargs)

  # Run only the preprocessing step; each DoFn exposes it under a different
  # method name and return signature.
  dofn.setup()
  if process_fn == 'ComputeEmbeddingMapFn':
    model_input, sample_rate = dofn.read_and_preprocess_audio(key, example)
    expected_output_shape = (400,)
  elif process_fn == 'ComputeBatchedChunkedSingleEmbeddings':
    model_input, _, sample_rate = dofn.read_and_preprocess_batched_audio(
        [key, key], [example, example])
    expected_output_shape = (2, 400)
  else:
    model_input, sample_rate = dofn.tfex_to_chunked_audio(key, example)
    if chunk_len:
      expected_output_shape = (2, chunk_len)
    else:
      expected_output_shape = (1, 400)

  # The original audio is too short, so it should have been padded to
  # `model_input_min_length`.
  self.assertEqual(model_input.shape, expected_output_shape)

  # A non-standard sample rate should trigger resampling, so the reported
  # output rate should be 16kHz.
  self.assertEqual(sample_rate, 16000)
def test_compute_embedding_dofn_custom_call(self, average_over_time,
                                            sample_rate_key, sample_rate):
  """Checks `ComputeEmbeddingMapFn` when given a custom `module_call_fn`."""
  audio_key = 'audio_key'
  input_key = 'oldkey'
  custom_call_shape = (5, 25)

  def fake_call_fn(audio_samples, sample_rate, module_location, output_key,
                   name):
    """Mock waveform-to-embedding computation."""
    # All inputs are ignored; the mock always returns zeros of a fixed shape.
    del audio_samples, sample_rate, module_location, output_key, name
    return np.zeros(custom_call_shape, dtype=np.float32)

  # Test input: a tf.train.Example with 2000 zero-valued float samples and,
  # when a sample-rate key is supplied, an int64 sample-rate value of 8000.
  example = tf.train.Example()
  feature_map = example.features.feature
  feature_map[audio_key].float_list.value.extend(np.zeros(2000, np.float32))
  if sample_rate_key:
    feature_map[sample_rate_key].int64_list.value.append(8000)

  embedding_fn = beam_dofns.ComputeEmbeddingMapFn(
      name='module_name',
      module='@loc',
      output_key='unnecessary',
      audio_key=audio_key,
      sample_rate_key=sample_rate_key,
      sample_rate=sample_rate,
      average_over_time=average_over_time,
      module_call_fn=fake_call_fn,
      setup_fn=lambda _: None)
  embedding_fn.setup()
  result_key, embedding = next(embedding_fn.process((input_key, example)))

  # The key must pass through unchanged.
  self.assertEqual(result_key, input_key)

  # With time-averaging the expected shape is (1, custom_call_shape[1]);
  # otherwise it is the mock's output shape itself.
  if average_over_time:
    expected_shape = (1, custom_call_shape[1])
  else:
    expected_shape = custom_call_shape
  self.assertEqual(embedding.shape, expected_shape)
def make_many_models_beam_pipeline(
    root,
    input_filenames,
    output_filename,
    sample_rate,
    debug,
    embedding_names,
    embedding_modules,
    module_output_keys,
    audio_key,
    sample_rate_key,
    label_key,
    speaker_id_key,
    average_over_time,
    delete_audio_from_output,
    split_embeddings_into_separate_tables=False,
    use_frontend_fn=False,
    normalize_to_pm_one=True,
    model_input_min_length=None,
    input_format='tfrecord',
    output_format='tfrecord',
    suffix='Main',
    module_call_fn=utils.samples_to_embedding_tfhub,
    setup_fn=hub.load):
  """Construct beam pipeline for mapping from audio to embeddings.

  Args:
    root: The beam root node.
    input_filenames: Python list. List of input files.
    output_filename: Python string. Output filename.
    sample_rate: Python int, or `None`. The sample rate for all embeddings, or
      `None` if this is a TFDS dataset, or if each example has its own sample
      rate.
    debug: Python bool. Whether to operate in debug mode.
    embedding_names: Python list of embedding names. Must have the same length
      as `embedding_modules` and `module_output_keys`.
    embedding_modules: Python list of TF-Hub modules, one per embedding name.
    module_output_keys: Python list of output keys, one per module. Each is
      passed as `output_key` to the embedding DoFn.
    audio_key: Python string, the key of the audio.
    sample_rate_key: Python string or `None`, the key of the sample rate.
    label_key: Python string. Field for label.
    speaker_id_key: Python string or `None`. Key for speaker ID, or `None`.
    average_over_time: Python bool. If `True`, average over the time axis.
    delete_audio_from_output: Python bool. Whether to remove audio from
      outputs.
    split_embeddings_into_separate_tables: Python bool. If true, write each
      embedding to a separate table, under a subdirectory of the output
      directory named after the embedding.
    use_frontend_fn: If `True`, call frontend fn on audio before passing to
      the model.
    normalize_to_pm_one: Whether to normalize input to +- 1 before passing to
      model.
    model_input_min_length: Min length to the model, or `None`. 0-pad inputs
      to this length, if necessary. Note that frontends usually contain their
      own length logic, unless the model is in TFLite format.
    input_format: Python string. Must correspond to a function in
      `reader_functions`.
    output_format: Python string. Must correspond to a function in
      `writer_functions`.
    suffix: Python string. Suffix to stage names to make them unique.
    module_call_fn: Function for inference on audio.
    setup_fn: Function for creating audio inference model.

  Raises:
    ValueError: If "tf_examples" is used as an embedding name, or if
      `embedding_names`, `embedding_modules` and `module_output_keys` do not
      all have the same length.
  """
  tf_examples_key_ = 'tf_examples'
  if tf_examples_key_ in embedding_names:
    raise ValueError(
        f'"{tf_examples_key_}" is reserved, cannot be embedding name.')

  # `zip` stops at the shortest input, so a length mismatch would silently
  # drop embeddings instead of failing. Reject it up front.
  num_names = len(embedding_names)
  num_modules = len(embedding_modules)
  num_out_keys = len(module_output_keys)
  if not num_names == num_modules == num_out_keys:
    raise ValueError(
        'embedding_names, embedding_modules and module_output_keys must have '
        f'the same length, got {num_names}, {num_modules} and '
        f'{num_out_keys}.')

  s = suffix  # for code brevity.

  # Read from input.
  input_examples = _common_pipeline_beginning(root, input_format,
                                              input_filenames, s, debug)

  # Compute all the embeddings simultaneously.
  embedding_tables = {}
  for name, mod, out_key in zip(embedding_names, embedding_modules,
                                module_output_keys):
    logging.info('Adding signal: %s %s, %s', name, mod, out_key)
    tbl = input_examples | f'ComputeEmbedding-{name}-{s}' >> beam.ParDo(
        beam_dofns.ComputeEmbeddingMapFn(
            name=name,
            module=mod,
            output_key=out_key,
            audio_key=audio_key,
            sample_rate_key=sample_rate_key,
            sample_rate=sample_rate,
            average_over_time=average_over_time,
            feature_fn=(utils.default_feature_fn if use_frontend_fn else None),
            normalize_to_pm_one=normalize_to_pm_one,
            model_input_min_length=model_input_min_length,
            module_call_fn=module_call_fn,
            setup_fn=setup_fn))
    embedding_tables[name] = tbl
  assert tf_examples_key_ not in embedding_tables
  embedding_tables[tf_examples_key_] = input_examples
  logging.info('embedding_tables: %s', embedding_tables)

  # Either write to one table with all embeddings, or one table per embedding.
  # Every output table also carries the original examples under the reserved
  # key so they can be joined with the embeddings below.
  if split_embeddings_into_separate_tables:
    output_table_dicts = [(k, {
        k: v,
        tf_examples_key_: input_examples
    }) for k, v in embedding_tables.items() if k != tf_examples_key_]
  else:
    output_table_dicts = [('all', embedding_tables)]

  # Combine embeddings and tf.train.Example, using the common key.
  writer_function = utils.writer_functions[output_format]
  # The loop variable is deliberately not called `embedding_tables`, so the
  # dict built above is not rebound.
  for name, tables in output_table_dicts:
    if split_embeddings_into_separate_tables:
      cur_s = f'{name}-{s}'
      # Add `name` as a subdir.
      dirname, basename = os.path.split(output_filename)
      cur_output_filename = os.path.join(dirname, name, f'{basename}@*')
    else:
      cur_s = s
      cur_output_filename = f'{output_filename}@*'
    combined_tbl = (
        tables
        | f'CombineEmbeddingTables-{cur_s}' >> beam.CoGroupByKey()
        | f'AddEmbeddings-{cur_s}' >> beam.Map(
            utils.add_embeddings_to_tfex,
            original_example_key=tf_examples_key_,
            delete_audio_from_output=delete_audio_from_output,
            audio_key=audio_key,
            label_key=label_key,
            speaker_id_key=speaker_id_key))
    logging.info('Writing to %s', cur_output_filename)
    writer_function(combined_tbl, cur_output_filename, cur_s)