def main(unused_argv):
  """Builds and runs one embedding Beam sub-pipeline per output table."""
  prep = audio_to_embeddings_beam_utils
  # Resolve input/output locations from flags. A TFDS dataset expands to its
  # train, validation, and test splits.
  input_filenames_list, output_filenames, sample_rate = (
      prep.read_input_glob_and_sample_rate_from_flags(
          FLAGS.input_glob, FLAGS.sample_rate, FLAGS.tfds_dataset,
          FLAGS.output_filename))
  # Fail fast on malformed flag combinations.
  prep.validate_inputs(input_filenames_list, output_filenames,
                       FLAGS.embedding_modules, FLAGS.embedding_names,
                       FLAGS.module_output_keys)
  logging.info('Starting to create flume pipeline...')
  with beam.Pipeline() as root:
    file_pairs = zip(input_filenames_list, output_filenames)
    for pair_idx, (cur_inputs, cur_output) in enumerate(file_pairs):
      prep.make_beam_pipeline(
          root,
          cur_inputs,
          sample_rate,
          FLAGS.debug,
          FLAGS.embedding_names,
          FLAGS.embedding_modules,
          FLAGS.module_output_keys,
          FLAGS.audio_key,
          FLAGS.sample_rate_key,
          FLAGS.label_key,
          FLAGS.speaker_id_key,
          FLAGS.average_over_time,
          FLAGS.delete_audio_from_output,
          cur_output,
          suffix=pair_idx)
def main(unused_argv):
  """Embeds audio into TFRecord tables, one Beam sub-pipeline per output."""
  prep = audio_to_embeddings_beam_utils
  # Resolve input/output locations from flags. A TFDS dataset expands to its
  # train, validation, and test splits.
  input_filenames_list, output_filenames, sample_rate = (
      prep.read_input_glob_and_sample_rate_from_flags(
          FLAGS.input_glob, FLAGS.sample_rate, FLAGS.tfds_dataset,
          FLAGS.output_filename, FLAGS.tfds_data_dir))
  # Fail fast on malformed flag combinations.
  prep.validate_inputs(input_filenames_list, output_filenames,
                       FLAGS.embedding_modules, FLAGS.embedding_names,
                       FLAGS.module_output_keys)
  input_format = 'tfrecord'
  output_format = 'tfrecord'
  # If you have custom beam options, add them here.
  beam_options = None
  logging.info('Starting to create flume pipeline...')
  with beam.Pipeline(beam_options) as root:
    file_pairs = zip(input_filenames_list, output_filenames)
    for pair_idx, (cur_inputs, cur_output) in enumerate(file_pairs):
      prep.make_beam_pipeline(
          root,
          cur_inputs,
          sample_rate,
          FLAGS.debug,
          FLAGS.embedding_names,
          FLAGS.embedding_modules,
          FLAGS.module_output_keys,
          FLAGS.audio_key,
          FLAGS.sample_rate_key,
          FLAGS.label_key,
          FLAGS.speaker_id_key,
          FLAGS.average_over_time,
          FLAGS.delete_audio_from_output,
          cur_output,
          split_embeddings_into_separate_tables=(
              FLAGS.split_embeddings_into_separate_tables),
          use_frontend_fn=FLAGS.use_frontend_fn,
          model_input_min_length=FLAGS.model_input_min_length,
          input_format=input_format,
          output_format=output_format,
          suffix=pair_idx)
def main(unused_argv):
  """Embeds audio via a single Beam pipeline configured entirely by flags."""
  prep = audio_to_embeddings_beam_utils
  # Resolve the input location and sample rate from flags.
  input_filenames, sample_rate = (
      prep.read_input_glob_and_sample_rate_from_flags(
          FLAGS.input_glob, FLAGS.sample_rate, FLAGS.tfds_dataset))
  # Fail fast on malformed flag combinations.
  prep.validate_inputs(FLAGS.output_filename, FLAGS.embedding_modules,
                       FLAGS.embedding_names, FLAGS.module_output_keys)
  logging.info('Starting to create flume pipeline...')
  with beam.Pipeline() as root:
    prep.make_beam_pipeline(
        root,
        input_filenames,
        sample_rate,
        FLAGS.debug,
        FLAGS.embedding_names,
        FLAGS.embedding_modules,
        FLAGS.module_output_keys,
        FLAGS.audio_key,
        FLAGS.sample_rate_key,
        FLAGS.label_key,
        FLAGS.speaker_id_key,
        FLAGS.average_over_time,
        FLAGS.delete_audio_from_output,
        FLAGS.output_filename)
def main(_):
  """Runs one data-prep Beam sub-pipeline per (input, output) pair."""
  input_filenames_list, output_filenames, beam_params = (
      utils.get_beam_params_from_flags())
  # Fail fast on malformed flag combinations.
  utils.validate_inputs(
      input_filenames_list=input_filenames_list,
      output_filenames=output_filenames,
      embedding_modules=beam_params['embedding_modules'],
      embedding_names=beam_params['embedding_names'],
      module_output_keys=beam_params['module_output_keys'])
  # If you have custom beam options, add them here.
  beam_options = None
  logging.info('Starting to create flume pipeline...')
  with beam.Pipeline(beam_options) as root:
    file_pairs = zip(input_filenames_list, output_filenames)
    for pair_idx, (cur_inputs, cur_output) in enumerate(file_pairs):
      utils.make_beam_pipeline(
          root,
          input_filenames=cur_inputs,
          output_filename=cur_output,
          suffix=str(pair_idx),
          **beam_params)
def main(unused_argv):
  """Runs data prep (unless skipped), then a sklearn eval, as Beam jobs."""
  # Data prep setup.
  should_run_data_prep = True

  # Gather (input, output) file lists: either a single glob from flags, or
  # explicit train/validation/test globs processed one split at a time.
  if FLAGS.train_input_glob:
    assert FLAGS.validation_input_glob
    assert FLAGS.test_input_glob
    input_filenames_list, output_filenames = [], []
    split_globs = (FLAGS.train_input_glob, FLAGS.validation_input_glob,
                   FLAGS.test_input_glob)
    for cur_glob in split_globs:
      # get_beam_params_from_flags reads FLAGS, so point FLAGS.input_glob at
      # the current split before each call.
      FLAGS.input_glob = cur_glob
      cur_inputs, cur_outputs, beam_params = (
          data_prep_utils.get_beam_params_from_flags())
      input_filenames_list.extend(cur_inputs)
      output_filenames.extend(cur_outputs)
  else:
    input_filenames_list, output_filenames, beam_params = (
        data_prep_utils.get_beam_params_from_flags())
  assert input_filenames_list, input_filenames_list
  assert output_filenames, output_filenames

  try:
    # Check that inputs and flags are formatted correctly.
    data_prep_utils.validate_inputs(input_filenames_list, output_filenames,
                                    beam_params['embedding_modules'],
                                    beam_params['embedding_names'],
                                    beam_params['module_output_keys'])
  except ValueError:
    # Validation failure can mean outputs already exist; optionally skip the
    # data-prep stage instead of failing, per flag.
    if FLAGS.skip_existing_error:
      should_run_data_prep = False
    else:
      raise
  logging.info('beam_params: %s', beam_params)

  # Generate sklearn eval experiment parameters based on data prep flags.
  if len(output_filenames) != 3:
    raise ValueError(f'Data prep output must be 3 files: {output_filenames}')
  # Make them globs.
  train_glob, eval_glob, test_glob = (f'{fn}*' for fn in output_filenames)
  sklearn_results_output_file = FLAGS.results_output_file
  exp_params = sklearn_utils.experiment_params(
      embedding_list=beam_params['embedding_names'],
      speaker_id_name=FLAGS.speaker_id_key,
      label_name=FLAGS.label_key,
      label_list=FLAGS.label_list,
      train_glob=train_glob,
      eval_glob=eval_glob,
      test_glob=test_glob,
      save_model_dir=None,
      save_predictions_dir=None,
      eval_metric=FLAGS.eval_metric,
  )
  logging.info('exp_params: %s', exp_params)

  # Make and run beam pipeline.
  beam_options = None
  if should_run_data_prep:
    logging.info('Data prep on: %s, %s...', input_filenames_list,
                 output_filenames)
    with beam.Pipeline(beam_options) as root:
      file_pairs = zip(input_filenames_list, output_filenames)
      for split_idx, (cur_inputs, cur_output) in enumerate(file_pairs):
        data_prep_utils.make_beam_pipeline(
            root,
            input_filenames=cur_inputs,
            output_filename=cur_output,
            suffix=str(split_idx),
            **beam_params)

  # Check that previous beam pipeline wrote outputs.
  sklearn_utils.validate_flags(train_glob, eval_glob, test_glob,
                               sklearn_results_output_file)
  logging.info('Eval sklearn...')
  with beam.Pipeline(beam_options) as root:
    _ = (
        root
        | 'MakeCollection' >> beam.Create(exp_params)
        | 'CalcScores' >> beam.Map(
            lambda d: (d, sklearn_utils.train_and_get_score(**d)))
        | 'FormatText' >> beam.Map(sklearn_utils.format_text_line)
        | 'Reshuffle' >> beam.Reshuffle()
        | 'WriteOutput' >> beam.io.WriteToText(
            sklearn_results_output_file, num_shards=1))