Пример #1
0
def main(unused_argv):
    """Build and run one Beam embedding pipeline per input/output pair.

    Input locations, output names, and the sample rate are resolved from
    command-line flags and validated before any pipeline stage is added.

    Args:
      unused_argv: Positional command-line arguments; ignored.
    """
    beam_utils = audio_to_embeddings_beam_utils

    # Resolve the data location from flags. When reading a TFDS dataset this
    # covers train, validation, and test, hence lists rather than scalars.
    resolved = beam_utils.read_input_glob_and_sample_rate_from_flags(
        FLAGS.input_glob, FLAGS.sample_rate, FLAGS.tfds_dataset,
        FLAGS.output_filename)
    input_filenames_list, output_filenames, sample_rate = resolved

    # Check that inputs and flags are formatted correctly.
    beam_utils.validate_inputs(
        input_filenames_list, output_filenames, FLAGS.embedding_modules,
        FLAGS.embedding_names, FLAGS.module_output_keys)

    logging.info('Starting to create flume pipeline...')
    with beam.Pipeline() as root:
        io_pairs = zip(input_filenames_list, output_filenames)
        # Every pair is attached to the same root pipeline; the loop index is
        # forwarded as `suffix` so each pair gets a distinct value.
        for index, (input_filenames, output_filename) in enumerate(io_pairs):
            beam_utils.make_beam_pipeline(
                root, input_filenames, sample_rate, FLAGS.debug,
                FLAGS.embedding_names, FLAGS.embedding_modules,
                FLAGS.module_output_keys, FLAGS.audio_key,
                FLAGS.sample_rate_key, FLAGS.label_key, FLAGS.speaker_id_key,
                FLAGS.average_over_time, FLAGS.delete_audio_from_output,
                output_filename, suffix=index)
def main(unused_argv):
    """Build and run one Beam embedding pipeline per input/output pair.

    Input locations (including the TFDS data directory), output names, and
    the sample rate are resolved from command-line flags and validated before
    any pipeline stage is added. Both input and output use the 'tfrecord'
    format.

    Args:
      unused_argv: Positional command-line arguments; ignored.
    """
    beam_utils = audio_to_embeddings_beam_utils

    # Resolve the data location from flags. When reading a TFDS dataset this
    # covers train, validation, and test, hence lists rather than scalars.
    resolved = beam_utils.read_input_glob_and_sample_rate_from_flags(
        FLAGS.input_glob, FLAGS.sample_rate, FLAGS.tfds_dataset,
        FLAGS.output_filename, FLAGS.tfds_data_dir)
    input_filenames_list, output_filenames, sample_rate = resolved

    # Check that inputs and flags are formatted correctly.
    beam_utils.validate_inputs(
        input_filenames_list, output_filenames, FLAGS.embedding_modules,
        FLAGS.embedding_names, FLAGS.module_output_keys)

    input_format = 'tfrecord'
    output_format = 'tfrecord'

    # If you have custom beam options, add them here.
    beam_options = None

    logging.info('Starting to create flume pipeline...')
    with beam.Pipeline(beam_options) as root:
        io_pairs = zip(input_filenames_list, output_filenames)
        # Every pair is attached to the same root pipeline; the loop index is
        # forwarded as `suffix` so each pair gets a distinct value.
        for index, (inputs_or_glob, output_filename) in enumerate(io_pairs):
            # Bound to a local only to keep the call below within line length.
            split_tables = FLAGS.split_embeddings_into_separate_tables
            beam_utils.make_beam_pipeline(
                root, inputs_or_glob, sample_rate, FLAGS.debug,
                FLAGS.embedding_names, FLAGS.embedding_modules,
                FLAGS.module_output_keys, FLAGS.audio_key,
                FLAGS.sample_rate_key, FLAGS.label_key, FLAGS.speaker_id_key,
                FLAGS.average_over_time, FLAGS.delete_audio_from_output,
                output_filename,
                split_embeddings_into_separate_tables=split_tables,
                use_frontend_fn=FLAGS.use_frontend_fn,
                model_input_min_length=FLAGS.model_input_min_length,
                input_format=input_format,
                output_format=output_format,
                suffix=index)
Пример #3
0
def main(unused_argv):
  """Build and run a single Beam pipeline that embeds the flagged audio.

  Args:
    unused_argv: Positional command-line arguments; ignored.
  """
  beam_utils = audio_to_embeddings_beam_utils

  # Get input data location from flags.
  resolved = beam_utils.read_input_glob_and_sample_rate_from_flags(
      FLAGS.input_glob, FLAGS.sample_rate, FLAGS.tfds_dataset)
  input_filenames, sample_rate = resolved

  # Check that flags are formatted correctly.
  beam_utils.validate_inputs(
      FLAGS.output_filename,
      FLAGS.embedding_modules,
      FLAGS.embedding_names,
      FLAGS.module_output_keys)

  logging.info('Starting to create flume pipeline...')
  with beam.Pipeline() as root:
    beam_utils.make_beam_pipeline(
        root,
        input_filenames,
        sample_rate,
        FLAGS.debug,
        FLAGS.embedding_names,
        FLAGS.embedding_modules,
        FLAGS.module_output_keys,
        FLAGS.audio_key,
        FLAGS.sample_rate_key,
        FLAGS.label_key,
        FLAGS.speaker_id_key,
        FLAGS.average_over_time,
        FLAGS.delete_audio_from_output,
        FLAGS.output_filename)
def main(_):
    """Build and run one Beam embedding pipeline per input/output pair.

    All pipeline parameters come from `utils.get_beam_params_from_flags()`;
    the returned `beam_params` dict is forwarded verbatim as keyword
    arguments to every `make_beam_pipeline` call.

    Args:
      _: Positional command-line arguments; ignored.
    """
    input_filenames_list, output_filenames, beam_params = (
        utils.get_beam_params_from_flags())

    # Check that inputs and flags are formatted correctly.
    utils.validate_inputs(
        input_filenames_list=input_filenames_list,
        output_filenames=output_filenames,
        embedding_modules=beam_params['embedding_modules'],
        embedding_names=beam_params['embedding_names'],
        module_output_keys=beam_params['module_output_keys'])

    # If you have custom beam options, add them here.
    beam_options = None

    logging.info('Starting to create flume pipeline...')
    with beam.Pipeline(beam_options) as root:
        io_pairs = zip(input_filenames_list, output_filenames)
        # Every pair is attached to the same root pipeline; the stringified
        # loop index is forwarded as `suffix` so each pair gets its own value.
        for index, (inputs_or_glob, output_filename) in enumerate(io_pairs):
            utils.make_beam_pipeline(
                root,
                input_filenames=inputs_or_glob,
                output_filename=output_filename,
                suffix=str(index),
                **beam_params)
def main(unused_argv):
  """Run embedding data prep, then train and score sklearn models on it.

  The function has two stages:
    1. Data prep: one Beam pipeline computes embeddings for each
       input/output pair resolved from flags.
    2. Eval: a second Beam pipeline calls
       `sklearn_utils.train_and_get_score` for every experiment
       configuration and writes the formatted results to
       `FLAGS.results_output_file` as a single shard.

  Args:
    unused_argv: Positional command-line arguments; ignored.

  Raises:
    ValueError: If `train_input_glob` is set without the validation or test
      glob, if no inputs or outputs are resolved, if data prep does not yield
      exactly three outputs, or if input validation fails while
      `skip_existing_error` is unset.
  """
  # Data prep setup.
  run_data_prep = True
  if FLAGS.train_input_glob:
    # Per-split globs must be supplied together. Raise instead of `assert`
    # so the check is not stripped when Python runs with `-O`.
    if not FLAGS.validation_input_glob:
      raise ValueError(
          '`validation_input_glob` is required when `train_input_glob` is '
          'set.')
    if not FLAGS.test_input_glob:
      raise ValueError(
          '`test_input_glob` is required when `train_input_glob` is set.')
    input_filenames_list, output_filenames = [], []
    split_globs = (
        FLAGS.train_input_glob,
        FLAGS.validation_input_glob,
        FLAGS.test_input_glob,
    )
    for input_glob in split_globs:
      # `get_beam_params_from_flags` takes no arguments, so the current split
      # is handed over through the `input_glob` flag (presumably read inside
      # the helper). `beam_params` from the final iteration is the one used
      # below.
      FLAGS.input_glob = input_glob
      cur_inputs, cur_outputs, beam_params = (
          data_prep_utils.get_beam_params_from_flags())
      input_filenames_list.extend(cur_inputs)
      output_filenames.extend(cur_outputs)
  else:
    input_filenames_list, output_filenames, beam_params = (
        data_prep_utils.get_beam_params_from_flags())
  if not input_filenames_list:
    raise ValueError(f'No data prep inputs found: {input_filenames_list}')
  if not output_filenames:
    raise ValueError(f'No data prep outputs found: {output_filenames}')
  try:
    # Check that inputs and flags are formatted correctly.
    data_prep_utils.validate_inputs(
        input_filenames_list, output_filenames,
        beam_params['embedding_modules'], beam_params['embedding_names'],
        beam_params['module_output_keys'])
  except ValueError as e:
    if FLAGS.skip_existing_error:
      # NOTE(review): the flag name suggests this error means the outputs
      # already exist -- confirm against `validate_inputs`. Data prep is
      # skipped and evaluation runs on whatever is already on disk.
      logging.warning('Input validation failed; skipping data prep: %s', e)
      run_data_prep = False
    else:
      raise
  logging.info('beam_params: %s', beam_params)

  # Generate sklearn eval experiment parameters based on data prep flags.
  if len(output_filenames) != 3:
    raise ValueError(f'Data prep output must be 3 files: {output_filenames}')
  # Make them globs.
  train_glob, eval_glob, test_glob = [f'{x}*' for x in output_filenames]
  sklearn_results_output_file = FLAGS.results_output_file
  exp_params = sklearn_utils.experiment_params(
      embedding_list=beam_params['embedding_names'],
      speaker_id_name=FLAGS.speaker_id_key,
      label_name=FLAGS.label_key,
      label_list=FLAGS.label_list,
      train_glob=train_glob,
      eval_glob=eval_glob,
      test_glob=test_glob,
      save_model_dir=None,
      save_predictions_dir=None,
      eval_metric=FLAGS.eval_metric,
  )
  logging.info('exp_params: %s', exp_params)

  # Make and run beam pipeline.
  beam_options = None

  if run_data_prep:
    logging.info('Data prep on: %s, %s...', input_filenames_list,
                 output_filenames)
    with beam.Pipeline(beam_options) as root:
      for i, (input_filenames_or_glob, output_filename) in enumerate(
          zip(input_filenames_list, output_filenames)):
        data_prep_utils.make_beam_pipeline(
            root,
            input_filenames=input_filenames_or_glob,
            output_filename=output_filename,
            suffix=str(i),
            **beam_params)

  # Check that previous beam pipeline wrote outputs.
  sklearn_utils.validate_flags(train_glob, eval_glob, test_glob,
                               sklearn_results_output_file)
  logging.info('Eval sklearn...')
  with beam.Pipeline(beam_options) as root:
    _ = (
        root
        | 'MakeCollection' >> beam.Create(exp_params)
        # Each element is a kwargs dict; keep it next to its score so the
        # formatter receives both.
        | 'CalcScores' >> beam.Map(
            lambda d: (d, sklearn_utils.train_and_get_score(**d)))
        | 'FormatText' >> beam.Map(sklearn_utils.format_text_line)
        | 'Reshuffle' >> beam.Reshuffle()
        | 'WriteOutput' >> beam.io.WriteToText(
            sklearn_results_output_file, num_shards=1))