def _train_and_get_score(
    train_glob,
    eval_glob,
    test_glob,
    embedding_name,
    label_name,
    speaker_id,
):
    """Wrapper for `train_and_get_score` that uses defaults."""
    logging.info('Running `train_and_get_score` with %s, %s, %s, %s, %s',
                 embedding_name, label_name, FLAGS.model_name,
                 FLAGS.l2_normalization, speaker_id)
    score_dict = train_and_eval_sklearn.train_and_get_score(
        embedding_name=embedding_name,
        label_name=label_name,
        label_list=FLAGS.label_list,
        train_glob=train_glob,
        eval_glob=eval_glob,
        test_glob=test_glob,
        model_name=FLAGS.model_name,
        l2_normalization=FLAGS.l2_normalization,
        speaker_id_name=speaker_id,
        eval_metrics=[FLAGS.eval_metric])
    assert len(score_dict) == 1, score_dict
    eval_score, test_score = list(score_dict.values())[0]
    return (eval_score, test_score)
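
# A minimal usage sketch (not from the original project) of the wrapper above.
# The globs, embedding name, and label name are hypothetical placeholders, and
# the absl flags it reads (e.g. FLAGS.model_name) must already be parsed.
eval_score, test_score = _train_and_get_score(
    train_glob='/tmp/embeddings/train*',
    eval_glob='/tmp/embeddings/eval*',
    test_glob='/tmp/embeddings/test*',
    embedding_name='trill',
    label_name='label',
    speaker_id=None)
logging.info('eval=%f, test=%f', eval_score, test_score)
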
def main(unused_argv):
    # Validate flags and setup directories.
    utils.validate_flags(FLAGS.train_glob, FLAGS.eval_glob, FLAGS.test_glob,
                         FLAGS.output_file)

    # Generate experiment parameters based on flags.
    exp_params = utils.experiment_params(
        FLAGS.embedding_list,
        FLAGS.speaker_id_name,
        FLAGS.label_name,
        FLAGS.label_list,
        FLAGS.train_glob,
        FLAGS.eval_glob,
        FLAGS.test_glob,
        FLAGS.save_model_dir,
        FLAGS.save_predictions_dir,
        FLAGS.eval_metric,
    )

    # Make and run beam pipeline.
    beam_options = None

    logging.info('Starting to create flume pipeline...')
    with beam.Pipeline(beam_options) as root:
        _ = (root
             | 'MakeCollection' >> beam.Create(exp_params)
             | 'CalcScores' >> beam.Map(lambda d:
                                        (d, utils.train_and_get_score(**d)))
             | 'FormatText' >> beam.Map(utils.format_text_line)
             | 'Reshuffle' >> beam.Reshuffle()
             | 'WriteOutput' >> beam.io.WriteToText(FLAGS.output_file,
                                                    num_shards=1))
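
# The `main` above reads absl flags defined at module level. A partial, hedged
# sketch of what those declarations might look like (flag names are taken from
# the FLAGS references in the code; types, defaults, and help strings are
# assumptions):
from absl import app
from absl import flags

flags.DEFINE_string('train_glob', None, 'Glob of training embedding files.')
flags.DEFINE_string('eval_glob', None, 'Glob of eval embedding files.')
flags.DEFINE_string('test_glob', None, 'Glob of test embedding files.')
flags.DEFINE_string('output_file', None, 'Where to write the results file.')
flags.DEFINE_list('embedding_list', None, 'Embedding names to evaluate.')
flags.DEFINE_string('label_name', None, 'Name of the label field.')
flags.DEFINE_list('label_list', None, 'Possible label values.')
flags.DEFINE_string('speaker_id_name', None, 'Optional speaker-id field.')
flags.DEFINE_string('eval_metric', 'accuracy', 'Metric reported by the eval.')
FLAGS = flags.FLAGS

if __name__ == '__main__':
    app.run(main)
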
def main(unused_argv):

    # Data prep setup.
    (prep_params, input_filenames_list, output_filenames,
     run_data_prep) = _get_data_prep_params_from_flags()
    logging.info('beam_params: %s', prep_params)

    # Generate sklearn eval experiment parameters based on data prep flags.
    # Make (data_prep outputs / eval input filenames) globs.
    train_glob, eval_glob, test_glob = [f'{x}*' for x in output_filenames]
    sklearn_results_output_file = FLAGS.results_output_file
    exp_params = sklearn_utils.experiment_params(
        train_glob=train_glob,
        eval_glob=eval_glob,
        test_glob=test_glob,
        embedding_list=prep_params['embedding_names'],
        speaker_id_name=FLAGS.speaker_id_key,
        label_name=FLAGS.label_key,
        label_list=FLAGS.label_list,
        save_model_dir=FLAGS.save_model_dir,
        save_predictions_dir=FLAGS.save_predictions_dir,
        eval_metrics=FLAGS.eval_metrics,
    )
    logging.info('exp_params: %s', exp_params)

    # Make and run beam pipeline.
    beam_options = None

    if run_data_prep:
        input_filenames_list, output_filenames = _remove_existing_outputs(
            input_filenames_list, output_filenames)
        logging.info('Data prep on: %s, %s...', input_filenames_list,
                     output_filenames)
        with beam.Pipeline(beam_options) as root:
            for i, (input_filenames_or_glob, output_filename) in enumerate(
                    zip(input_filenames_list, output_filenames)):
                utils.data_prep_pipeline(
                    root=root,
                    input_filenames_or_glob=input_filenames_or_glob,
                    output_filename=output_filename,
                    data_prep_behavior=FLAGS.data_prep_behavior,
                    beam_params=prep_params,
                    suffix=str(i))

    # Check that previous beam pipeline wrote outputs.
    sklearn_utils.validate_flags(train_glob, eval_glob, test_glob,
                                 sklearn_results_output_file)
    logging.info('Eval sklearn...')
    with beam.Pipeline(beam_options) as root:
        _ = (root
             | 'MakeCollection' >> beam.Create(exp_params)
             | 'CalcScores' >>
             beam.Map(lambda d: (d, sklearn_utils.train_and_get_score(**d)))
             | 'FormatText' >> beam.Map(sklearn_utils.format_text_line)
             | 'Reshuffle' >> beam.Reshuffle()
             | 'WriteOutput' >> beam.io.WriteToText(
                 sklearn_results_output_file, num_shards=1))
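
# Illustration of the glob construction used above, with hypothetical
# filenames. Beam's sharded writers append suffixes such as
# '-00000-of-00016', so a trailing '*' matches every shard of a split.
output_filenames = ['/tmp/prep/train.tfrecord',
                    '/tmp/prep/eval.tfrecord',
                    '/tmp/prep/test.tfrecord']
train_glob, eval_glob, test_glob = [f'{x}*' for x in output_filenames]
# train_glob is now '/tmp/prep/train.tfrecord*'.
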
def main(unused_argv):
    assert file_utils.Glob(FLAGS.train_glob), FLAGS.train_glob
    assert file_utils.Glob(FLAGS.eval_glob), FLAGS.eval_glob
    assert file_utils.Glob(FLAGS.test_glob), FLAGS.test_glob

    # Create output directory if it doesn't already exist.
    outdir = os.path.dirname(FLAGS.output_file)
    file_utils.MaybeMakeDirs(outdir)

    # Enumerate the configurations we want to run.
    exp_params = []
    model_names = models.get_sklearn_models().keys()
    for elem in itertools.product(FLAGS.embedding_list, model_names):

        def _params_dict(l2_normalization,
                         speaker_id_name=FLAGS.speaker_id_name,
                         elem=elem):
            # Default arguments capture the current loop value of `elem` (and
            # the speaker-id flag) so each generated config is independent.
            return {
                'embedding_name': elem[0],
                'model_name': elem[1],
                'label_name': FLAGS.label_name,
                'label_list': FLAGS.label_list,
                'train_glob': FLAGS.train_glob,
                'eval_glob': FLAGS.eval_glob,
                'test_glob': FLAGS.test_glob,
                'l2_normalization': l2_normalization,
                'speaker_id_name': speaker_id_name,
                'save_model_dir': FLAGS.save_model_dir,
                'save_predictions_dir': FLAGS.save_predictions_dir,
                'eval_metric': FLAGS.eval_metric,
            }

        exp_params.append(_params_dict(l2_normalization=True))
        exp_params.append(_params_dict(l2_normalization=False))
        if FLAGS.speaker_id_name is not None:
            exp_params.append(
                _params_dict(l2_normalization=True, speaker_id_name=None))
            exp_params.append(
                _params_dict(l2_normalization=False, speaker_id_name=None))

    # Make and run beam pipeline.
    beam_options = None

    logging.info('Starting to create flume pipeline...')
    with beam.Pipeline(beam_options) as root:
        _ = (root
             | 'MakeCollection' >> beam.Create(exp_params)
             | 'CalcScores' >>
             beam.Map(lambda d:
                      (d, train_and_eval_sklearn.train_and_get_score(**d)))
             | 'FormatText' >> beam.Map(format_text_line)
             | 'Reshuffle' >> beam.Reshuffle()
             | 'WriteOutput' >> beam.io.WriteToText(FLAGS.output_file,
                                                    num_shards=1))
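
# A small worked example (hypothetical counts) of how many configurations the
# loop above enumerates: two l2_normalization settings per (embedding, model)
# pair, doubled again when FLAGS.speaker_id_name is set, since every config is
# also run without speaker normalization.
num_embeddings, num_models, has_speaker_id = 3, 4, True
print(num_embeddings * num_models * (4 if has_speaker_id else 2))  # prints 48
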
def main(unused_argv):
    assert file_utils.Glob(FLAGS.train_glob), FLAGS.train_glob
    assert file_utils.Glob(FLAGS.eval_glob), FLAGS.eval_glob
    assert file_utils.Glob(FLAGS.test_glob), FLAGS.test_glob

    # Create output directory if it doesn't already exist.
    outdir = os.path.dirname(FLAGS.output_file)
    file_utils.MaybeMakeDirs(outdir)

    # Enumerate the configurations we want to run.
    exp_params = []
    model_names = models.get_sklearn_models().keys()
    for elem in itertools.product(FLAGS.embedding_list, model_names):
        exp_params.append({
            'embedding_name': elem[0],
            'model_name': elem[1],
            'label_name': FLAGS.label_name,
            'label_list': FLAGS.label_list,
            'train_glob': FLAGS.train_glob,
            'eval_glob': FLAGS.eval_glob,
            'test_glob': FLAGS.test_glob,
            # Either L2 normalization or speaker normalization. You could try
            # both if you wanted.
            'l2_normalization': FLAGS.speaker_id_name is None,
            'speaker_id_name': FLAGS.speaker_id_name,
            'save_model_dir': FLAGS.save_model_dir,
            'calculate_equal_error_rate': FLAGS.calculate_equal_error_rate,
        })

    # Make and run beam pipeline.
    beam_options = None

    logging.info('Starting to create flume pipeline...')
    with beam.Pipeline(beam_options) as root:
        _ = (root
             | 'MakeCollection' >> beam.Create(exp_params)
             | 'CalcScores' >>
             beam.Map(lambda d:
                      (d, train_and_eval_sklearn.train_and_get_score(**d)))
             | 'FormatText' >> beam.Map(format_text_line)
             | 'Reshuffle' >> beam.Reshuffle()
             | 'WriteOutput' >> beam.io.WriteToText(FLAGS.output_file,
                                                    num_shards=1))
def main(unused_argv):
    assert file_utils.Glob(FLAGS.train_glob), FLAGS.train_glob
    assert file_utils.Glob(FLAGS.eval_glob), FLAGS.eval_glob
    assert file_utils.Glob(FLAGS.test_glob), FLAGS.test_glob

    # Create output directory if it doesn't already exist.
    outdir = os.path.dirname(FLAGS.output_file)
    file_utils.MaybeMakeDirs(outdir)

    # Enumerate the configurations we want to run.
    exp_params = []
    model_names = models.get_sklearn_models().keys()
    for elem in itertools.product(FLAGS.embedding_list, model_names):
        exp_params.append({
            'embedding_name': elem[0],
            'model_name': elem[1],
            'label_name': FLAGS.label_name,
            'label_list': FLAGS.label_list,
            'train_glob': FLAGS.train_glob,
            'eval_glob': FLAGS.eval_glob,
            'test_glob': FLAGS.test_glob,
            # Either L2 normalization or speaker normalization. You could try both
            # if you wanted.
            'l2_normalization': FLAGS.speaker_id_name is None,
            'speaker_id_name': FLAGS.speaker_id_name,
        })

    # Make and run beam pipeline.
    p = beam.Pipeline()
    _ = (p
         | 'MakeCollection' >> beam.Create(exp_params)
         | 'CalcScores' >> beam.Map(
             lambda d: (d, train_and_eval_sklearn.train_and_get_score(**d)))
         | 'FormatText' >> beam.Map(format_text_line)
         | 'WriteOutput' >> beam.io.WriteToText(FLAGS.output_file,
                                                num_shards=1))
    result = p.run()
    result.wait_until_finish()
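
# The same pipeline written with Beam's context manager, as in the other
# examples above; leaving the `with` block runs the pipeline and waits for it
# to finish, so explicit run()/wait_until_finish() calls are unnecessary.
# (Sketch only; exp_params and format_text_line come from the example above.)
with beam.Pipeline() as p:
    _ = (p
         | 'MakeCollection' >> beam.Create(exp_params)
         | 'CalcScores' >> beam.Map(
             lambda d: (d, train_and_eval_sklearn.train_and_get_score(**d)))
         | 'FormatText' >> beam.Map(format_text_line)
         | 'WriteOutput' >> beam.io.WriteToText(FLAGS.output_file,
                                                num_shards=1))
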
def main(unused_argv):

  # Data prep setup.
  run_data_prep = True
  if FLAGS.train_input_glob:
    assert FLAGS.validation_input_glob
    assert FLAGS.test_input_glob
    input_filenames_list, output_filenames = [], []
    for input_glob in [
        FLAGS.train_input_glob, FLAGS.validation_input_glob,
        FLAGS.test_input_glob,
    ]:
      FLAGS.input_glob = input_glob
      (cur_inputs, cur_outputs,
       beam_params) = data_prep_utils.get_beam_params_from_flags()
      input_filenames_list.extend(cur_inputs)
      output_filenames.extend(cur_outputs)
  else:
    (input_filenames_list, output_filenames,
     beam_params) = data_prep_utils.get_beam_params_from_flags()
  assert input_filenames_list, input_filenames_list
  assert output_filenames, output_filenames
  try:
    # Check that inputs and flags are formatted correctly.
    data_prep_utils.validate_inputs(
        input_filenames_list, output_filenames,
        beam_params['embedding_modules'], beam_params['embedding_names'],
        beam_params['module_output_keys'])
  except ValueError:
    if FLAGS.skip_existing_error:
      run_data_prep = False
    else:
      raise
  logging.info('beam_params: %s', beam_params)

  # Generate sklearn eval experiment parameters based on data prep flags.
  if len(output_filenames) != 3:
    raise ValueError(f'Data prep output must be 3 files: {output_filenames}')
  # Make them globs.
  train_glob, eval_glob, test_glob = [f'{x}*' for x in output_filenames]
  sklearn_results_output_file = FLAGS.results_output_file
  exp_params = sklearn_utils.experiment_params(
      embedding_list=beam_params['embedding_names'],
      speaker_id_name=FLAGS.speaker_id_key,
      label_name=FLAGS.label_key,
      label_list=FLAGS.label_list,
      train_glob=train_glob,
      eval_glob=eval_glob,
      test_glob=test_glob,
      save_model_dir=None,
      save_predictions_dir=None,
      eval_metric=FLAGS.eval_metric,
  )
  logging.info('exp_params: %s', exp_params)

  # Make and run beam pipeline.
  beam_options = None

  if run_data_prep:
    logging.info('Data prep on: %s, %s...', input_filenames_list,
                 output_filenames)
    with beam.Pipeline(beam_options) as root:
      for i, (input_filenames_or_glob, output_filename) in enumerate(
          zip(input_filenames_list, output_filenames)):
        data_prep_utils.make_beam_pipeline(
            root,
            input_filenames=input_filenames_or_glob,
            output_filename=output_filename,
            suffix=str(i),
            **beam_params)

  # Check that previous beam pipeline wrote outputs.
  sklearn_utils.validate_flags(train_glob, eval_glob, test_glob,
                               sklearn_results_output_file)
  logging.info('Eval sklearn...')
  with beam.Pipeline(beam_options) as root:
    _ = (
        root
        | 'MakeCollection' >> beam.Create(exp_params)
        | 'CalcScores' >> beam.Map(
            lambda d: (d, sklearn_utils.train_and_get_score(**d)))
        | 'FormatText' >> beam.Map(sklearn_utils.format_text_line)
        | 'Reshuffle' >> beam.Reshuffle()
        | 'WriteOutput' >> beam.io.WriteToText(
            sklearn_results_output_file, num_shards=1))
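
# Every example above leaves `beam_options = None`, which falls back to Beam's
# defaults (typically the local DirectRunner). A hedged sketch of passing
# explicit pipeline options instead:
from apache_beam.options.pipeline_options import PipelineOptions

beam_options = PipelineOptions(
    ['--runner=DirectRunner', '--direct_num_workers=4'])
# ... then construct the pipelines as `beam.Pipeline(beam_options)`.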