def _train_and_get_score(
    train_glob,
    eval_glob,
    test_glob,
    embedding_name,
    label_name,
    speaker_id,
):
  """Wrapper for `train_and_get_score` that uses defaults."""
  logging.info('Running `train_and_get_score` with %s, %s, %s, %s, %s',
               embedding_name, label_name, FLAGS.model_name,
               FLAGS.l2_normalization, speaker_id)
  score_dict = train_and_eval_sklearn.train_and_get_score(
      embedding_name=embedding_name,
      label_name=label_name,
      label_list=FLAGS.label_list,
      train_glob=train_glob,
      eval_glob=eval_glob,
      test_glob=test_glob,
      model_name=FLAGS.model_name,
      l2_normalization=FLAGS.l2_normalization,
      speaker_id_name=speaker_id,
      eval_metrics=[FLAGS.eval_metric])
  assert len(score_dict) == 1, score_dict
  eval_score, test_score = list(score_dict.values())[0]
  return (eval_score, test_score)
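# NOTE: Judging from the assert and unpack above, `train_and_get_score` is
# expected to return a single-entry mapping from the requested metric name to
# an (eval_score, test_score) pair. An illustrative, made-up shape, assuming
# eval_metrics=['balanced_accuracy']:
#
#   score_dict = {'balanced_accuracy': (0.8123, 0.7987)}
#   eval_score, test_score = list(score_dict.values())[0]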
def main(unused_argv):
  # Validate flags and set up directories.
  utils.validate_flags(FLAGS.train_glob, FLAGS.eval_glob, FLAGS.test_glob,
                       FLAGS.output_file)

  # Generate experiment parameters based on flags.
  exp_params = utils.experiment_params(
      FLAGS.embedding_list,
      FLAGS.speaker_id_name,
      FLAGS.label_name,
      FLAGS.label_list,
      FLAGS.train_glob,
      FLAGS.eval_glob,
      FLAGS.test_glob,
      FLAGS.save_model_dir,
      FLAGS.save_predictions_dir,
      FLAGS.eval_metric,
  )

  # Make and run beam pipeline.
  beam_options = None
  logging.info('Starting to create flume pipeline...')
  with beam.Pipeline(beam_options) as root:
    _ = (root
         | 'MakeCollection' >> beam.Create(exp_params)
         | 'CalcScores' >> beam.Map(
             lambda d: (d, utils.train_and_get_score(**d)))
         | 'FormatText' >> beam.Map(utils.format_text_line)
         | 'Reshuffle' >> beam.Reshuffle()
         | 'WriteOutput' >> beam.io.WriteToText(FLAGS.output_file,
                                                num_shards=1))
def main(unused_argv):
  # Data prep setup.
  prep_params, input_filenames_list, output_filenames, run_data_prep = (
      _get_data_prep_params_from_flags())
  logging.info('beam_params: %s', prep_params)

  # Generate sklearn eval experiment parameters based on data prep flags.
  # Make (data_prep outputs / eval input filenames) globs.
  train_glob, eval_glob, test_glob = [f'{x}*' for x in output_filenames]
  sklearn_results_output_file = FLAGS.results_output_file
  exp_params = sklearn_utils.experiment_params(
      train_glob=train_glob,
      eval_glob=eval_glob,
      test_glob=test_glob,
      embedding_list=prep_params['embedding_names'],
      speaker_id_name=FLAGS.speaker_id_key,
      label_name=FLAGS.label_key,
      label_list=FLAGS.label_list,
      save_model_dir=FLAGS.save_model_dir,
      save_predictions_dir=FLAGS.save_predictions_dir,
      eval_metrics=FLAGS.eval_metrics,
  )
  logging.info('exp_params: %s', exp_params)

  # Make and run beam pipeline.
  beam_options = None
  if run_data_prep:
    input_filenames_list, output_filenames = _remove_existing_outputs(
        input_filenames_list, output_filenames)
    logging.info('Data prep on: %s, %s...', input_filenames_list,
                 output_filenames)
    with beam.Pipeline(beam_options) as root:
      for i, (input_filenames_or_glob, output_filename) in enumerate(
          zip(input_filenames_list, output_filenames)):
        utils.data_prep_pipeline(
            root=root,
            input_filenames_or_glob=input_filenames_or_glob,
            output_filename=output_filename,
            data_prep_behavior=FLAGS.data_prep_behavior,
            beam_params=prep_params,
            suffix=str(i))

  # Check that previous beam pipeline wrote outputs.
  sklearn_utils.validate_flags(train_glob, eval_glob, test_glob,
                               sklearn_results_output_file)

  logging.info('Eval sklearn...')
  with beam.Pipeline(beam_options) as root:
    _ = (root
         | 'MakeCollection' >> beam.Create(exp_params)
         | 'CalcScores' >> beam.Map(
             lambda d: (d, sklearn_utils.train_and_get_score(**d)))
         | 'FormatText' >> beam.Map(sklearn_utils.format_text_line)
         | 'Reshuffle' >> beam.Reshuffle()
         | 'WriteOutput' >> beam.io.WriteToText(
             sklearn_results_output_file, num_shards=1))
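# The `main` above calls `_remove_existing_outputs`, which is defined elsewhere
# in the module. Judging from its call site, it presumably drops
# (input, output) pairs whose data-prep outputs were already written, so the
# prep stage isn't redone. The following is only a minimal sketch under that
# assumption (not the repo's actual implementation), using tf.io.gfile for
# filesystem access.
import tensorflow as tf


def _remove_existing_outputs(input_filenames_list, output_filenames):
  """Hypothetical sketch: skip (input, output) pairs whose outputs exist."""
  kept_inputs, kept_outputs = [], []
  for inputs, output in zip(input_filenames_list, output_filenames):
    # Data prep writes sharded files prefixed by `output`, so glob for them.
    if tf.io.gfile.glob(f'{output}*'):
      continue  # Output already written; no need to rerun data prep for it.
    kept_inputs.append(inputs)
    kept_outputs.append(output)
  return kept_inputs, kept_outputs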
def main(unused_argv):
  assert file_utils.Glob(FLAGS.train_glob), FLAGS.train_glob
  assert file_utils.Glob(FLAGS.eval_glob), FLAGS.eval_glob
  assert file_utils.Glob(FLAGS.test_glob), FLAGS.test_glob

  # Create output directory if it doesn't already exist.
  outdir = os.path.dirname(FLAGS.output_file)
  file_utils.MaybeMakeDirs(outdir)

  # Enumerate the configurations we want to run.
  exp_params = []
  model_names = models.get_sklearn_models().keys()
  for elem in itertools.product(*[FLAGS.embedding_list, model_names]):

    def _params_dict(l2_normalization,
                     speaker_id_name=FLAGS.speaker_id_name,
                     elem=elem):
      return {
          'embedding_name': elem[0],
          'model_name': elem[1],
          'label_name': FLAGS.label_name,
          'label_list': FLAGS.label_list,
          'train_glob': FLAGS.train_glob,
          'eval_glob': FLAGS.eval_glob,
          'test_glob': FLAGS.test_glob,
          'l2_normalization': l2_normalization,
          'speaker_id_name': speaker_id_name,
          'save_model_dir': FLAGS.save_model_dir,
          'save_predictions_dir': FLAGS.save_predictions_dir,
          'eval_metric': FLAGS.eval_metric,
      }

    exp_params.append(_params_dict(l2_normalization=True))
    exp_params.append(_params_dict(l2_normalization=False))
    if FLAGS.speaker_id_name is not None:
      exp_params.append(
          _params_dict(l2_normalization=True, speaker_id_name=None))
      exp_params.append(
          _params_dict(l2_normalization=False, speaker_id_name=None))

  # Make and run beam pipeline.
  beam_options = None
  logging.info('Starting to create flume pipeline...')
  with beam.Pipeline(beam_options) as root:
    _ = (root
         | 'MakeCollection' >> beam.Create(exp_params)
         | 'CalcScores' >> beam.Map(
             lambda d: (d, train_and_eval_sklearn.train_and_get_score(**d)))
         | 'FormatText' >> beam.Map(format_text_line)
         | 'Reshuffle' >> beam.Reshuffle()
         | 'WriteOutput' >> beam.io.WriteToText(FLAGS.output_file,
                                                num_shards=1))
def main(unused_argv):
  assert file_utils.Glob(FLAGS.train_glob), FLAGS.train_glob
  assert file_utils.Glob(FLAGS.eval_glob), FLAGS.eval_glob
  assert file_utils.Glob(FLAGS.test_glob), FLAGS.test_glob

  # Create output directory if it doesn't already exist.
  outdir = os.path.dirname(FLAGS.output_file)
  file_utils.MaybeMakeDirs(outdir)

  # Enumerate the configurations we want to run.
  exp_params = []
  model_names = models.get_sklearn_models().keys()
  for elem in itertools.product(*[FLAGS.embedding_list, model_names]):
    exp_params.append({
        'embedding_name': elem[0],
        'model_name': elem[1],
        'label_name': FLAGS.label_name,
        'label_list': FLAGS.label_list,
        'train_glob': FLAGS.train_glob,
        'eval_glob': FLAGS.eval_glob,
        'test_glob': FLAGS.test_glob,
        # Either L2 normalization or speaker normalization. You could try both
        # if you wanted.
        'l2_normalization': FLAGS.speaker_id_name is None,
        'speaker_id_name': FLAGS.speaker_id_name,
        'save_model_dir': FLAGS.save_model_dir,
        'calculate_equal_error_rate': FLAGS.calculate_equal_error_rate,
    })

  # Make and run beam pipeline.
  beam_options = None
  logging.info('Starting to create flume pipeline...')
  with beam.Pipeline(beam_options) as root:
    _ = (root
         | 'MakeCollection' >> beam.Create(exp_params)
         | 'CalcScores' >> beam.Map(
             lambda d: (d, train_and_eval_sklearn.train_and_get_score(**d)))
         | 'FormatText' >> beam.Map(format_text_line)
         | 'Reshuffle' >> beam.Reshuffle()
         | 'WriteOutput' >> beam.io.WriteToText(FLAGS.output_file,
                                                num_shards=1))
def main(unused_argv):
  assert file_utils.Glob(FLAGS.train_glob), FLAGS.train_glob
  assert file_utils.Glob(FLAGS.eval_glob), FLAGS.eval_glob
  assert file_utils.Glob(FLAGS.test_glob), FLAGS.test_glob

  # Create output directory if it doesn't already exist.
  outdir = os.path.dirname(FLAGS.output_file)
  file_utils.MaybeMakeDirs(outdir)

  # Enumerate the configurations we want to run.
  exp_params = []
  model_names = models.get_sklearn_models().keys()
  for elem in itertools.product(*[FLAGS.embedding_list, model_names]):
    exp_params.append({
        'embedding_name': elem[0],
        'model_name': elem[1],
        'label_name': FLAGS.label_name,
        'label_list': FLAGS.label_list,
        'train_glob': FLAGS.train_glob,
        'eval_glob': FLAGS.eval_glob,
        'test_glob': FLAGS.test_glob,
        # Either L2 normalization or speaker normalization. You could try both
        # if you wanted.
        'l2_normalization': FLAGS.speaker_id_name is None,
        'speaker_id_name': FLAGS.speaker_id_name,
    })

  # Make and run beam pipeline.
  p = beam.Pipeline()
  _ = (p
       | 'MakeCollection' >> beam.Create(exp_params)
       | 'CalcScores' >> beam.Map(
           lambda d: (d, train_and_eval_sklearn.train_and_get_score(**d)))
       | 'FormatText' >> beam.Map(format_text_line)
       | 'WriteOutput' >> beam.io.WriteToText(FLAGS.output_file, num_shards=1))
  result = p.run()
  result.wait_until_finish()
def main(unused_argv):
  # Data prep setup.
  run_data_prep = True
  if FLAGS.train_input_glob:
    assert FLAGS.validation_input_glob
    assert FLAGS.test_input_glob
    input_filenames_list, output_filenames = [], []
    for input_glob in [
        FLAGS.train_input_glob,
        FLAGS.validation_input_glob,
        FLAGS.test_input_glob,
    ]:
      FLAGS.input_glob = input_glob
      cur_inputs, cur_outputs, beam_params = (
          data_prep_utils.get_beam_params_from_flags())
      input_filenames_list.extend(cur_inputs)
      output_filenames.extend(cur_outputs)
  else:
    input_filenames_list, output_filenames, beam_params = (
        data_prep_utils.get_beam_params_from_flags())
  assert input_filenames_list, input_filenames_list
  assert output_filenames, output_filenames
  try:
    # Check that inputs and flags are formatted correctly.
    data_prep_utils.validate_inputs(
        input_filenames_list, output_filenames,
        beam_params['embedding_modules'], beam_params['embedding_names'],
        beam_params['module_output_keys'])
  except ValueError:
    if FLAGS.skip_existing_error:
      run_data_prep = False
    else:
      raise
  logging.info('beam_params: %s', beam_params)

  # Generate sklearn eval experiment parameters based on data prep flags.
  if len(output_filenames) != 3:
    raise ValueError(f'Data prep output must be 3 files: {output_filenames}')
  # Make them globs.
  train_glob, eval_glob, test_glob = [f'{x}*' for x in output_filenames]
  sklearn_results_output_file = FLAGS.results_output_file
  exp_params = sklearn_utils.experiment_params(
      embedding_list=beam_params['embedding_names'],
      speaker_id_name=FLAGS.speaker_id_key,
      label_name=FLAGS.label_key,
      label_list=FLAGS.label_list,
      train_glob=train_glob,
      eval_glob=eval_glob,
      test_glob=test_glob,
      save_model_dir=None,
      save_predictions_dir=None,
      eval_metric=FLAGS.eval_metric,
  )
  logging.info('exp_params: %s', exp_params)

  # Make and run beam pipeline.
  beam_options = None
  if run_data_prep:
    logging.info('Data prep on: %s, %s...', input_filenames_list,
                 output_filenames)
    with beam.Pipeline(beam_options) as root:
      for i, (input_filenames_or_glob, output_filename) in enumerate(
          zip(input_filenames_list, output_filenames)):
        data_prep_utils.make_beam_pipeline(
            root,
            input_filenames=input_filenames_or_glob,
            output_filename=output_filename,
            suffix=str(i),
            **beam_params)

  # Check that previous beam pipeline wrote outputs.
  sklearn_utils.validate_flags(train_glob, eval_glob, test_glob,
                               sklearn_results_output_file)

  logging.info('Eval sklearn...')
  with beam.Pipeline(beam_options) as root:
    _ = (
        root
        | 'MakeCollection' >> beam.Create(exp_params)
        | 'CalcScores' >> beam.Map(
            lambda d: (d, sklearn_utils.train_and_get_score(**d)))
        | 'FormatText' >> beam.Map(sklearn_utils.format_text_line)
        | 'Reshuffle' >> beam.Reshuffle()
        | 'WriteOutput' >> beam.io.WriteToText(
            sklearn_results_output_file, num_shards=1))
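# Each pipeline above maps `format_text_line` (or
# `sklearn_utils.format_text_line`) over (params, scores) tuples before
# writing a text file. The real helper lives elsewhere in the repo; the
# following is only an illustrative sketch of such a formatter, assuming the
# score payload is the (eval_score, test_score) pair produced by
# `train_and_get_score` above.
from typing import Any, Dict, Tuple


def format_text_line(
    key_value: Tuple[Dict[str, Any], Tuple[float, float]]) -> str:
  """Hypothetical sketch: render one (params, scores) result as a text line."""
  params, (eval_score, test_score) = key_value
  # Keep the fields that distinguish experiments; drop the shared file globs.
  tag = ', '.join(f'{k}={v}' for k, v in sorted(params.items())
                  if k not in ('train_glob', 'eval_glob', 'test_glob'))
  return f'{tag} | eval={eval_score:.4f} test={test_score:.4f}'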