def run_training(session, config=FLAGS):
    save_config(config.summaries_dir, config)
    train_files = get_tfrecord_files(config)
    batch_number = len(train_files) // config.batch_size
    logging.info('Total number of batches %d' % batch_number)

    params = tf.contrib.training.HParams(
        learning_rate=config.learning_rate,
        pkeep_conv=0.75,
        device=config.device,
        epoch=config.epoch,
        batch_size=config.batch_size,
        min_eval_frequency=500,
        train_steps=None,  # Use the train feeder until it's empty
        eval_steps=1,  # Use 1 step of the evaluation feeder
        train_files=train_files)
    run_config = tf.contrib.learn.RunConfig(model_dir=config.checkpoint_dir)

    learn_runner.run(
        experiment_fn=experiment_fn,  # First-class function
        run_config=run_config,  # RunConfig
        schedule="train",  # What to run
        hparams=params)  # HParams
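For reference, a minimal sketch (not taken from any of the examples on this page) of the experiment_fn shape that this run_config/hparams calling convention expects: learn_runner.run invokes it with the RunConfig and HParams, and it must return a tf.contrib.learn.Experiment. my_model_fn, my_train_input_fn, and my_eval_input_fn are hypothetical placeholders.

import tensorflow as tf

def experiment_fn(run_config, hparams):
    # learn_runner.run passes the RunConfig and HParams through to us.
    estimator = tf.contrib.learn.Estimator(
        model_fn=my_model_fn,  # hypothetical model function
        config=run_config,
        params=hparams)
    return tf.contrib.learn.Experiment(
        estimator,
        train_input_fn=my_train_input_fn,  # hypothetical input functions
        eval_input_fn=my_eval_input_fn,
        train_steps=hparams.train_steps,
        eval_steps=hparams.eval_steps,
        min_eval_frequency=hparams.min_eval_frequency)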
def test_fail_invalid_hparams_type(self):
    run_config = run_config_lib.RunConfig(model_dir=_MODIR_DIR)
    with self.assertRaisesRegexp(ValueError, _INVALID_HPARAMS_ERR_MSG):
        learn_runner.run(build_experiment_for_run_config,
                         run_config=run_config,
                         schedule="local_run",
                         hparams=["hparams"])
def main(argv=None):
    """Runs a TensorFlow model on the Criteo dataset."""
    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    # First find out if there's a task value on the environment variable.
    # If there is none or it is empty, define a default one.
    task_data = env.get('task') or {'type': 'master', 'index': 0}

    argv = sys.argv if argv is None else argv
    args = create_parser().parse_args(args=argv[1:])

    trial = task_data.get('trial')
    if trial is not None:
        output_dir = os.path.join(args.output_path, trial)
    else:
        output_dir = args.output_path

    # Do only evaluation if instructed so; otherwise call Experiment's run.
    if args.eval_only_summary_filename:
        experiment = get_experiment_fn(args)(output_dir)
        # Note that evaluation here will appear as 'one_pass' in TensorBoard.
        results = experiment.evaluate(delay_secs=0)
        # Convert numpy types to native types for json dumps.
        json_out = json.dumps(
            {key: value.tolist() for key, value in results.iteritems()})
        with tf.Session():
            tf.write_file(args.eval_only_summary_filename, json_out).run()
    else:
        learn_runner.run(experiment_fn=get_experiment_fn(args),
                         output_dir=output_dir)
def main():
    args_parser = argparse.ArgumentParser()
    args = parameters.initialise_arguments(args_parser)
    parameters.HYPER_PARAMS = hparam.HParams(**args.__dict__)

    # Set Python-level verbosity
    tf.logging.set_verbosity(args.verbosity)
    # Set C++ graph-execution-level verbosity
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = str(
        tf.logging.__dict__[args.verbosity] / 10)

    # Directory to store the output model and checkpoints
    output_dir = args.job_dir

    # Run the training job
    learn_runner.run(
        experiment.generate_experiment_fn(
            min_eval_frequency=args.min_eval_frequency,
            eval_delay_secs=args.eval_delay_secs,
            train_steps=args.train_steps,
            eval_steps=args.eval_steps,
            export_strategies=[
                saved_model_export_utils.make_export_strategy(
                    serving.SERVING_FUNCTIONS[args.export_format],
                    exports_to_keep=1,
                    default_output_alternative_key=None,
                )
            ]),
        run_config=run_config.RunConfig(model_dir=output_dir),
        hparams=parameters.HYPER_PARAMS)
def run(data_dir, model, output_dir, train_steps, eval_steps, schedule):
    """Runs an Estimator locally or distributed.

    Args:
        data_dir: The directory the data can be found in.
        model: The name of the model to use.
        output_dir: The directory to store outputs in.
        train_steps: The number of steps to run training for.
        eval_steps: The number of steps to run evaluation for.
        schedule: (str) The schedule to run. The value here must be the name
            of one of Experiment's methods.
    """
    exp_fn = make_experiment_fn(data_dir=data_dir,
                                model_name=model,
                                train_steps=train_steps,
                                eval_steps=eval_steps)

    # Create hparams and run_config
    run_config = create_run_config(output_dir)
    hparams = create_hparams(FLAGS.hparams_set,
                             data_dir,
                             passed_hparams=FLAGS.hparams)

    if is_chief():
        save_metadata(output_dir, hparams)

    learn_runner.run(experiment_fn=exp_fn,
                     schedule=schedule,
                     run_config=run_config,
                     hparams=hparams)
def main(_argv):
    """The entrypoint for the script."""
    if not FLAGS.output_dir:
        FLAGS.output_dir = tempfile.mkdtemp()
    learn_runner.run(experiment_fn=create_experiment,
                     output_dir=FLAGS.output_dir,
                     schedule=FLAGS.schedule)
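By contrast with the run_config/hparams convention shown earlier, the older output_dir convention used here passes a single output directory to the experiment function. A minimal sketch, again with hypothetical model and input functions:

import tensorflow as tf

def create_experiment(output_dir):
    # learn_runner.run(experiment_fn, output_dir) calls this with the
    # output directory and expects a tf.contrib.learn.Experiment back.
    estimator = tf.contrib.learn.Estimator(
        model_fn=my_model_fn,  # hypothetical model function
        model_dir=output_dir)
    return tf.contrib.learn.Experiment(
        estimator,
        train_input_fn=my_train_input_fn,  # hypothetical input functions
        eval_input_fn=my_eval_input_fn,
        train_steps=1000,
        eval_steps=100)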
def train(self):
    experiment_fn = self._generate_experiment_fn()
    hparams = HParams(**self.customer_params)
    learn_runner.run(experiment_fn,
                     run_config=self._build_run_config(),
                     hparams=hparams)
def main(argv=None):
    """Runs a TensorFlow model on the Iris dataset."""
    args = parse_arguments(sys.argv if argv is None else argv)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    # First find out if there's a task value on the environment variable.
    # If there is none or it is empty, define a default one.
    task_data = env.get('task') or {'type': 'master', 'index': 0}
    trial = task_data.get('trial')
    if trial is not None:
        output_dir = os.path.join(args.output_path, trial)
    else:
        output_dir = args.output_path

    learn_runner.run(
        experiment_fn=make_experiment_fn(
            train_data_paths=args.train_data_paths,
            eval_data_paths=args.eval_data_paths,
            metadata_path=args.metadata_path,
            max_steps=args.max_steps,
            layer1_size=args.layer1_size,
            layer2_size=args.layer2_size,
            learning_rate=args.learning_rate,
            epsilon=args.epsilon,
            batch_size=args.batch_size,
            eval_batch_size=args.eval_batch_size),
        output_dir=output_dir)
def train_and_evaluate(args):
    train_steps = int(0.5 + (1.0 * args["num_epochs"] * args["nusers"]) /
                      args["batch_size"])
    steps_in_epoch = int(0.5 + args["nusers"] / args["batch_size"])
    print("Will train for {} steps, evaluating once every {} steps".format(
        train_steps, steps_in_epoch))

    def experiment_fn(output_dir):
        return tf.contrib.learn.Experiment(
            tf.contrib.factorization.WALSMatrixFactorization(
                num_rows=args["nusers"],
                num_cols=args["nitems"],
                embedding_dimension=args["n_embeds"],
                model_dir=args["output_dir"]),
            train_input_fn=read_dataset(tf.estimator.ModeKeys.TRAIN, args),
            eval_input_fn=read_dataset(tf.estimator.ModeKeys.EVAL, args),
            train_steps=train_steps,
            eval_steps=1,
            min_eval_frequency=steps_in_epoch)

    from tensorflow.contrib.learn.python.learn import learn_runner
    learn_runner.run(experiment_fn=experiment_fn,
                     output_dir=args["output_dir"])

    batch_predict(args)
def main(_argv):
    """The entrypoint for the script."""
    # Parse YAML FLAGS
    FLAGS.hooks = _maybe_load_yaml(FLAGS.hooks)
    FLAGS.metrics = _maybe_load_yaml(FLAGS.metrics)
    FLAGS.model_params = _maybe_load_yaml(FLAGS.model_params)
    FLAGS.input_pipeline_train = _maybe_load_yaml(FLAGS.input_pipeline_train)
    FLAGS.input_pipeline_dev = _maybe_load_yaml(FLAGS.input_pipeline_dev)

    # Load flags from the config files and merge them all into a single dict.
    final_config = {}
    if FLAGS.config_paths:
        for config_path in FLAGS.config_paths.split(","):
            config_path = config_path.strip()
            if not config_path:
                continue
            config_path = os.path.abspath(config_path)
            tf.logging.info("Loading config from %s", config_path)
            with gfile.GFile(config_path) as config_file:
                config_flags = yaml.load(config_file)
                final_config = _deep_merge_dict(final_config, config_flags)

    # Print the flags and values read from all the config files.
    tf.logging.info("Final Config:\n%s", yaml.dump(final_config))

    # Merge the config values into FLAGS. For dict-valued flags, values
    # already set on FLAGS take precedence over the config files.
    for flag_key, flag_value in final_config.items():
        if hasattr(FLAGS, flag_key) and isinstance(getattr(FLAGS, flag_key), dict):
            merged_value = _deep_merge_dict(flag_value, getattr(FLAGS, flag_key))
            setattr(FLAGS, flag_key, merged_value)
        elif hasattr(FLAGS, flag_key):
            setattr(FLAGS, flag_key, flag_value)
        else:
            tf.logging.warning("Ignoring config flag: %s", flag_key)

    if FLAGS.save_checkpoints_secs is None \
            and FLAGS.save_checkpoints_steps is None:
        FLAGS.save_checkpoints_secs = 600
        tf.logging.info("Setting save_checkpoints_secs to %d",
                        FLAGS.save_checkpoints_secs)

    if not FLAGS.output_dir:
        # Create a temporary directory if none was given.
        FLAGS.output_dir = tempfile.mkdtemp()

    if not FLAGS.input_pipeline_train:
        raise ValueError("You must specify input_pipeline_train")
    if not FLAGS.input_pipeline_dev:
        raise ValueError("You must specify input_pipeline_dev")

    learn_runner.run(experiment_fn=create_experiment,
                     output_dir=FLAGS.output_dir,
                     schedule=FLAGS.schedule)
def run(data_dir, model, output_dir, train_steps, eval_steps, schedule):
    """Runs an Estimator locally or distributed.

    This function chooses one of two paths to execute:
    1. Running locally if schedule == "local_run".
    2. Distributed training/evaluation otherwise.

    Args:
        data_dir: The directory the data can be found in.
        model: The name of the model to use.
        output_dir: The directory to store outputs in.
        train_steps: The number of steps to run training for.
        eval_steps: The number of steps to run evaluation for.
        schedule: (str) The schedule to run. The value here must be the name
            of one of Experiment's methods.
    """
    exp_fn = make_experiment_fn(
        data_dir=data_dir,
        model_name=model,
        train_steps=train_steps,
        eval_steps=eval_steps)

    if schedule == "local_run":
        # Run the local demo.
        exp = exp_fn(output_dir)
        if exp.train_steps > 0 or exp.eval_steps > 0:
            tf.logging.info("Performing local training and evaluation.")
            exp.train_and_evaluate()
        decode(exp.estimator)
    else:
        # Perform distributed training/evaluation.
        learn_runner.run(
            experiment_fn=exp_fn,
            schedule=schedule,
            output_dir=output_dir)
def test_fail_output_dir_and_run_config_are_both_set(self):
    with self.assertRaisesRegexp(
            ValueError, _CANNOT_SET_BOTH_OUTPUT_DIR_AND_CONFIG_MSG):
        learn_runner.run(build_experiment,
                         output_dir=_MODIR_DIR,
                         schedule="simple_task",
                         run_config=run_config_lib.RunConfig())
def main(argv=None):
    args = parse_arguments(sys.argv if argv is None else argv)
    tf.logging.set_verbosity(tf.logging.INFO)
    learn_runner.run(
        experiment_fn=get_experiment_fn(args),
        output_dir=args.job_dir)
def main():
    """Entrypoint for training."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--train-files',
                        help='Training files pattern globstring',
                        default='data/training/*.jpg')
    parser.add_argument('--eval-files',
                        help='Evaluation files pattern globstring',
                        default='data/validation/*.jpg')
    parser.add_argument(
        '--job-dir',
        help='Location to write checkpoints, summaries, and export models',
        required=True)
    parser.add_argument('--num-epochs',
                        help='Maximum number of epochs on which to train',
                        default=1,
                        type=int)
    parser.add_argument('--batch-size',
                        help='Batch size for training steps',
                        type=int,
                        default=128)
    args = parser.parse_args()

    tf.logging.set_verbosity(tf.logging.INFO)

    experiment_fn = generate_experiment_fn(train_files=args.train_files,
                                           eval_files=args.eval_files,
                                           batch_size=args.batch_size,
                                           num_epochs=args.num_epochs)
    learn_runner.run(experiment_fn, args.job_dir)
def train_and_evaluate(args):
    train_steps = int(0.5 + (1.0 * args['num_epochs'] * args['nusers']) /
                      args['batch_size'])
    steps_in_epoch = int(0.5 + args['nusers'] / args['batch_size'])
    print('Will train for {} steps, evaluating once every {} steps'.format(
        train_steps, steps_in_epoch))

    def experiment_fn(output_dir):
        return tf.contrib.learn.Experiment(
            tf.contrib.factorization.WALSMatrixFactorization(
                num_rows=args['nusers'],
                num_cols=args['nitems'],
                embedding_dimension=args['n_embeds'],
                model_dir=args['output_dir']),
            train_input_fn=read_dataset(tf.estimator.ModeKeys.TRAIN, args),
            eval_input_fn=read_dataset(tf.estimator.ModeKeys.EVAL, args),
            train_steps=train_steps,
            eval_steps=1,
            min_eval_frequency=steps_in_epoch,
            export_strategies=tf.contrib.learn.utils.saved_model_export_utils
            .make_export_strategy(
                serving_input_fn=create_serving_input_fn(args)))

    from tensorflow.contrib.learn.python.learn import learn_runner
    learn_runner.run(experiment_fn, args['output_dir'])

    batch_predict(args)
def train_and_evaluate(args):
    train_steps = int(0.5 +
                      (1.0 * args['num_epochs'] * args['n_interactions']) /
                      args['batch_size'])
    print('Will train for {} steps'.format(train_steps))

    def experiment_fn(output_dir):
        return tf.contrib.learn.Experiment(
            tf.contrib.factorization.WALSMatrixFactorization(
                num_rows=args['n_users'],
                num_cols=args['n_items'],
                embedding_dimension=args['n_embeds'],
                model_dir=args['output_dir']),
            train_input_fn=read_dataset(args['train_path'],
                                        tf.estimator.ModeKeys.TRAIN, args),
            eval_input_fn=read_dataset(args['train_path'],
                                       tf.estimator.ModeKeys.EVAL, args),
            export_strategies=[
                saved_model_export_utils.make_export_strategy(
                    serving_input_fn,
                    default_output_alternative_key=None,
                    exports_to_keep=1)
            ],
            train_steps=train_steps,
            eval_steps=None)

    from tensorflow.contrib.learn.python.learn import learn_runner
    learn_runner.run(experiment_fn, args['output_dir'])
def run(data_dir, model, output_dir, train_steps, eval_steps, schedule):
    """Runs an Estimator locally or distributed.

    Args:
        data_dir: The directory the data can be found in.
        model: The name of the model to use.
        output_dir: The directory to store outputs in.
        train_steps: The number of steps to run training for.
        eval_steps: The number of steps to run evaluation for.
        schedule: (str) The schedule to run. The value here must be the name
            of one of Experiment's methods.
    """
    exp_fn = make_experiment_fn(
        data_dir=data_dir,
        model_name=model,
        train_steps=train_steps,
        eval_steps=eval_steps)

    # Create hparams and run_config
    run_config = create_run_config(output_dir)
    hparams = create_hparams(
        FLAGS.hparams_set, data_dir, passed_hparams=FLAGS.hparams)

    if is_chief():
        save_metadata(output_dir, hparams)

    learn_runner.run(
        experiment_fn=exp_fn,
        schedule=schedule,
        run_config=run_config,
        hparams=hparams)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir',
                        default='data/counting_mnist/',
                        help='Counting MNIST data directory')
    parser.add_argument('--batch-size', default=32, type=int,
                        help='Batch size')
    parser.add_argument('--learning-rate', default=1e-4, type=float,
                        help='Learning rate')
    parser.add_argument('--train-steps', default=100000, type=int,
                        help='Maximum number of training steps')
    parser.add_argument('--seed', help='Random seed', type=int,
                        default=random.randint(0, 2**32))
    parser.add_argument('--job-dir', default='jobs/', help='Job directory')
    args, _ = parser.parse_known_args()

    if args.seed is not None:
        random.seed(args.seed)
        np.random.seed(args.seed)

    tf.logging.set_verbosity(tf.logging.INFO)

    experiment_fn = generate_experiment_fn(args)
    learn_runner.run(experiment_fn, args.job_dir)
def main(_):
    # Parse hparams from FLAGS. An example of the format:
    # --hparams="model__optimizer__learning_rate=0.1,model__min_kernel_size=3"
    hparams = default_hparams().parse(FLAGS.hparams)
    experiment_config = config_utils.load_experiment_config(
        FLAGS.experiment_config)
    if FLAGS.train_path is not None:
        experiment_config.train_sources[0] = FLAGS.train_path
    if FLAGS.eval_path is not None:
        experiment_config.eval_sources[0] = FLAGS.eval_path
    best_model_dir = get_best_model_dir(FLAGS.warm_start_from)
    experiment_fn = experiment.get_experiment_fn(
        experiment_config,
        warm_start_from=best_model_dir,
        train_steps=FLAGS.num_train_steps,
        eval_steps=FLAGS.num_eval_steps,
        continuous_eval_throttle_secs=FLAGS.continuous_eval_throttle_secs,
        eval_delay_secs=0)
    # TODO: migrate to tf.estimator.RunConfig.
    run_config = learn_runner.EstimatorConfig(
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        save_summary_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=FLAGS.keep_checkpoint_max)
    learn_runner.run(experiment_fn=experiment_fn,
                     run_config=run_config,
                     hparams=hparams)
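For reference, a minimal sketch (not from the original source) of the core-RunConfig equivalent that the migration comment above refers to; model_dir, save_checkpoints_steps, save_summary_steps, and keep_checkpoint_max are all real tf.estimator.RunConfig constructor arguments.

import tensorflow as tf

# Core equivalent of the contrib config above, assuming the same FLAGS.
run_config = tf.estimator.RunConfig(
    model_dir=FLAGS.output_dir,
    save_checkpoints_steps=FLAGS.save_checkpoints_steps,
    save_summary_steps=FLAGS.save_checkpoints_steps,
    keep_checkpoint_max=FLAGS.keep_checkpoint_max)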
def main(_argv):
    if FLAGS.save_checkpoints_secs is None \
            and FLAGS.save_checkpoints_steps is None:
        FLAGS.save_checkpoints_secs = 600
        tf.logging.info("Setting save_checkpoints_secs to %d",
                        FLAGS.save_checkpoints_secs)

    if not FLAGS.source_vocab_path or not FLAGS.target_vocab_path:
        raise ValueError(
            "You must specify source_vocab_path and target_vocab_path")

    if not FLAGS.output_dir:
        FLAGS.output_dir = tempfile.mkdtemp()

    if not FLAGS.source_files or not FLAGS.target_files:
        raise ValueError("You must specify source_files and target_files")

    FLAGS.source_files = FLAGS.source_files.strip().split(',')
    print(FLAGS.source_files)
    FLAGS.target_files = FLAGS.target_files.strip().split(',')

    if not FLAGS.dev_source_files or not FLAGS.dev_target_files:
        raise ValueError(
            "You must specify dev_source_files and dev_target_files")

    learn_runner.run(experiment_fn=create_experiment,
                     output_dir=FLAGS.output_dir,
                     schedule=FLAGS.schedule)
def main():
    tf.logging.set_verbosity(tf.logging.DEBUG)
    parsed_args = get_parser().parse_args()

    session_config = tf.ConfigProto(allow_soft_placement=True)
    session_config.gpu_options.allow_growth = True
    run_config = RunConfig(session_config=session_config)
    run_config = run_config.replace(model_dir=get_model_dir(parsed_args))

    params = HParams(learning_rate=parsed_args.lr,
                     train_steps=parsed_args.train_steps,
                     steps_per_eval=parsed_args.steps_per_eval,
                     batch_size=parsed_args.batch_size,
                     vgg_model_path=parsed_args.vgg_model_path,
                     selector=parsed_args.selector,
                     dropout=parsed_args.dropout,
                     ctx2out=parsed_args.ctx2out,
                     prev2out=parsed_args.prev2out,
                     dataset=parsed_args.dataset,
                     eval_steps=parsed_args.eval_steps,
                     hard_attention=parsed_args.hard_attention,
                     use_sampler=parsed_args.use_sampler,
                     bin_size=14)

    learn_runner.run(experiment_fn=experiment_fn_inner,
                     run_config=run_config,
                     schedule="continuous_train_and_eval",
                     hparams=params)
def main():
    """Entrypoint for training."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-dir',
                        help='Directory containing data',
                        required=True)  # default='CBT/data/records/'
    parser.add_argument('--dataset-id',
                        help='Unique id identifying dataset',
                        required=True)
    parser.add_argument(
        '--job-dir',
        help='Location to write checkpoints, summaries, and export models',
        required=True)
    parser.add_argument('--num-epochs',
                        help='Maximum number of epochs on which to train',
                        default=200,
                        type=int)
    parser.add_argument('--lr-min',
                        help='Minimum learning rate',
                        default=2e-4,
                        type=float)
    parser.add_argument('--lr-max',
                        help='Maximum learning rate',
                        default=1e-2,
                        type=float)
    parser.add_argument('--lr-step-size',
                        help='Learning rate step size (in epochs)',
                        default=10,
                        type=int)
    parser.add_argument('--grad-noise',
                        help='Gradient noise scale',
                        default=0.005,
                        type=float)
    parser.add_argument('--gpu', help='GPU ID to use', default=0, type=int)
    parser.add_argument(
        '--general',
        help='Uses the general model instead of the simplified one.',
        action='store_true')
    args = parser.parse_args()

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    tf.logging.set_verbosity(tf.logging.INFO)
    print(args.general)

    experiment_fn = generate_experiment_fn(
        data_dir=args.data_dir,
        dataset_id=args.dataset_id,
        num_epochs=args.num_epochs,
        learning_rate_min=args.lr_min,
        learning_rate_max=args.lr_max,
        learning_rate_step_size=args.lr_step_size,
        gradient_noise_scale=args.grad_noise,
        is_general=args.general)
    learn_runner.run(experiment_fn, args.job_dir)
def test_fail_hparams_are_set(self):
    hparams = _HPARAMS
    with self.assertRaisesRegexp(
            ValueError, _HPARAMS_CANNOT_BE_SET_FOR_OUTPUT_DIR_MSG):
        learn_runner.run(build_experiment,
                         _MODIR_DIR,
                         schedule="simple_task",
                         hparams=hparams)
def main(argv=None):
    """Runs a TensorFlow model on the Iris dataset."""
    args = parse_arguments(sys.argv if argv is None else argv)
    tf.logging.set_verbosity(tf.logging.INFO)
    learn_runner.run(
        experiment_fn=get_experiment_fn(args),
        output_dir=args.job_dir)
def main(unused_argv):
    tf.flags.mark_flag_as_required('model_dir')
    tf.flags.mark_flag_as_required('pipeline_config_path')
    config = tf.contrib.learn.RunConfig(model_dir=FLAGS.model_dir)
    learn_runner.run(experiment_fn=build_experiment_fn(FLAGS.num_train_steps,
                                                       FLAGS.num_eval_steps),
                     run_config=config,
                     hparams=model_hparams.create_hparams())
def main(argv=None):
    args = parse_arguments(sys.argv if argv is None else argv)
    local_analysis(args)
    set_logging_level(args)

    # Suppress TensorFlow debugging info.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    learn_runner.run(experiment_fn=get_experiment_fn(args),
                     output_dir=args.job_dir)
def main(unused_argv):
    tf.flags.mark_flag_as_required('model_dir')
    tf.flags.mark_flag_as_required('pipeline_config_path')
    config = tf.contrib.learn.RunConfig(model_dir=FLAGS.model_dir)
    learn_runner.run(
        experiment_fn=build_experiment_fn(FLAGS.num_train_steps,
                                          FLAGS.num_eval_steps),
        run_config=config,
        hparams=model_hparams.create_hparams())
def test_fail_not_experiment(self):
    def _experiment_fn(run_config, hparams):
        del run_config, hparams  # unused.
        return "not experiment"

    run_config = run_config_lib.RunConfig(model_dir=_MODIR_DIR)
    with self.assertRaisesRegexp(TypeError, _NOT_EXP_TYPE_MSG):
        learn_runner.run(_experiment_fn,
                         run_config=run_config,
                         schedule="simple_task")
def main(_argv):
    """Main function."""
    schedules = ['train', 'evaluate', 'continuous_eval']
    assert FLAGS.schedule in schedules, \
        "Only these schedules are supported: %s" % ','.join(schedules)
    learn_runner.run(experiment_fn=_create_experiment,
                     output_dir=FLAGS.output_dir,
                     schedule=FLAGS.schedule)
def main(argv=None):
    args = parse_arguments(sys.argv if argv is None else argv)
    local_analysis(args)
    set_logging_level(args)

    # Suppress TensorFlow debugging info.
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    learn_runner.run(
        experiment_fn=get_experiment_fn(args),
        output_dir=args.job_dir)
def main():
    """Entrypoint for training."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--data-dir', help='Directory containing data',
        default='data/babi/records/')
    parser.add_argument(
        '--dataset-id', help='Unique id identifying dataset', required=True)
    parser.add_argument(
        '--job-dir',
        help='Location to write checkpoints, summaries, and export models',
        required=True)
    parser.add_argument(
        '--num-epochs', help='Maximum number of epochs on which to train',
        default=200, type=int)
    parser.add_argument(
        '--lr-min', help='Minimum learning rate', default=2e-4, type=float)
    parser.add_argument(
        '--lr-max', help='Maximum learning rate', default=1e-2, type=float)
    parser.add_argument(
        '--lr-step-size', help='Learning rate step size (in epochs)',
        default=10, type=int)
    parser.add_argument(
        '--grad-noise', help='Gradient noise scale', default=0.005,
        type=float)
    args = parser.parse_args()

    tf.logging.set_verbosity(tf.logging.INFO)

    experiment_fn = generate_experiment_fn(
        data_dir=args.data_dir,
        dataset_id=args.dataset_id,
        num_epochs=args.num_epochs,
        learning_rate_min=args.lr_min,
        learning_rate_max=args.lr_max,
        learning_rate_step_size=args.lr_step_size,
        gradient_noise_scale=args.grad_noise)
    learn_runner.run(experiment_fn, args.job_dir)
def train_and_eval(job_dir=None):
    print("Begin training and evaluation")

    # If evaluating locally with no args passed, use a default job dir.
    if job_dir is None:
        job_dir = 'models/'
    # Ensure the path has a '/' at the end.
    if job_dir[-1] != '/':
        job_dir += '/'

    gcs_base = 'https://storage.googleapis.com/'  # No need to change.
    # Path to the folder with the data files. Choose among small_version,
    # medium_version, and large_version.
    # Note: large_version is 2.7GB and medium_version is 273MB.
    gcs_path = 'dataset-uploader/criteo-kaggle/small_version/'
    trainfile = 'train.csv'
    testfile = 'eval.csv'
    local_path = 'dataset_files'
    train_file = base.maybe_download(trainfile, local_path,
                                     gcs_base + gcs_path + trainfile)
    test_file = base.maybe_download(testfile, local_path,
                                    gcs_base + gcs_path + testfile)

    training_mode = 'learn_runner'
    train_steps = 1000
    test_steps = 100
    model_type = 'DEEP'

    model_dir = job_dir + 'model_' + model_type + '_' + str(int(time.time()))
    print("Saving model checkpoints to " + model_dir)
    export_dir = model_dir + '/exports'

    # Manually train and export the model.
    if training_mode == 'manual':
        # In this branch, editing below here is unlikely to be needed.
        m = build_estimator(model_type, model_dir)
        m.fit(input_fn=generate_input_fn(train_file), steps=train_steps)
        print('fit done')
        results = m.evaluate(input_fn=generate_input_fn(test_file),
                             steps=test_steps)
        print('evaluate done')
        print('Accuracy: %s' % results['accuracy'])
        export_folder = m.export_savedmodel(export_dir_base=export_dir,
                                            input_fn=serving_input_fn)
        print('Model exported to ' + export_dir)
    elif training_mode == 'learn_runner':
        # Use learn_runner.
        experiment_fn = generate_experiment(model_dir, train_file, test_file,
                                            model_type)
        learn_runner.run(experiment_fn, model_dir)
def main(_argv):
    """The entrypoint for the script."""
    # Parse YAML FLAGS
    FLAGS.hooks = _maybe_load_yaml(FLAGS.hooks)
    FLAGS.metrics = _maybe_load_yaml(FLAGS.metrics)
    FLAGS.model_params = _maybe_load_yaml(FLAGS.model_params)
    FLAGS.input_pipeline_train = _maybe_load_yaml(FLAGS.input_pipeline_train)
    FLAGS.input_pipeline_dev = _maybe_load_yaml(FLAGS.input_pipeline_dev)

    # Load flags from config file
    final_config = {}
    if FLAGS.config_paths:
        for config_path in FLAGS.config_paths.split(","):
            config_path = config_path.strip()
            if not config_path:
                continue
            config_path = os.path.abspath(config_path)
            tf.logging.info("Loading config from %s", config_path)
            with gfile.GFile(config_path) as config_file:
                config_flags = yaml.load(config_file)
                final_config = _deep_merge_dict(final_config, config_flags)

    tf.logging.info("Final Config:\n%s", yaml.dump(final_config))

    # Merge flags with config values
    for flag_key, flag_value in final_config.items():
        if hasattr(FLAGS, flag_key) and isinstance(getattr(FLAGS, flag_key), dict):
            merged_value = _deep_merge_dict(flag_value, getattr(FLAGS, flag_key))
            setattr(FLAGS, flag_key, merged_value)
        elif hasattr(FLAGS, flag_key):
            setattr(FLAGS, flag_key, flag_value)
        else:
            tf.logging.warning("Ignoring config flag: %s", flag_key)

    if FLAGS.save_checkpoints_secs is None \
            and FLAGS.save_checkpoints_steps is None:
        FLAGS.save_checkpoints_secs = 600
        tf.logging.info("Setting save_checkpoints_secs to %d",
                        FLAGS.save_checkpoints_secs)

    if not FLAGS.output_dir:
        FLAGS.output_dir = tempfile.mkdtemp()

    if not FLAGS.input_pipeline_train:
        raise ValueError("You must specify input_pipeline_train")
    if not FLAGS.input_pipeline_dev:
        raise ValueError("You must specify input_pipeline_dev")

    learn_runner.run(
        experiment_fn=create_experiment,
        output_dir=FLAGS.output_dir,
        schedule=FLAGS.schedule)
def main(_):
    if not FLAGS.vocab_size:
        FLAGS.vocab_size = len(open(FLAGS.vocab_file).readlines())
    if FLAGS.fast:
        FastTrain()
    elif FLAGS.train_records:
        if FLAGS.export_dir:
            tf.logging.warn(
                "Exporting savedmodels is not supported for the contrib "
                "experiment path; use --nofast")
        learn_runner.run(experiment_fn=Experiment,
                         output_dir=FLAGS.model_dir)
def main(argv=None):
    hparams = HParams(batch_size=128, hidden_units=[256], learning_rate=.001)
    output_dir = 'test'
    config = learn.RunConfig(save_checkpoints_secs=600,
                             model_dir=output_dir,
                             gpu_memory_fraction=1)
    learn_runner.run(experiment_fn=_experiment_fn,
                     run_config=config,
                     hparams=hparams)
def test_basic_run_config_uid_check(self):
    expected_run_config = run_config_lib.RunConfig(model_dir=_MODIR_DIR)

    def _experiment_fn(run_config, hparams):
        del run_config, hparams  # unused.
        # Explicitly use a new run_config.
        new_config = run_config_lib.RunConfig(model_dir=_MODIR_DIR + "/123")
        return TestExperiment(config=new_config)

    with self.assertRaisesRegexp(RuntimeError, _RUN_CONFIG_UID_CHECK_ERR_MSG):
        learn_runner.run(experiment_fn=_experiment_fn,
                         run_config=expected_run_config)
def main(args):
    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    # Print the job data as provided by the service.
    logging.info('Original job data: %s', env.get('job', {}))

    # First find out if there's a task value on the environment variable.
    # If there is none or it is empty, define a default one.
    task_data = env.get('task', {'type': 'master', 'index': 0})
    trial = task_data.get('trial')
    if trial is not None:
        args.output_path = os.path.join(args.output_path, trial)

    learn_runner.run(make_experiment_fn(args), args.output_path)
def test_fail_invalid_experiment_config_type(self):
    expected_run_config = run_config_lib.RunConfig(model_dir=_MODIR_DIR)

    def _experiment_fn(run_config, hparams):
        del run_config, hparams  # unused.
        # Explicitly use a new run_config without a `uid` method.
        new_config = core_run_config_lib.RunConfig(
            model_dir=_MODIR_DIR + "/123")
        return TestExperiment(config=new_config)

    with self.assertRaisesRegexp(RuntimeError,
                                 _MISSING_RUN_CONFIG_UID_ERR_MSG):
        learn_runner.run(experiment_fn=_experiment_fn,
                         run_config=expected_run_config)
def main(argv):
    parser = argparse.ArgumentParser()
    # You must accept a --job-dir argument when running on Cloud ML Engine;
    # it specifies where checkpoints should be saved. You can define
    # additional user arguments, which have to be specified after an empty
    # `--` separator on the command line:
    # gcloud ml-engine jobs submit training jobXXX --job-dir=... --ml-engine-args -- --user-args
    parser.add_argument(
        '--job-dir',
        default="checkpoints",
        help='GCS or local path where to store training checkpoints')
    args = parser.parse_args()
    arguments = args.__dict__
    # Hard-coded here: training data will be downloaded to the folder 'data'.
    arguments['data'] = "data"

    # learn_runner needs an experiment function with a single parameter: the
    # output directory. Here we pass additional command-line arguments
    # through a closure.
    output_dir = arguments.pop('job_dir')
    experiment_fn = lambda output_dir: experiment_fn_with_params(output_dir,
                                                                 **arguments)
    learn_runner.run(experiment_fn, output_dir)
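An equivalent and arguably cleaner way to bind the extra arguments is functools.partial, which avoids shadowing output_dir inside the lambda (a sketch under the same assumptions as the example above):

import functools

# Bind **arguments ahead of time; learn_runner.run then supplies
# output_dir as the single positional parameter.
experiment_fn = functools.partial(experiment_fn_with_params, **arguments)
learn_runner.run(experiment_fn, output_dir)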
def test_run_with_explicit_local_run(self):
    run_config = run_config_lib.RunConfig(model_dir=_MODIR_DIR)
    self.assertEqual(
        "local_run-" + _MODIR_DIR,
        learn_runner.run(build_experiment_for_run_config,
                         run_config=run_config,
                         schedule="local_run"))
def test_schedule_from_tf_config_runs_train_on_worker(self):
    os.environ["TF_CONFIG"] = json.dumps({
        "cluster": build_distributed_cluster_spec(),
        "task": {"type": tf.contrib.learn.TaskType.WORKER}
    })
    # The RunConfig constructor will set job_name from TF_CONFIG.
    config = run_config.RunConfig()
    self.assertEqual(
        "train",
        learn_runner.run(lambda output_dir: TestExperiment(config=config),
                         output_dir="/tmp"))
def test_no_schedule_and_non_distributed_runs_train_and_evaluate(self):
    config = run_config.RunConfig(
        cluster_spec=build_non_distributed_cluster_spec())
    self.assertEqual(
        "train_and_evaluate",
        learn_runner.run(lambda output_dir: TestExperiment(config=config),
                         output_dir="/tmp"))
def test_schedule_from_tf_config_runs_serve_on_ps(self):
    tf_config = {
        "cluster": build_distributed_cluster_spec(),
        "task": {"type": tf.contrib.learn.TaskType.PS}
    }
    with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
        config = run_config.RunConfig()
        self.assertEqual(
            "run_std_server",
            learn_runner.run(lambda output_dir: TestExperiment(config=config),
                             output_dir="/tmp"))
def test_schedule_from_config_runs_train_on_worker(self):
    config = run_config.RunConfig(
        job_name="worker",
        cluster_spec=build_distributed_cluster_spec())
    self.assertEqual(
        "train",
        learn_runner.run(lambda output_dir: TestExperiment(config=config),
                         output_dir="/tmp"))
def test_run_with_custom_schedule(self):
    run_config = run_config_lib.RunConfig(model_dir=_MODIR_DIR)
    self.assertEqual(
        "simple_task, default=None.",
        learn_runner.run(build_experiment_for_run_config,
                         run_config=run_config,
                         schedule="simple_task"))
def test_schedule_from_tf_config(self):
    os.environ["TF_CONFIG"] = json.dumps({"task": {"type": "worker"}})
    # The RunConfig constructor will set job_name from TF_CONFIG.
    config = run_config.RunConfig()
    self.assertEqual(
        "train",
        learn_runner.run(lambda output_dir: TestExperiment(config=config),
                         output_dir="/tmp"))
def main():
    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    # First find out if there's a task value on the environment variable.
    # If there is none or it is empty, define a default one.
    task_data = env.get('task') or {'type': 'master', 'index': 0}

    args = parse_arguments()

    trial = task_data.get('trial')
    if trial is not None:
        output_dir = os.path.join(args.output_path, trial)
    else:
        output_dir = args.output_path

    learn_runner.run(
        experiment_fn=ExperimentFn(args),
        output_dir=output_dir)
def main(argv=None):
    """Runs a TensorFlow model on the Reddit dataset."""
    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    # First find out if there's a task value on the environment variable.
    # If there is none or it is empty, define a default one.
    task_data = env.get('task') or {'type': 'master', 'index': 0}

    argv = sys.argv if argv is None else argv
    args = create_parser().parse_args(args=argv[1:])

    trial = task_data.get('trial')
    if trial is not None:
        output_dir = os.path.join(args.output_path, trial)
    else:
        output_dir = args.output_path

    learn_runner.run(experiment_fn=get_experiment_fn(args),
                     output_dir=output_dir)
def test_no_schedule_and_non_distributed_runs_train_and_evaluate(self):
    tf_config = {"cluster": build_non_distributed_cluster_spec()}
    with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
        config = run_config.RunConfig()
        self.assertEqual(
            "train_and_evaluate",
            learn_runner.run(lambda output_dir: TestExperiment(config=config),
                             output_dir="/tmp"))
def test_no_schedule_and_non_distributed_runs_train_and_evaluate(self):
    tf_config = {"cluster": build_non_distributed_cluster_spec()}
    with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
        config = run_config_lib.RunConfig()
        self.assertEqual(
            "train_and_evaluate-" + _MODIR_DIR,
            learn_runner.run(
                build_experiment_fn_for_output_dir(config),
                output_dir=_MODIR_DIR))
def train_and_eval(job_dir=None, model_type='WIDE_AND_DEEP'):
    print("Begin training and evaluation")

    # If evaluating locally with no args passed, use a default job dir.
    if job_dir is None:
        job_dir = 'models/'
    # Ensure the path has a '/' at the end.
    if job_dir[-1] != '/':
        job_dir += '/'

    gcs_base = 'https://storage.googleapis.com/'
    gcs_path = 'cloudml-public/census/data/'
    trainfile = 'adult.data.csv'
    testfile = 'adult.test.csv'
    local_path = 'dataset_files'
    train_file = base.maybe_download(
        trainfile, local_path, gcs_base + gcs_path + trainfile)
    test_file = base.maybe_download(
        testfile, local_path, gcs_base + gcs_path + testfile)

    training_mode = 'learn_runner'
    train_steps = 1000
    test_steps = 100

    model_dir = job_dir + 'model_' + model_type + '_' + str(int(time.time()))
    print("Saving model checkpoints to " + model_dir)
    export_dir = model_dir + '/exports'

    # Manually train and export the model.
    if training_mode == 'manual':
        # In this branch, editing below here is unlikely to be needed.
        m = build_estimator(model_type, model_dir)
        m.fit(input_fn=generate_input_fn(train_file), steps=train_steps)
        print('fit done')
        results = m.evaluate(input_fn=generate_input_fn(test_file),
                             steps=test_steps)
        print('evaluate done')
        print('Accuracy: %s' % results['accuracy'])
        export_folder = m.export_savedmodel(
            export_dir_base=export_dir,
            input_fn=serving_input_fn)
        print('Model exported to ' + export_dir)
    elif training_mode == 'learn_runner':
        # Use learn_runner.
        experiment_fn = generate_experiment(
            model_dir, train_file, test_file, model_type)
        metrics, output_folder = learn_runner.run(experiment_fn, model_dir)
        print('Accuracy: {}'.format(metrics['accuracy']))
        print('Model exported to {}'.format(output_folder))
def test_schedule_from_config_runs_train_and_evaluate_on_master(self):
    config = run_config.RunConfig(
        job_name="master",
        cluster_spec=build_distributed_cluster_spec(),
        task=0,
        is_chief=True)
    self.assertEqual(
        "train_and_evaluate",
        learn_runner.run(lambda output_dir: TestExperiment(config=config),
                         output_dir="/tmp"))
def train_and_evaluate(args):
    train_steps = int(0.5 + (1.0 * args['num_epochs'] * args['nusers']) /
                      args['batch_size'])
    steps_in_epoch = int(0.5 + args['nusers'] / args['batch_size'])
    print('Will train for {} steps, evaluating once every {} steps'.format(
        train_steps, steps_in_epoch))

    def experiment_fn(output_dir):
        return tf.contrib.learn.Experiment(
            tf.contrib.factorization.WALSMatrixFactorization(
                num_rows=args['nusers'],
                num_cols=args['nitems'],
                embedding_dimension=args['n_embeds'],
                model_dir=args['output_dir']),
            train_input_fn=read_dataset(tf.estimator.ModeKeys.TRAIN, args),
            eval_input_fn=read_dataset(tf.estimator.ModeKeys.EVAL, args),
            train_steps=train_steps,
            eval_steps=1,
            min_eval_frequency=steps_in_epoch,
            export_strategies=tf.contrib.learn.utils.saved_model_export_utils
            .make_export_strategy(
                serving_input_fn=create_serving_input_fn(args)))

    from tensorflow.contrib.learn.python.learn import learn_runner
    learn_runner.run(experiment_fn, args['output_dir'])

    batch_predict(args)
def test_schedule_from_tf_config_runs_train_and_evaluate_on_master(self):
    tf_config = {
        "cluster": build_distributed_cluster_spec(),
        "task": {"type": run_config_lib.TaskType.MASTER}
    }
    with patch.dict("os.environ", {"TF_CONFIG": json.dumps(tf_config)}):
        config = run_config.RunConfig()
        self.assertEqual(
            "train_and_evaluate",
            learn_runner.run(lambda output_dir: TestExperiment(config=config),
                             output_dir="/tmp"))