def test_config_benchmark_file_logger(self):
  # Set the benchmark_log_dir first since the benchmark_logger_type will need
  # the value to be set when it does the validation.
  with flagsaver.flagsaver(benchmark_log_dir='/tmp'):
    with flagsaver.flagsaver(benchmark_logger_type='BenchmarkFileLogger'):
      logger.config_benchmark_logger()
      self.assertIsInstance(logger.get_benchmark_logger(),
                            logger.BenchmarkFileLogger)
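# A minimal, self-contained sketch (hypothetical, not the real flag definitions
# behind logger.config_benchmark_logger) of why the nested flagsaver calls above
# set benchmark_log_dir before benchmark_logger_type: a validator registered on
# the logger-type flag reads the log-dir flag, so the directory must already
# hold a value when the type flag is assigned and validated.
from absl import flags

SKETCH_FLAGS = flags.FlagValues()  # private namespace keeps the sketch isolated
flags.DEFINE_string('benchmark_log_dir', None,
                    'Directory for benchmark logs.', flag_values=SKETCH_FLAGS)
flags.DEFINE_string('benchmark_logger_type', 'BaseBenchmarkLogger',
                    'Type of benchmark logger.', flag_values=SKETCH_FLAGS)

def _logger_type_is_valid(logger_type):
  # A file logger is only usable once a log directory has been supplied.
  if logger_type == 'BenchmarkFileLogger':
    return SKETCH_FLAGS['benchmark_log_dir'].value is not None
  return True

flags.register_validator(
    'benchmark_logger_type', _logger_type_is_valid,
    message='--benchmark_log_dir must be set for BenchmarkFileLogger.',
    flag_values=SKETCH_FLAGS)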
def train(self):
  self.createDatasets()
  if self.model is None:
    self._getModelEstimator()
  estimator = self.model.getEstimator()

  run_params = {
      'batch_size': self.flags.batch_size,
      'train_epochs': self.flags.train_epochs,
      'model_type': 'deep',
  }

  benchmark_logger = logger.config_benchmark_logger(self.flags)
  benchmark_logger.log_run_info('deep', 'Readmission Patient', run_params)

  # Train and evaluate the model every `flags.epochs_between_evals` epochs.
  for n in range(self.flags.train_epochs // self.flags.epochs_between_evals):
    estimator.train(input_fn=self._input_fn_train)

    # Display evaluation metrics
    tf.logging.info('Results at epoch %d / %d',
                    (n + 1) * self.flags.epochs_between_evals,
                    self.flags.train_epochs)
    tf.logging.info('-' * 60)

    results = estimator.predict(input_fn=self._input_fn_analyze)
    encodings = [p['encoding'] for p in results]
    basic_encodings = np.array(encodings)

    filename_basic_encodings = (self.flags.model_dir + '/basic_encodings_' +
                                str(n).zfill(5) + '.npy')
    np.save(filename_basic_encodings, basic_encodings)
def train_boosted_trees(flags_obj):
  """Train boosted_trees estimator on HIGGS data.

  Args:
    flags_obj: An object containing parsed flag values.
  """
  # Clean up the model directory if present.
  if tf.gfile.Exists(flags_obj.model_dir):
    tf.gfile.DeleteRecursively(flags_obj.model_dir)

  tf.logging.info("## Data loading...")
  train_data, eval_data = read_higgs_data(
      flags_obj.data_dir, flags_obj.train_start, flags_obj.train_count,
      flags_obj.eval_start, flags_obj.eval_count)
  tf.logging.info("## Data loaded; train: {}{}, eval: {}{}".format(
      train_data.dtype, train_data.shape, eval_data.dtype, eval_data.shape))

  # Data consists of one label column followed by 28 feature columns.
  train_input_fn, feature_names, feature_columns = make_inputs_from_np_arrays(
      features_np=train_data[:, 1:], label_np=train_data[:, 0:1])
  eval_input_fn = make_eval_inputs_from_np_arrays(
      features_np=eval_data[:, 1:], label_np=eval_data[:, 0:1])
  tf.logging.info("## Features prepared. Training starts...")

  # Create benchmark logger to log info about the training and metric values
  run_params = {
      "train_start": flags_obj.train_start,
      "train_count": flags_obj.train_count,
      "eval_start": flags_obj.eval_start,
      "eval_count": flags_obj.eval_count,
      "n_trees": flags_obj.n_trees,
      "max_depth": flags_obj.max_depth,
  }
  benchmark_logger = logger.config_benchmark_logger(flags_obj)
  benchmark_logger.log_run_info(
      model_name="boosted_trees",
      dataset_name="higgs",
      run_params=run_params)

  # Though BoostedTreesClassifier is under tf.estimator, faster in-memory
  # training is yet provided as a contrib library.
  classifier = tf.contrib.estimator.boosted_trees_classifier_train_in_memory(
      train_input_fn,
      feature_columns,
      model_dir=flags_obj.model_dir or None,
      n_trees=flags_obj.n_trees,
      max_depth=flags_obj.max_depth,
      learning_rate=flags_obj.learning_rate)

  # Evaluation.
  eval_results = classifier.evaluate(eval_input_fn)

  # Benchmark the evaluation results
  benchmark_logger.log_evaluation_result(eval_results)

  # Exporting the savedmodel with csv parsing.
  if flags_obj.export_dir is not None:
    classifier.export_savedmodel(
        flags_obj.export_dir,
        _make_csv_serving_input_receiver_fn(
            column_names=feature_names,
            # columns are all floats.
            column_defaults=[[0.0]] * len(feature_names)))
def run_wide_deep(flags_obj):
  """Run Wide-Deep training and eval loop.

  Args:
    flags_obj: An object containing parsed flag values.
  """
  # Clean up the model directory if present
  shutil.rmtree(flags_obj.model_dir, ignore_errors=True)
  model = build_estimator(flags_obj.model_dir, flags_obj.model_type)

  train_file = os.path.join(flags_obj.data_dir, 'adult.data')
  test_file = os.path.join(flags_obj.data_dir, 'adult.test')

  # Train and evaluate the model every `flags.epochs_between_evals` epochs.
  def train_input_fn():
    return input_fn(
        train_file, flags_obj.epochs_between_evals, True, flags_obj.batch_size)

  def eval_input_fn():
    return input_fn(test_file, 1, False, flags_obj.batch_size)

  run_params = {
      'batch_size': flags_obj.batch_size,
      'train_epochs': flags_obj.train_epochs,
      'model_type': flags_obj.model_type,
  }

  benchmark_logger = logger.config_benchmark_logger(flags_obj)
  benchmark_logger.log_run_info('wide_deep', 'Census Income', run_params)

  loss_prefix = LOSS_PREFIX.get(flags_obj.model_type, '')
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks, batch_size=flags_obj.batch_size,
      tensors_to_log={'average_loss': loss_prefix + 'head/truediv',
                      'loss': loss_prefix + 'head/weighted_loss/Sum'})

  # Train and evaluate the model every `flags.epochs_between_evals` epochs.
  for n in range(flags_obj.train_epochs // flags_obj.epochs_between_evals):
    model.train(input_fn=train_input_fn, hooks=train_hooks)
    results = model.evaluate(input_fn=eval_input_fn)

    # Display evaluation metrics
    tf.logging.info('Results at epoch %d / %d',
                    (n + 1) * flags_obj.epochs_between_evals,
                    flags_obj.train_epochs)
    tf.logging.info('-' * 60)

    for key in sorted(results):
      tf.logging.info('%s: %s' % (key, results[key]))

    benchmark_logger.log_evaluation_result(results)

    if model_helpers.past_stop_threshold(
        flags_obj.stop_threshold, results['accuracy']):
      break

  # Export the model
  if flags_obj.export_dir is not None:
    export_model(model, flags_obj.model_type, flags_obj.export_dir)
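# A sketch (assumed semantics, mirroring how model_helpers.past_stop_threshold
# is used in the loop above rather than quoting its implementation) of the
# early-stop check shared by these training loops: a None threshold disables
# the check, and training halts once the metric reaches the threshold.
def past_stop_threshold_sketch(stop_threshold, eval_metric):
  """Return True when eval_metric has reached stop_threshold, if one is set."""
  if stop_threshold is None:
    return False
  return eval_metric >= stop_threshold

assert past_stop_threshold_sketch(None, 0.99) is False
assert past_stop_threshold_sketch(0.95, 0.96) is True
assert past_stop_threshold_sketch(0.95, 0.90) is False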
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
  # Determine training schedule based on flags.
  if flags_obj.train_steps is not None:
    train_eval_iterations = (
        flags_obj.train_steps // flags_obj.steps_between_evals)
    single_iteration_train_steps = flags_obj.steps_between_evals
    single_iteration_train_epochs = None
  else:
    train_epochs = flags_obj.train_epochs or DEFAULT_TRAIN_EPOCHS
    train_eval_iterations = train_epochs // flags_obj.epochs_between_evals
    single_iteration_train_steps = None
    single_iteration_train_epochs = flags_obj.epochs_between_evals

  # Add flag-defined parameters to params object
  params = PARAMS_MAP[flags_obj.param_set]
  params.data_dir = flags_obj.data_dir
  params.num_parallel_calls = flags_obj.num_parallel_calls
  params.epochs_between_evals = flags_obj.epochs_between_evals
  params.repeat_dataset = single_iteration_train_epochs
  params.batch_size = flags_obj.batch_size or params.batch_size

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
      batch_size=params.batch_size)   # for ExamplesPerSecondHook
  benchmark_logger = logger.config_benchmark_logger(flags_obj)
  benchmark_logger.log_run_info(
      model_name="transformer",
      dataset_name="wmt_translate_ende",
      run_params=params.__dict__)

  # Train and evaluate transformer model
  estimator = tf.estimator.Estimator(
      model_fn=model_fn, model_dir=flags_obj.model_dir, params=params)
  train_schedule(
      estimator=estimator,
      # Training arguments
      train_eval_iterations=train_eval_iterations,
      single_iteration_train_steps=single_iteration_train_steps,
      single_iteration_train_epochs=single_iteration_train_epochs,
      train_hooks=train_hooks,
      benchmark_logger=benchmark_logger,
      # BLEU calculation arguments
      bleu_source=flags_obj.bleu_source,
      bleu_ref=flags_obj.bleu_ref,
      bleu_threshold=flags_obj.stop_threshold,
      vocab_file_path=os.path.join(flags_obj.data_dir, flags_obj.vocab_file))
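# Worked example of the schedule branch at the top of run_transformer: a
# step-based run divides train_steps into fixed-size chunks, while an
# epoch-based run falls back to train_epochs. Pure-Python sketch, no TF
# required; DEFAULT_TRAIN_EPOCHS_SKETCH stands in for the module constant.
DEFAULT_TRAIN_EPOCHS_SKETCH = 10

def transformer_eval_iterations(train_steps, steps_between_evals,
                                train_epochs, epochs_between_evals):
  if train_steps is not None:
    return train_steps // steps_between_evals  # e.g. 1000 // 250 == 4
  train_epochs = train_epochs or DEFAULT_TRAIN_EPOCHS_SKETCH
  return train_epochs // epochs_between_evals

assert transformer_eval_iterations(1000, 250, None, None) == 4
assert transformer_eval_iterations(None, None, None, 2) == 5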
def train(self):
  self.createDatasets()
  if self.model is None:
    self._getModelEstimator()
  estimator = self.model.getEstimator()

  run_params = {
      'batch_size': self.flags.batch_size,
      'train_epochs': self.flags.train_epochs,
      'model_type': 'deep',
  }

  benchmark_logger = logger.config_benchmark_logger(self.flags)
  benchmark_logger.log_run_info('deep', 'Readmission Patient', run_params)

  # Train and evaluate the model every `flags.epochs_between_evals` epochs.
  for n in range(self.flags.train_epochs // self.flags.epochs_between_evals):
    # Break from the loop if differential privacy is enabled and the privacy
    # budget is exceeded.
    if self.flags.enable_dp and self.model.is_privacy_budget_exceeded():
      break

    print('n: ' + str(n))
    estimator.train(input_fn=self._input_fn_train)
    results = estimator.evaluate(input_fn=self._input_fn_eval)

    # Display evaluation metrics
    tf.logging.info('Results at epoch %d / %d',
                    (n + 1) * self.flags.epochs_between_evals,
                    self.flags.train_epochs)
    tf.logging.info('-' * 60)

    for key in sorted(results):
      tf.logging.info('%s: %s' % (key, results[key]))

    benchmark_logger.log_evaluation_result(results)

    if model_helpers.past_stop_threshold(self.flags.stop_threshold,
                                         results['accuracy']):
      break

    # Export the model
    print('export the model?')
    if n % 10 == 0 and self.flags.export_dir is not None:
      self.export_model()
def get_logging_metric_hook(benchmark_log_dir=None,
                            tensors_to_log=None,
                            every_n_secs=600,
                            **kwargs):  # pylint: disable=unused-argument
  """Function to get LoggingMetricHook.

  Args:
    benchmark_log_dir: `string`, directory path to save the metric log.
    tensors_to_log: List of tensor names or dictionary mapping labels to tensor
      names. If not set, log _TENSORS_TO_LOG by default.
    every_n_secs: `int`, the frequency for logging the metric. Default to every
      10 mins.

  Returns:
    Returns a LoggingMetricHook that logs the given tensors through the
    configured benchmark logger.
  """
  logger.config_benchmark_logger(benchmark_log_dir)
  if tensors_to_log is None:
    tensors_to_log = _TENSORS_TO_LOG
  return metric_hook.LoggingMetricHook(
      tensors=tensors_to_log,
      metric_logger=logger.get_benchmark_logger(),
      every_n_secs=every_n_secs)
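# A hedged wiring sketch for the factory above: hooks built this way are passed
# to Estimator.train alongside any other SessionRunHooks. The tensor label and
# name below are illustrative placeholders; only get_logging_metric_hook itself
# comes from the code above.
def build_benchmark_hooks(benchmark_log_dir):
  return [
      get_logging_metric_hook(
          benchmark_log_dir=benchmark_log_dir,
          tensors_to_log={'train_loss': 'loss'},  # label -> graph tensor name
          every_n_secs=60),                       # log once per minute
  ]
# e.g. estimator.train(input_fn=train_input_fn,
#                      hooks=build_benchmark_hooks('/tmp/benchmark'))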
def resnet_main(flags_obj, model_function, input_function, dataset_name,
                shape=None):
  """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This
      is used for logging purpose.
    shape: list of ints representing the shape of the images used for
      training. This is only used if flags_obj.export_dir is passed.
  """
  # Using the Winograd non-fused algorithms provides a small performance boost.
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  # Create session config based on values of inter_op_parallelism_threads and
  # intra_op_parallelism_threads. Note that we default to having
  # allow_soft_placement = True, which is required for multi-GPU and not
  # harmful for other modes.
  session_config = tf.ConfigProto(
      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
      allow_soft_placement=True)

  if flags_core.get_num_gpus(flags_obj) == 0:
    distribution = tf.contrib.distribute.OneDeviceStrategy('device:CPU:0')
  elif flags_core.get_num_gpus(flags_obj) == 1:
    distribution = tf.contrib.distribute.OneDeviceStrategy('device:GPU:0')
  else:
    distribution = tf.contrib.distribute.MirroredStrategy(
        num_gpus=flags_core.get_num_gpus(flags_obj))

  run_config = tf.estimator.RunConfig(train_distribute=distribution,
                                      session_config=session_config)

  classifier = tf.estimator.Estimator(
      model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config,
      params={
          'resnet_size': int(flags_obj.resnet_size),
          'data_format': flags_obj.data_format,
          'batch_size': flags_obj.batch_size,
          'resnet_version': int(flags_obj.resnet_version),
          'loss_scale': flags_core.get_loss_scale(flags_obj),
          'dtype': flags_core.get_tf_dtype(flags_obj)
      })

  run_params = {
      'batch_size': flags_obj.batch_size,
      'dtype': flags_core.get_tf_dtype(flags_obj),
      'resnet_size': flags_obj.resnet_size,
      'resnet_version': flags_obj.resnet_version,
      'synthetic_data': flags_obj.use_synthetic_data,
      'train_epochs': flags_obj.train_epochs,
  }
  benchmark_logger = logger.config_benchmark_logger(flags_obj)
  benchmark_logger.log_run_info('resnet', dataset_name, run_params)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks, batch_size=flags_obj.batch_size)

  def input_fn_train(num_epochs):
    return input_function(
        mode="train",
        data_dir=flags_obj.data_dir,
        batch_size=distribution_utils.per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=num_epochs,
        num_gpus=flags_core.get_num_gpus(flags_obj),
        dtype=flags_core.get_tf_dtype(flags_obj))

  def input_fn_eval():
    return input_function(
        mode="validate",
        data_dir=flags_obj.data_dir,
        batch_size=distribution_utils.per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=1)

  def input_fn_pred():
    return input_function(
        mode="predict",
        data_dir=flags_obj.data_dir,
        batch_size=distribution_utils.per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=1)

  if flags_obj.predict_only:
    result = classifier.predict(input_fn=input_fn_pred)
    predicted_values = np.stack([r["predictions"] for r in result], axis=0)
    # print(predicted_values)
    df = pd.DataFrame(predicted_values)
    df.to_csv("validate_result.txt")
    return

  # train
  if flags_obj.eval_only or not flags_obj.train_epochs:
    # If --eval_only is set, perform a single loop with zero train epochs.
    schedule, n_loops = [0], 1
  else:
    # Compute the number of times to loop while training. All but the last
    # pass will train for `epochs_between_evals` epochs, while the last will
    # train for the number needed to reach `training_epochs`. For instance if
    #   train_epochs = 25 and epochs_between_evals = 10
    # schedule will be set to [10, 10, 5]. That is to say, the loop will:
    #   Train for 10 epochs and then evaluate.
    #   Train for another 10 epochs and then evaluate.
    #   Train for a final 5 epochs (to reach 25 epochs) and then evaluate.
    n_loops = math.ceil(flags_obj.train_epochs /
                        flags_obj.epochs_between_evals)
    schedule = [flags_obj.epochs_between_evals for _ in range(int(n_loops))]
    schedule[-1] = flags_obj.train_epochs - sum(schedule[:-1])  # over counting.

  for cycle_index, num_train_epochs in enumerate(schedule):
    tf.logging.info('Starting cycle: %d/%d', cycle_index, int(n_loops))

    if num_train_epochs:
      classifier.train(input_fn=lambda: input_fn_train(num_train_epochs),
                       hooks=train_hooks, max_steps=flags_obj.max_train_steps)

    tf.logging.info('Starting to evaluate.')

    # flags_obj.max_train_steps is generally associated with testing and
    # profiling. As a result it is frequently called with synthetic data,
    # which will iterate forever. Passing a fixed number of steps allows the
    # eval (which is generally unimportant in those circumstances) to
    # terminate. Note that eval will run for that many steps each loop,
    # regardless of the global_step count.
    eval_results = classifier.evaluate(input_fn=input_fn_eval, steps=100)
    benchmark_logger.log_evaluation_result(eval_results)

    if model_helpers.past_stop_threshold(flags_obj.stop_threshold,
                                         eval_results['mse']):
      break

  # save model for serving
  if flags_obj.export_dir is not None:
    # Exports a saved model for the given classifier.
    input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
        shape, batch_size=flags_obj.batch_size)
    classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn)
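# Worked example of the eval schedule computed in resnet_main above: the final
# cycle is trimmed so the cycles sum exactly to train_epochs, matching the
# [10, 10, 5] example in the comment (pure-Python sketch, no TF required).
import math

def eval_schedule(train_epochs, epochs_between_evals):
  n_loops = math.ceil(train_epochs / epochs_between_evals)
  schedule = [epochs_between_evals for _ in range(int(n_loops))]
  schedule[-1] = train_epochs - sum(schedule[:-1])  # correct the over-count
  return schedule

assert eval_schedule(25, 10) == [10, 10, 5]
assert eval_schedule(20, 10) == [10, 10]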
def test_config_base_benchmark_logger(self):
  with flagsaver.flagsaver(benchmark_logger_type="BaseBenchmarkLogger"):
    logger.config_benchmark_logger()
    self.assertIsInstance(logger.get_benchmark_logger(),
                          logger.BaseBenchmarkLogger)
def simpnet_main(flags, model_function, input_function, shape=None):
  """Shared main loop for SimpNet models.

  Args:
    flags: FLAGS object that contains the params for running. See
      SimpnetArgParser for created flags.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    shape: list of ints representing the shape of the images used for
      training. This is only used if flags.export_dir is passed.
  """
  # Using the Winograd non-fused algorithms provides a small performance boost.
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  if flags.multi_gpu:
    validate_batch_size_for_multi_gpu(flags.batch_size)

    # There are two steps required if using multi-GPU: (1) wrap the model_fn,
    # and (2) wrap the optimizer. The first happens here, and (2) happens
    # in the model_fn itself when the optimizer is defined.
    model_function = tf.contrib.estimator.replicate_model_fn(
        model_function, loss_reduction=tf.losses.Reduction.MEAN)

  # Create session config based on values of inter_op_parallelism_threads and
  # intra_op_parallelism_threads. Note that we default to having
  # allow_soft_placement = True, which is required for multi-GPU and not
  # harmful for other modes.
  session_config = tf.ConfigProto(
      inter_op_parallelism_threads=flags.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
      allow_soft_placement=True)

  # Set up a RunConfig to save checkpoint and set session config.
  run_config = tf.estimator.RunConfig().replace(
      save_checkpoints_secs=1e9, session_config=session_config)
  classifier = tf.estimator.Estimator(
      model_fn=model_function, model_dir=flags.model_dir, config=run_config,
      params={
          'data_format': flags.data_format,
          'batch_size': flags.batch_size,
          'multi_gpu': flags.multi_gpu,
          'loss_scale': flags.loss_scale,
          'dtype': flags.dtype
      })

  benchmark_logger = logger.config_benchmark_logger(flags.benchmark_log_dir)
  benchmark_logger.log_run_info('simpnet')

  train_hooks = hooks_helper.get_train_hooks(
      flags.hooks,
      batch_size=flags.batch_size,
      benchmark_log_dir=flags.benchmark_log_dir)

  def input_fn_train():
    return input_function(True, flags.data_dir, flags.batch_size,
                          flags.epochs_between_evals,
                          flags.num_parallel_calls, flags.multi_gpu)

  def input_fn_eval():
    return input_function(False, flags.data_dir, flags.batch_size, 1,
                          flags.num_parallel_calls, flags.multi_gpu)

  total_training_cycle = flags.train_epochs // flags.epochs_between_evals
  for cycle_index in range(total_training_cycle):
    tf.logging.info('Starting a training cycle: %d/%d',
                    cycle_index, total_training_cycle)

    classifier.train(input_fn=input_fn_train, hooks=train_hooks,
                     max_steps=flags.max_train_steps)

    tf.logging.info('Starting to evaluate.')

    # flags.max_train_steps is generally associated with testing and profiling.
    # As a result it is frequently called with synthetic data, which will
    # iterate forever. Passing steps=flags.max_train_steps allows the eval
    # (which is generally unimportant in those circumstances) to terminate.
    # Note that eval will run for max_train_steps each loop, regardless of the
    # global_step count.
    eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                       steps=flags.max_train_steps)
    benchmark_logger.log_evaluation_result(eval_results)

    if model_helpers.past_stop_threshold(flags.stop_threshold,
                                         eval_results['accuracy']):
      break

  if flags.export_dir is not None:
    warn_on_multi_gpu_export(flags.multi_gpu)

    # Exports a saved model for the given classifier.
    input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
        shape, batch_size=flags.batch_size)
    classifier.export_savedmodel(flags.export_dir, input_receiver_fn)
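# A minimal sketch of the batch-size arithmetic behind
# validate_batch_size_for_multi_gpu above and the per_device_batch_size helper
# used by the other main loops in this section (assumed behavior, not the
# official utility itself): the global batch must divide evenly across GPUs,
# and each device receives an equal share.
def per_device_batch_size_sketch(batch_size, num_gpus):
  if num_gpus <= 1:
    return batch_size
  remainder = batch_size % num_gpus
  if remainder:
    raise ValueError(
        'Batch size {} cannot be split evenly across {} GPUs; try {} or '
        '{}.'.format(batch_size, num_gpus, batch_size - remainder,
                     batch_size + num_gpus - remainder))
  return batch_size // num_gpus

assert per_device_batch_size_sketch(128, 4) == 32
assert per_device_batch_size_sketch(64, 1) == 64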
def resnet_main(flags_obj, model_function, input_function, dataset_name,
                shape=None):
  """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This
      is used for logging purpose.
    shape: list of ints representing the shape of the images used for
      training. This is only used if flags_obj.export_dir is passed.
  """
  # Using the Winograd non-fused algorithms provides a small performance boost.
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  # Create session config based on values of inter_op_parallelism_threads and
  # intra_op_parallelism_threads. Note that we default to having
  # allow_soft_placement = True, which is required for multi-GPU and not
  # harmful for other modes.
  session_config = tf.ConfigProto(
      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
      allow_soft_placement=True)

  if flags_core.get_num_gpus(flags_obj) == 0:
    distribution = tf.contrib.distribute.OneDeviceStrategy('device:CPU:0')
  elif flags_core.get_num_gpus(flags_obj) == 1:
    distribution = tf.contrib.distribute.OneDeviceStrategy('device:GPU:0')
  else:
    distribution = tf.contrib.distribute.MirroredStrategy(
        num_gpus=flags_core.get_num_gpus(flags_obj))

  run_config = tf.estimator.RunConfig(train_distribute=distribution,
                                      session_config=session_config)

  classifier = tf.estimator.Estimator(
      model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config,
      params={
          'resnet_size': int(flags_obj.resnet_size),
          'data_format': flags_obj.data_format,
          'batch_size': flags_obj.batch_size,
          'resnet_version': int(flags_obj.resnet_version),
          'loss_scale': flags_core.get_loss_scale(flags_obj),
          'dtype': flags_core.get_tf_dtype(flags_obj)
      })

  run_params = {
      'batch_size': flags_obj.batch_size,
      'dtype': flags_core.get_tf_dtype(flags_obj),
      'resnet_size': flags_obj.resnet_size,
      'resnet_version': flags_obj.resnet_version,
      'synthetic_data': flags_obj.use_synthetic_data,
      'train_epochs': flags_obj.train_epochs,
  }
  benchmark_logger = logger.config_benchmark_logger(flags_obj)
  benchmark_logger.log_run_info('resnet', dataset_name, run_params)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks, batch_size=flags_obj.batch_size)

  def input_fn_train():
    return input_function(
        is_training=True,
        data_dir=flags_obj.data_dir,
        batch_size=per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=flags_obj.epochs_between_evals)

  def input_fn_eval():
    return input_function(
        is_training=False,
        data_dir=flags_obj.data_dir,
        batch_size=per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=1)

  total_training_cycle = (flags_obj.train_epochs //
                          flags_obj.epochs_between_evals)
  for cycle_index in range(total_training_cycle):
    tf.logging.info('Starting a training cycle: %d/%d',
                    cycle_index, total_training_cycle)

    classifier.train(input_fn=input_fn_train, hooks=train_hooks,
                     max_steps=flags_obj.max_train_steps)

    tf.logging.info('Starting to evaluate.')

    # flags_obj.max_train_steps is generally associated with testing and
    # profiling. As a result it is frequently called with synthetic data,
    # which will iterate forever. Passing steps=flags_obj.max_train_steps
    # allows the eval (which is generally unimportant in those circumstances)
    # to terminate. Note that eval will run for max_train_steps each loop,
    # regardless of the global_step count.
    eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                       steps=flags_obj.max_train_steps)
    benchmark_logger.log_evaluation_result(eval_results)

    if model_helpers.past_stop_threshold(
        flags_obj.stop_threshold, eval_results['accuracy']):
      break

  if flags_obj.export_dir is not None:
    # Exports a saved model for the given classifier.
    input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
        shape, batch_size=flags_obj.batch_size)
    classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn)
def test_config_benchmark_bigquery_logger(self, mock_bigquery_client):
  with flagsaver.flagsaver(benchmark_logger_type='BenchmarkBigQueryLogger'):
    logger.config_benchmark_logger()
    self.assertIsInstance(logger.get_benchmark_logger(),
                          logger.BenchmarkBigQueryLogger)
def test_config_base_benchmark_logger(self):
  logger.config_benchmark_logger("")
  self.assertIsInstance(logger.get_benchmark_logger(),
                        logger.BaseBenchmarkLogger)
def resnet_main(flags_obj, model_function, input_function, dataset_name,
                shape=None):
  """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This
      is used for logging purpose.
    shape: list of ints representing the shape of the images used for
      training. This is only used if flags_obj.export_dir is passed.
  """
  # Using the Winograd non-fused algorithms provides a small performance boost.
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  # Create session config based on values of inter_op_parallelism_threads and
  # intra_op_parallelism_threads. Note that we default to having
  # allow_soft_placement = True, which is required for multi-GPU and not
  # harmful for other modes.
  session_config = tf.ConfigProto(
      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
      allow_soft_placement=True)

  if flags_core.get_num_gpus(flags_obj) == 0:
    distribution = tf.contrib.distribute.OneDeviceStrategy('device:CPU:0')
  elif flags_core.get_num_gpus(flags_obj) == 1:
    distribution = tf.contrib.distribute.OneDeviceStrategy('device:GPU:0')
  else:
    distribution = tf.contrib.distribute.MirroredStrategy(
        num_gpus=flags_core.get_num_gpus(flags_obj))

  run_config = tf.estimator.RunConfig(train_distribute=distribution,
                                      session_config=session_config)

  classifier = tf.estimator.Estimator(
      model_fn=model_function, model_dir=flags_obj.model_dir, config=run_config,
      params={
          'resnet_size': int(flags_obj.resnet_size),
          'data_format': flags_obj.data_format,
          'batch_size': flags_obj.batch_size,
          'resnet_version': int(flags_obj.resnet_version),
          'loss_scale': flags_core.get_loss_scale(flags_obj),
          'dtype': flags_core.get_tf_dtype(flags_obj)
      })

  run_params = {
      'batch_size': flags_obj.batch_size,
      'dtype': flags_core.get_tf_dtype(flags_obj),
      'resnet_size': flags_obj.resnet_size,
      'resnet_version': flags_obj.resnet_version,
      'synthetic_data': flags_obj.use_synthetic_data,
      'train_epochs': flags_obj.train_epochs,
  }
  benchmark_logger = logger.config_benchmark_logger(
      flags_obj.benchmark_log_dir)
  benchmark_logger.log_run_info('resnet', dataset_name, run_params)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      batch_size=flags_obj.batch_size,
      benchmark_log_dir=flags_obj.benchmark_log_dir)

  def input_fn_train():
    return input_function(
        is_training=True,
        data_dir=flags_obj.data_dir,
        batch_size=per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=flags_obj.epochs_between_evals)

  def input_fn_eval():
    return input_function(
        is_training=False,
        data_dir=flags_obj.data_dir,
        batch_size=per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=1)

  total_training_cycle = (flags_obj.train_epochs //
                          flags_obj.epochs_between_evals)
  for cycle_index in range(total_training_cycle):
    tf.logging.info('Starting a training cycle: %d/%d',
                    cycle_index, total_training_cycle)

    classifier.train(input_fn=input_fn_train, hooks=train_hooks,
                     max_steps=flags_obj.max_train_steps)

    tf.logging.info('Starting to evaluate.')

    # flags_obj.max_train_steps is generally associated with testing and
    # profiling. As a result it is frequently called with synthetic data,
    # which will iterate forever. Passing steps=flags_obj.max_train_steps
    # allows the eval (which is generally unimportant in those circumstances)
    # to terminate. Note that eval will run for max_train_steps each loop,
    # regardless of the global_step count.
    eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                       steps=flags_obj.max_train_steps)
    benchmark_logger.log_evaluation_result(eval_results)

    if model_helpers.past_stop_threshold(flags_obj.stop_threshold,
                                         eval_results['accuracy']):
      break

  if flags_obj.export_dir is not None:
    # Exports a saved model for the given classifier.
    input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
        shape, batch_size=flags_obj.batch_size)
    classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn)
def test_config_benchmark_file_logger(self):
  logger.config_benchmark_logger("/tmp/abc")
  self.assertIsInstance(logger.get_benchmark_logger(),
                        logger.BenchmarkFileLogger)
def main(_):
  # Data preprocessing
  # The file name of training and test dataset
  train_fname = os.path.join(
      FLAGS.data_dir, FLAGS.dataset + "-" + constants.TRAIN_RATINGS_FILENAME)
  test_fname = os.path.join(
      FLAGS.data_dir, FLAGS.dataset + "-" + constants.TEST_RATINGS_FILENAME)
  neg_fname = os.path.join(
      FLAGS.data_dir, FLAGS.dataset + "-" + constants.TEST_NEG_FILENAME)

  assert os.path.exists(train_fname), (
      "Run data_download.py first to download and extract {} dataset".format(
          FLAGS.dataset))

  tf.logging.info("Data preprocessing...")
  ncf_dataset = dataset.data_preprocessing(
      train_fname, test_fname, neg_fname, FLAGS.num_neg)

  # Create NeuMF model and convert it to Estimator
  tf.logging.info("Creating Estimator from Keras model...")
  layers = [int(layer) for layer in FLAGS.layers]
  mlp_regularization = [float(reg) for reg in FLAGS.mlp_regularization]
  keras_model = neumf_model.NeuMF(
      ncf_dataset.num_users, ncf_dataset.num_items, FLAGS.num_factors,
      layers, FLAGS.batch_size, FLAGS.mf_regularization, mlp_regularization)
  num_gpus = flags_core.get_num_gpus(FLAGS)
  estimator = convert_keras_to_estimator(keras_model, num_gpus, FLAGS.model_dir)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      FLAGS.hooks,
      batch_size=FLAGS.batch_size  # for ExamplesPerSecondHook
  )
  run_params = {
      "batch_size": FLAGS.batch_size,
      "number_factors": FLAGS.num_factors,
      "hr_threshold": FLAGS.hr_threshold,
      "train_epochs": FLAGS.train_epochs,
  }
  benchmark_logger = logger.config_benchmark_logger(FLAGS)
  benchmark_logger.log_run_info(
      model_name="recommendation",
      dataset_name=FLAGS.dataset,
      run_params=run_params)

  # Training and evaluation cycle
  def train_input_fn():
    return dataset.input_fn(
        True, per_device_batch_size(FLAGS.batch_size, num_gpus),
        ncf_dataset, FLAGS.epochs_between_evals)

  total_training_cycle = FLAGS.train_epochs // FLAGS.epochs_between_evals

  for cycle_index in range(total_training_cycle):
    tf.logging.info("Starting a training cycle: {}/{}".format(
        cycle_index + 1, total_training_cycle))

    # Train the model
    estimator.train(input_fn=train_input_fn, hooks=train_hooks)

    # Evaluate the model
    eval_results = evaluate_model(
        estimator, FLAGS.batch_size, num_gpus, ncf_dataset)

    # Benchmark the evaluation results
    benchmark_logger.log_evaluation_result(eval_results)

    # Log the HR and NDCG results.
    hr = eval_results[_HR_KEY]
    ndcg = eval_results[_NDCG_KEY]
    tf.logging.info("Iteration {}: HR = {:.4f}, NDCG = {:.4f}".format(
        cycle_index + 1, hr, ndcg))

    # If some evaluation threshold is met
    if model_helpers.past_stop_threshold(FLAGS.hr_threshold, hr):
      break

  # Clear the session explicitly to avoid session delete error
  tf.keras.backend.clear_session()
def resnet_main(flags, model_function, input_function, num_train_samps,
                num_eval_samps, shape=None):
  """Shared main loop for ResNet Models.

  Args:
    flags: FLAGS object that contains the params for running. See
      ResnetArgParser for created flags.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with
      all the relevant flags for running and passed to estimator.
    num_train_samps: number of samples in the training dataset.
    num_eval_samps: number of samples in the evaluation dataset.
    shape: list of ints representing the shape of the images used for
      training. This is only used if flags.export_dir is passed.
  """
  # Using the Winograd non-fused algorithms provides a small performance boost.
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  if flags.multi_gpu:
    validate_batch_size_for_multi_gpu(flags.batch_size)

    # There are two steps required if using multi-GPU: (1) wrap the model_fn,
    # and (2) wrap the optimizer. The first happens here, and (2) happens
    # in the model_fn itself when the optimizer is defined.
    model_function = tf.contrib.estimator.replicate_model_fn(
        model_function, loss_reduction=tf.losses.Reduction.MEAN)

  # Create session config based on values of inter_op_parallelism_threads and
  # intra_op_parallelism_threads. Note that we default to having
  # allow_soft_placement = True, which is required for multi-GPU and not
  # harmful for other modes.
  myrank = 0
  numworkers = 1
  if flags.enable_ml_comm == 1:
    # Initialize the Cray PE ML Plugin and configure the thread team
    # (correcting the number of epochs for the effective batch size).
    # totsize = sum([reduce(lambda x, y: x*y, v.get_shape().as_list())
    #                for v in tf.trainable_variables()])
    totsize = 25551401  # Specific size for resnet50-v2
    mc.init(2, 1, totsize, "tensorflow")
    myrank = mc.get_rank()
    numworkers = mc.get_nranks()
    if myrank == 0:
      print("ResNet with {:9d} parameters".format(totsize))

    max_steps_train = int(
        math.ceil(flags.train_epochs * (num_train_samps + num_eval_samps) /
                  (mc.get_nranks() * flags.batch_size)))
    # (0, 0, num_steps_before_going_nonblock, max_steps_train,
    #  verbose=1, how_often_to_print=100)
    mc.config_team(0, 0, max_steps_train, max_steps_train, 1, 100)

    flags.model_dir = flags.model_dir if mc.get_rank() == 0 else None
    flags.benchmark_log_dir = (
        flags.benchmark_log_dir if mc.get_rank() == 0 else None)
    flags.export_dir = flags.export_dir if mc.get_rank() == 0 else None
  else:
    rank_id = myrank

  session_config = tf.ConfigProto(
      log_device_placement=False,
      inter_op_parallelism_threads=flags.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
      allow_soft_placement=True)

  # Set up a RunConfig to save checkpoint and set session config.
  run_config = tf.estimator.RunConfig().replace(
      save_checkpoints_steps=500, session_config=session_config)
  classifier = tf.estimator.Estimator(
      model_fn=model_function, model_dir=flags.model_dir, config=run_config,
      params={
          'resnet_size': flags.resnet_size,
          'data_format': flags.data_format,
          'batch_size': flags.batch_size,
          'multi_gpu': flags.multi_gpu,
          'train_epochs': flags.train_epochs,
          'version': flags.version,
          'loss_scale': flags.loss_scale,
          'dtype': flags.dtype,
          'mlcomm': flags.enable_ml_comm,
          'log_freq': flags.global_perf_log_freq,
          'weight_decay': flags.weight_decay,
          'init_lr': flags.init_lr,
          'base_lr': flags.base_lr,
          'warmup_epochs': flags.warmup_epochs,
      })

  benchmark_logger = logger.config_benchmark_logger(flags.benchmark_log_dir)
  benchmark_logger.log_run_info('resnet')

  for _ in range(flags.train_epochs // flags.epochs_between_evals):
    train_hooks = hooks_helper.get_train_hooks(
        flags.hooks,
        batch_size=flags.batch_size,
        benchmark_log_dir=flags.benchmark_log_dir)

    if myrank == 0:
      print('Starting a training cycle.')

    def input_fn_train():
      return input_function(True, flags.data_dir, flags.batch_size,
                            flags.epochs_between_evals,
                            flags.num_parallel_calls, flags.multi_gpu,
                            numworkers, myrank)

    tsteps = math.ceil(
        float(flags.epochs_between_evals * num_train_samps) /
        (numworkers * flags.batch_size))
    classifier.train(input_fn=input_fn_train, steps=tsteps,
                     max_steps=flags.max_train_steps)

    if myrank == 0:
      print('Starting to evaluate.')

    # Evaluate the model and print results
    def input_fn_eval():
      return input_function(False, flags.data_dir, flags.batch_size, 3,
                            flags.num_parallel_calls, flags.multi_gpu,
                            numworkers, myrank)

    # flags.max_train_steps is generally associated with testing and profiling.
    # As a result it is frequently called with synthetic data, which will
    # iterate forever. Passing steps=flags.max_train_steps allows the eval
    # (which is generally unimportant in those circumstances) to terminate.
    # Note that eval will run for max_train_steps each loop, regardless of the
    # global_step count.
    esteps = math.ceil(float(num_eval_samps) / (numworkers * flags.batch_size))
    eval_results = classifier.evaluate(input_fn=input_fn_eval, steps=esteps)
    benchmark_logger.log_evaluation_result(eval_results)

    if model_helpers.past_stop_threshold(flags.stop_threshold,
                                         eval_results['accuracy']):
      break

  if flags.export_dir is not None:
    warn_on_multi_gpu_export(flags.multi_gpu)

    # Exports a saved model for the given classifier.
    input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
        shape, batch_size=flags.batch_size)
    classifier.export_savedmodel(flags.export_dir, input_receiver_fn)

  if flags.enable_ml_comm == 1:
    mc.finalize()
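# Worked example of the per-worker step arithmetic above (pure Python): each
# cycle trains for ceil(epochs_between_evals * num_train_samps /
# (numworkers * batch_size)) steps. The 1,281,167-sample figure below is the
# usual ImageNet train split size, used here only as an illustration.
import math

def steps_per_cycle(epochs_between_evals, num_samples, numworkers, batch_size):
  return int(math.ceil(
      float(epochs_between_evals * num_samples) / (numworkers * batch_size)))

assert steps_per_cycle(1, 1281167, 4, 128) == 2503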
def train_boosted_trees(flags_obj):
  """Train boosted_trees estimator on HIGGS data.

  Args:
    flags_obj: An object containing parsed flag values.
  """
  # Clean up the model directory if present.
  if tf.gfile.Exists(flags_obj.model_dir):
    tf.gfile.DeleteRecursively(flags_obj.model_dir)

  tf.logging.info("## Data loading...")
  train_data, eval_data = read_higgs_data(
      flags_obj.data_dir, flags_obj.train_start, flags_obj.train_count,
      flags_obj.eval_start, flags_obj.eval_count)
  tf.logging.info("## Data loaded; train: {}{}, eval: {}{}".format(
      train_data.dtype, train_data.shape, eval_data.dtype, eval_data.shape))

  # Data consists of one label column followed by 28 feature columns.
  train_input_fn, feature_names, feature_columns = make_inputs_from_np_arrays(
      features_np=train_data[:, 1:], label_np=train_data[:, 0:1])
  eval_input_fn = make_eval_inputs_from_np_arrays(
      features_np=eval_data[:, 1:], label_np=eval_data[:, 0:1])
  tf.logging.info("## Features prepared. Training starts...")

  # Create benchmark logger to log info about the training and metric values
  run_params = {
      "train_start": flags_obj.train_start,
      "train_count": flags_obj.train_count,
      "eval_start": flags_obj.eval_start,
      "eval_count": flags_obj.eval_count,
      "n_trees": flags_obj.n_trees,
      "max_depth": flags_obj.max_depth,
  }
  benchmark_logger = logger.config_benchmark_logger(flags_obj)
  benchmark_logger.log_run_info(
      model_name="boosted_trees",
      dataset_name="higgs",
      run_params=run_params,
      test_id=flags_obj.benchmark_test_id)

  # Though BoostedTreesClassifier is under tf.estimator, faster in-memory
  # training is yet provided as a contrib library.
  classifier = tf.contrib.estimator.boosted_trees_classifier_train_in_memory(
      train_input_fn,
      feature_columns,
      model_dir=flags_obj.model_dir or None,
      n_trees=flags_obj.n_trees,
      max_depth=flags_obj.max_depth,
      learning_rate=flags_obj.learning_rate)

  # Evaluation.
  eval_results = classifier.evaluate(eval_input_fn)

  # Benchmark the evaluation results
  benchmark_logger.log_evaluation_result(eval_results)

  # Exporting the savedmodel with csv parsing.
  if flags_obj.export_dir is not None:
    classifier.export_savedmodel(
        flags_obj.export_dir,
        _make_csv_serving_input_receiver_fn(
            column_names=feature_names,
            # columns are all floats.
            column_defaults=[[0.0]] * len(feature_names)),
        strip_default_attrs=True)