def _GetMemoryOptimizerConfig(self):
  rewrite_options = rewriter_config_pb2.RewriterConfig(
      memory_optimization=rewriter_config_pb2.RewriterConfig.HEURISTICS)
  graph_options = config_pb2.GraphOptions(rewrite_options=rewrite_options)
  return config_pb2.ConfigProto(graph_options=graph_options)
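
# Usage sketch (added for illustration, not from the original snippet): a
# config built like _GetMemoryOptimizerConfig() above is handed to a session
# so the Grappler memory optimizer can rewrite the graph before execution.
# The tiny graph below is a placeholder.
import tensorflow as tf
from tensorflow.core.protobuf import config_pb2
from tensorflow.core.protobuf import rewriter_config_pb2

rewrite_options = rewriter_config_pb2.RewriterConfig(
    memory_optimization=rewriter_config_pb2.RewriterConfig.HEURISTICS)
memopt_config = config_pb2.ConfigProto(
    graph_options=config_pb2.GraphOptions(rewrite_options=rewrite_options))
with tf.Session(config=memopt_config) as sess:
  x = tf.ones([4, 4])
  print(sess.run(tf.reduce_sum(x)))  # Grappler rewrites run before this.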
def __init__(self, iterations, eval_steps):
  tf.logging.info("LowLevelRunner: constructor.")

  self.fake_feature_structure = {}
  self.feature_structure = {}
  self.fake_eval_feature_structure = {}
  self.eval_feature_structure = {}
  self.infeed_queue = []
  self.eval_infeed_queue = []
  self.fake_enqueue_ops = []
  self.enqueue_ops = []
  self.fake_eval_enqueue_ops = []
  self.eval_enqueue_ops = []
  self.fake_dataset_initializer = []
  self.dataset_initializer = []
  self.fake_eval_dataset_initializer = []
  self.eval_dataset_initializer = []
  self.outfeed_tensors = []
  self.outfeed_names = []
  self.dequeue_ops = []
  self.train_compile_op = None
  self.eval_compile_op = None
  self.loss = None
  self.eval_op = None
  self.predictions = {}
  self.iterations = iterations
  self.eval_steps = eval_steps
  self.num_hosts = FLAGS.tpu_num_shards // FLAGS.tpu_num_shards_per_host
  self.scaffold_fn = None
  self.tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.master or FLAGS.cloud_tpu_name)
  # Disable grappler for better performance.
  self.session_config = tf.ConfigProto(
      allow_soft_placement=True,
      graph_options=tf.GraphOptions(
          rewrite_options=rewriter_config_pb2.RewriterConfig(
              disable_meta_optimizer=True)),
      isolate_session_state=True,
      operation_timeout_in_ms=600 * 60 * 1000)  # 10 hours
  cluster_spec = self.tpu_cluster_resolver.cluster_spec()
  if cluster_spec:
    self.session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())

  self.input_graph = tf.Graph()
  self.eval_input_graph = tf.Graph()
  # Train and eval share the same session and graph so that the weights
  # can be shared for in-memory eval.
  self.graph = tf.Graph()
  self.output_graph = tf.Graph()
  with self.graph.as_default():
    if FLAGS.random_seed:
      tf.random.set_random_seed(FLAGS.random_seed)
    self.num_epochs_tensor = tf.placeholder(tf.int32, shape=(), name="epochs")
    self.train_steps_tensor = tf.placeholder(
        tf.int32, shape=(), name="steps_per_train_loop")
    self.eval_steps_tensor = tf.placeholder(
        tf.int32, shape=(), name="steps_per_eval_loop")
    self.tpu_init = [tpu.initialize_system()]
    self.tpu_shutdown = tpu.shutdown_system()
  self.master = self.tpu_cluster_resolver.get_master()
  self.input_sess = tf.Session(
      self.master, graph=self.input_graph, config=self.session_config)
  self.eval_input_sess = tf.Session(
      self.master, graph=self.eval_input_graph, config=self.session_config)
  self.sess = tf.Session(
      self.master, graph=self.graph, config=self.session_config)
  self.output_sess = tf.Session(
      self.master, graph=self.output_graph, config=self.session_config)
  self.sess.run(self.tpu_init)
  self.infeed_thread = None
  self.train_eval_thread = None
def get_tensorrt_rewriter_config(
    cls,
    rewriter_config_template=None,
    max_batch_size=1,
    max_workspace_size_bytes=DEFAULT_TRT_MAX_WORKSPACE_SIZE_BYTES,
    precision_mode=TrtPrecisionMode.FP32,
    minimum_segment_size=3,
    is_dynamic_op=False,
    maximum_cached_engines=1,
    cached_engine_batches=None,
    use_calibration=True,
    use_function_backup=True):
  """Returns a RewriterConfig proto for TRT transformation.

  Args:
    rewriter_config_template: a template RewriterConfig proto used to create a
      TRT-enabled RewriterConfig. If None, it will use a default one.
    max_batch_size: max size for the input batch.
    max_workspace_size_bytes: the maximum GPU temporary memory which the TRT
      engine can use at execution time. This corresponds to the
      'workspaceSize' parameter of nvinfer1::IBuilder::setMaxWorkspaceSize().
    precision_mode: one of TrtPrecisionMode.supported_precision_modes().
    minimum_segment_size: the minimum number of nodes required for a subgraph
      to be replaced by TRTEngineOp.
    is_dynamic_op: whether to generate dynamic TRT ops which will build the
      TRT network and engine at run time.
    maximum_cached_engines: max number of cached TRT engines in dynamic TRT
      ops. If the number of cached engines is already at max but none of them
      can serve the input, the TRTEngineOp will fall back to run the TF
      function based on which the TRTEngineOp is created.
    cached_engine_batches: a list of batch sizes used to create cached
      engines, only used when is_dynamic_op is True. The length of the list
      should be <= maximum_cached_engines, and the dynamic TRT op will use
      this list to determine the batch sizes of the cached engines, instead
      of making the decision on the fly. This is useful when we know the most
      common batch size(s) the application is going to generate.
    use_calibration: this argument is ignored if precision_mode is not INT8.
      If set to True, a calibration graph will be created to calibrate the
      missing ranges. The calibration graph must be converted to an inference
      graph by running calibration with calibrate(). If set to False,
      quantization nodes will be expected for every tensor in the graph
      (excluding those which will be fused). If a range is missing, an error
      will occur. Please note that accuracy may be negatively affected if
      there is a mismatch between which tensors TRT quantizes and which
      tensors were trained with fake quantization.
    use_function_backup: if set to True, it will create a FunctionDef for each
      subgraph that is converted to TRT op, and if TRT ops fail to execute at
      runtime, it'll invoke that function as a fallback.

  Returns:
    A RewriterConfig proto which sets a TensorRTOptimizer to run Grappler.

  Raises:
    TypeError: if any of the parameters are of unexpected type.
    ValueError: if any of the parameters are of unexpected value.
  """
  # Lazily load the TF-TRT C bindings, so `import tensorflow` doesn't complain
  # even if it cannot find TensorRT library.
  trt_ops.load_trt_ops()
  # pylint: disable=g-import-not-at-top,unused-import,line-too-long,unused-variable
  # Import a random symbol to trigger loading of TRT library.
  from tensorflow.python.compiler.tensorrt.wrap_conversion import get_linked_tensorrt_version
  # pylint: enable=g-import-not-at-top,unused-import,line-too-long,unused-variable

  if rewriter_config_template is not None and not isinstance(
      rewriter_config_template, rewriter_config_pb2.RewriterConfig):
    raise TypeError(
        "rewriter_config_template should be a RewriterConfig proto.")

  rewriter_config_with_trt = rewriter_config_pb2.RewriterConfig()
  if rewriter_config_template is None:
    # Layout optimizer may add Const nodes followed by Reshape nodes, thus we
    # need to run constant folding again.
    rewriter_config_with_trt.optimizers.extend(
        ["constfold", "layout", "constfold"])
    rewriter_config_with_trt.meta_optimizer_iterations = (
        rewriter_config_pb2.RewriterConfig.ONE)
  else:
    rewriter_config_with_trt.CopyFrom(rewriter_config_template)

  optimizer = rewriter_config_with_trt.custom_optimizers.add()
  optimizer.name = "TensorRTOptimizer"
  optimizer.parameter_map["minimum_segment_size"].i = minimum_segment_size
  optimizer.parameter_map["max_batch_size"].i = max_batch_size
  optimizer.parameter_map["is_dynamic_op"].b = is_dynamic_op
  optimizer.parameter_map[
      "max_workspace_size_bytes"].i = max_workspace_size_bytes
  optimizer.parameter_map["precision_mode"].s = _to_bytes(precision_mode)
  optimizer.parameter_map["maximum_cached_engines"].i = maximum_cached_engines
  if cached_engine_batches:
    optimizer.parameter_map["cached_engine_batches"].list.i.extend(
        cached_engine_batches)
  optimizer.parameter_map["use_calibration"].b = use_calibration
  optimizer.parameter_map["use_function_backup"].b = use_function_backup
  return rewriter_config_with_trt
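
# Hedged usage sketch (illustrative; it assumes the classmethod above lives on
# the TF-TRT converter class, here called TrtGraphConverter — that name is an
# assumption): the returned RewriterConfig only takes effect once embedded in
# a ConfigProto's graph_options, at which point Grappler runs the registered
# TensorRTOptimizer.
from tensorflow.core.protobuf import config_pb2

trt_rewriter_config = TrtGraphConverter.get_tensorrt_rewriter_config(
    precision_mode=TrtPrecisionMode.FP16,
    is_dynamic_op=True,
    maximum_cached_engines=4)
trt_session_config = config_pb2.ConfigProto(
    graph_options=config_pb2.GraphOptions(
        rewrite_options=trt_rewriter_config))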
def freeze_graph_with_def_protos(input_graph_def,
                                 input_saver_def,
                                 input_checkpoint,
                                 output_node_names,
                                 restore_op_name,
                                 filename_tensor_name,
                                 clear_devices,
                                 initializer_nodes,
                                 optimize_graph=True,
                                 variable_names_blacklist=''):
  """Converts all variables in a graph and checkpoint into constants."""
  del restore_op_name, filename_tensor_name  # Unused by updated loading code.

  # 'input_checkpoint' may be a prefix if we're using Saver V2 format.
  if not saver_lib.checkpoint_exists(input_checkpoint):
    raise ValueError(
        'Input checkpoint "' + input_checkpoint + '" does not exist!')

  if not output_node_names:
    raise ValueError(
        'You must supply the name of a node to --output_node_names.')

  # Remove all the explicit device specifications for this node. This helps to
  # make the graph more portable.
  if clear_devices:
    for node in input_graph_def.node:
      node.device = ''

  with tf.Graph().as_default():
    tf.import_graph_def(input_graph_def, name='')

    if optimize_graph:
      logging.info('Graph Rewriter optimizations enabled')
      rewrite_options = rewriter_config_pb2.RewriterConfig()
      rewrite_options.optimizers.append('pruning')
      rewrite_options.optimizers.append('constfold')
      rewrite_options.optimizers.append('layout')
      graph_options = tf.GraphOptions(
          rewrite_options=rewrite_options, infer_shapes=True)
    else:
      logging.info('Graph Rewriter optimizations disabled')
      graph_options = tf.GraphOptions()

    config = tf.ConfigProto(graph_options=graph_options)
    with session.Session(config=config) as sess:
      if input_saver_def:
        saver = saver_lib.Saver(saver_def=input_saver_def)
        saver.restore(sess, input_checkpoint)
      else:
        var_list = {}
        reader = pywrap_tensorflow.NewCheckpointReader(input_checkpoint)
        var_to_shape_map = reader.get_variable_to_shape_map()
        for key in var_to_shape_map:
          try:
            tensor = sess.graph.get_tensor_by_name(key + ':0')
          except KeyError:
            # This tensor doesn't exist in the graph (for example it's
            # 'global_step' or a similar housekeeping element) so skip it.
            continue
          var_list[key] = tensor
        saver = saver_lib.Saver(var_list=var_list)
        saver.restore(sess, input_checkpoint)

      if initializer_nodes:
        sess.run(initializer_nodes)

      variable_names_blacklist = (variable_names_blacklist.split(',')
                                  if variable_names_blacklist else None)
      output_graph_def = graph_util.convert_variables_to_constants(
          sess,
          input_graph_def,
          output_node_names.split(','),
          variable_names_blacklist=variable_names_blacklist)

  return output_graph_def
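
# Illustrative call sketch (the checkpoint path and node names below are
# placeholders, not from the original source): freezing a graph with the
# Grappler pruning/constfold/layout pass enabled.
frozen_graph_def = freeze_graph_with_def_protos(
    input_graph_def=graph_def,                # a previously loaded tf.GraphDef
    input_saver_def=None,
    input_checkpoint='/tmp/model.ckpt-1234',  # assumed checkpoint prefix
    output_node_names='detection_boxes,detection_scores',  # assumed names
    restore_op_name='',       # unused by the updated loading code
    filename_tensor_name='',  # unused by the updated loading code
    clear_devices=True,
    initializer_nodes='',
    optimize_graph=True)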
def get_tensorrt_rewriter_config(rewriter_config=None,
                                 max_batch_size=1,
                                 max_workspace_size_bytes=2 << 20,
                                 precision_mode=TrtPrecisionMode.FP32,
                                 minimum_segment_size=3,
                                 is_dynamic_op=False,
                                 maximum_cached_engines=1,
                                 cached_engine_batches=None,
                                 use_calibration=True):
  """Returns a RewriterConfig proto for TRT transformation.

  Args:
    rewriter_config: a template RewriterConfig proto used to create a
      TRT-enabled RewriterConfig. If None, it will use a default one.
    max_batch_size: max size for the input batch.
    max_workspace_size_bytes: the maximum GPU temporary memory which the TRT
      engine can use at execution time. This corresponds to the
      'workspaceSize' parameter of nvinfer1::IBuilder::setMaxWorkspaceSize().
    precision_mode: one of TrtPrecisionMode.supported_precision_modes().
    minimum_segment_size: the minimum number of nodes required for a subgraph
      to be replaced by TRTEngineOp.
    is_dynamic_op: whether to generate dynamic TRT ops which will build the
      TRT network and engine at run time.
    maximum_cached_engines: max number of cached TRT engines in dynamic TRT
      ops. If the number of cached engines is already at max but none of them
      can serve the input, the TRTEngineOp will fall back to run the TF
      function based on which the TRTEngineOp is created.
    cached_engine_batches: a list of batch sizes used to create cached
      engines, only used when is_dynamic_op is True. The length of the list
      should be <= maximum_cached_engines, and the dynamic TRT op will use
      this list to determine the batch sizes of the cached engines, instead
      of making the decision on the fly. This is useful when we know the most
      common batch size(s) the application is going to generate.
    use_calibration: this argument is ignored if precision_mode is not INT8.
      If set to True, a calibration graph will be created to calibrate the
      missing ranges. The calibration graph must be converted to an inference
      graph using calib_graph_to_infer_graph() after running calibration. If
      set to False, quantization nodes will be expected for every tensor in
      the graph (excluding those which will be fused). If a range is missing,
      an error will occur. Please note that accuracy may be negatively
      affected if there is a mismatch between which tensors TRT quantizes and
      which tensors were trained with fake quantization.

  Returns:
    A RewriterConfig proto which sets a TensorRTOptimizer to run Grappler.

  Raises:
    TypeError: if any of the parameters are of unexpected type.
    ValueError: if any of the parameters are of unexpected value.
  """
  if rewriter_config is not None and not isinstance(
      rewriter_config, rewriter_config_pb2.RewriterConfig):
    raise TypeError("rewriter_config should be a RewriterConfig proto.")

  rewriter_config_with_trt = rewriter_config_pb2.RewriterConfig()
  if rewriter_config is None:
    # Layout optimizer may add Const nodes followed by Reshape nodes, thus we
    # need to run constant folding again.
    rewriter_config_with_trt.optimizers.extend(
        ["constfold", "layout", "constfold"])
    rewriter_config_with_trt.meta_optimizer_iterations = (
        rewriter_config_pb2.RewriterConfig.ONE)
  else:
    rewriter_config_with_trt.CopyFrom(rewriter_config)

  if precision_mode.upper() not in TrtPrecisionMode.supported_precision_modes():
    raise ValueError(("precision mode '{}' is not supported. "
"It should be one of {}").format( precision_mode, TrtPrecisionMode.supported_precision_modes)) optimizer = rewriter_config_with_trt.custom_optimizers.add() optimizer.name = "TensorRTOptimizer" optimizer.parameter_map["minimum_segment_size"].i = minimum_segment_size optimizer.parameter_map["max_batch_size"].i = max_batch_size optimizer.parameter_map["is_dynamic_op"].b = is_dynamic_op optimizer.parameter_map[ "max_workspace_size_bytes"].i = max_workspace_size_bytes optimizer.parameter_map["precision_mode"].s = _to_bytes(precision_mode) optimizer.parameter_map[ "maximum_cached_engines"].i = maximum_cached_engines if cached_engine_batches: if not isinstance(cached_engine_batches, list): raise TypeError("cached_engine_batches should be a list.") if len(cached_engine_batches) > maximum_cached_engines: raise ValueError( "cached_engine_batches should not contain more than " "maximum_cached_engines items.") optimizer.parameter_map["cached_engine_batches"].list.i.extend( cached_engine_batches) optimizer.parameter_map["use_calibration"].b = use_calibration return rewriter_config_with_trt
def main(unused_argv):
  steps_per_epoch = FLAGS.num_train_images // FLAGS.train_batch_size
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu if (FLAGS.tpu or FLAGS.use_tpu) else '',
      zone=FLAGS.tpu_zone,
      project=FLAGS.gcp_project)

  if FLAGS.use_train_runner:
    trunner = train_runner.TrainRunner(
        iterations=FLAGS.iterations_per_loop, train_steps=FLAGS.train_steps)

  if FLAGS.mode != 'eval':
    mlperf_log.resnet_print(key=mlperf_log.RUN_START)

  if FLAGS.use_async_checkpointing or FLAGS.mode == 'in_memory_eval':
    save_checkpoints_steps = None
  else:
    save_checkpoints_steps = max(100, FLAGS.iterations_per_loop)
  mlperf_log.resnet_print(
      key=mlperf_log.INPUT_BATCH_SIZE, value=FLAGS.train_batch_size)
  mlperf_log.resnet_print(key=mlperf_log.RUN_SET_RANDOM_SEED, value='none')

  if not FLAGS.use_train_runner:
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
        save_summary_steps=0,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            # num_shards=FLAGS.num_cores,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
            .PER_HOST_V2))  # pylint: disable=line-too-long

    resnet_classifier = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=resnet_model_fn,
        config=config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        export_to_tpu=False)

  assert FLAGS.precision == 'bfloat16' or FLAGS.precision == 'float32', (
      'Invalid value for --precision flag; must be bfloat16 or float32.')
  tf.logging.info('Precision: %s', FLAGS.precision)
  use_bfloat16 = FLAGS.precision == 'bfloat16'

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  if FLAGS.bigtable_instance:
    tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table)
    select_train, select_eval = _select_tables_from_flags()
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetBigtableInput(
            is_training=is_training,
            use_bfloat16=use_bfloat16,
            transpose_input=FLAGS.transpose_input,
            selection=selection)
        for (is_training, selection) in [(True, select_train),
                                         (False, select_eval)]
    ]
  else:
    if FLAGS.data_dir == FAKE_DATA_DIR:
      tf.logging.info('Using fake dataset.')
    else:
      tf.logging.info('Using dataset: %s', FLAGS.data_dir)
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            transpose_input=FLAGS.transpose_input,
            cache=FLAGS.use_cache and is_training,
            num_parallel_calls=FLAGS.num_parallel_calls,
            num_cores=FLAGS.num_prefetch_threads,
            prefetch_depth_auto_tune=FLAGS.prefetch_depth_auto_tune,
            use_bfloat16=use_bfloat16) for is_training in [True, False]
    ]

  if FLAGS.use_train_runner and FLAGS.mode == 'train':
    params = {'batch_size': FLAGS.train_batch_size}
    trunner.initialize(imagenet_train.input_fn, resnet_model_fn, params)

  mlperf_log.resnet_print(
      key=mlperf_log.PREPROC_NUM_TRAIN_EXAMPLES, value=FLAGS.num_train_images)
  mlperf_log.resnet_print(
      key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES, value=FLAGS.num_eval_images)

  steps_per_epoch = FLAGS.num_train_images // FLAGS.train_batch_size
  eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size

  if FLAGS.mode == 'eval':
    params = {'batch_size': FLAGS.eval_batch_size}
    if FLAGS.use_eval_runner:
      erunner = eval_runner.EvalRunner(
          input_fn=imagenet_eval.input_fn,
          model_fn=resnet_model_fn,
          params=params,
          num_steps=eval_steps)
    success = False

    # Run evaluation when there's a new checkpoint.
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      current_step = int(os.path.basename(ckpt).split('-')[1])
      try:
        start_timestamp = time.time()  # This time will include compilation time
        mlperf_log.resnet_print(key=mlperf_log.EVAL_START)
        if FLAGS.use_eval_runner:
          eval_results = erunner.eval(
              num_steps=eval_steps, checkpoint_path=ckpt)
        else:
          eval_results = resnet_classifier.evaluate(
              input_fn=imagenet_eval.input_fn,
              steps=eval_steps,
              checkpoint_path=ckpt)
        mlperf_log.resnet_print(
            key=mlperf_log.EVAL_SIZE, value=FLAGS.num_eval_images)
        mlperf_log.resnet_print(key=mlperf_log.EVAL_STOP)
        mlperf_log.resnet_print(
            key=mlperf_log.EVAL_ACCURACY,
            value={
                'epoch': max(current_step // steps_per_epoch - 1, 0),
                'value': float(eval_results['top_1_accuracy'])
            })
        mlperf_log.resnet_print(
            key=mlperf_log.EVAL_TARGET, value=FLAGS.stop_threshold)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results,
                        elapsed_time)
        if eval_results['top_1_accuracy'] >= FLAGS.stop_threshold:
          success = True
          mlperf_log.resnet_print(
              key=mlperf_log.RUN_STOP, value={'success': 'true'})
          break

        # Terminate eval job when final checkpoint is reached.
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info(
              'Evaluation finished after training step %d', current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint', ckpt)

    if not success:
      mlperf_log.resnet_print(
          key=mlperf_log.RUN_STOP, value={'success': 'false'})
  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    mlperf_log.resnet_print(key=mlperf_log.TRAIN_LOOP)
    mlperf_log.resnet_print(key=mlperf_log.TRAIN_EPOCH, value=0)
    if FLAGS.mode == 'train':
      if FLAGS.use_train_runner:
        trunner.train()
      else:
        hooks = []
        if FLAGS.use_async_checkpointing:
          hooks.append(
              async_checkpoint.AsyncCheckpointSaverHook(
                  checkpoint_dir=FLAGS.model_dir,
                  save_steps=max(100, FLAGS.iterations_per_loop)))
        resnet_classifier.train(
            input_fn=imagenet_train.input_fn,
            max_steps=FLAGS.train_steps,
            hooks=hooks)
    elif FLAGS.mode == 'in_memory_eval':
      steps_per_epoch = FLAGS.num_train_images // FLAGS.train_batch_size
      hooks = []
      mlperf_log.resnet_print(
          key=mlperf_log.EVAL_TARGET, value=FLAGS.stop_threshold)
      hooks.append(
          in_memory_eval.TPUInMemoryEvalHook(
              resnet_classifier,
              imagenet_eval.input_fn,
              steps_per_epoch,
              stop_threshold=FLAGS.stop_threshold,
              steps=FLAGS.num_eval_images // FLAGS.eval_batch_size,
              every_n_iter=steps_per_epoch * 4))
      if FLAGS.use_async_checkpointing:
        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(100, steps_per_epoch * 4)))
      resnet_classifier.train(
          input_fn=imagenet_train.input_fn,
          max_steps=FLAGS.train_steps,
          hooks=hooks)
    else:
      current_step = estimator._load_global_step_from_checkpoint_dir(
          FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
      steps_per_epoch = FLAGS.num_train_images // FLAGS.train_batch_size
      tf.logging.info(
          'Training for %d steps (%.2f epochs in total). Current'
          ' step %d.', FLAGS.train_steps,
          FLAGS.train_steps / steps_per_epoch, current_step)

      start_timestamp = time.time()  # This time will include compilation time

      assert FLAGS.mode == 'train_and_eval'
      success = False
      while current_step < FLAGS.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              FLAGS.train_steps)
        mlperf_log.resnet_print(
            key=mlperf_log.TRAIN_EPOCH, value=current_step // steps_per_epoch)
        resnet_classifier.train(
            input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint
        tf.logging.info(
            'Finished training up to step %d. Elapsed seconds %d.',
            next_checkpoint, int(time.time() - start_timestamp))

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be excluded modulo the batch size. As long as the batch size is
        # consistent, the evaluated images are also consistent.
        tf.logging.info('Starting to evaluate.')
        mlperf_log.resnet_print(key=mlperf_log.EVAL_START)
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=FLAGS.num_eval_images // FLAGS.eval_batch_size)
        mlperf_log.resnet_print(
            key=mlperf_log.EVAL_SIZE, value=FLAGS.num_eval_images)
        mlperf_log.resnet_print(key=mlperf_log.EVAL_STOP)
        mlperf_log.resnet_print(
            key=mlperf_log.EVAL_ACCURACY,
            value={
                'epoch': max(0, current_step // steps_per_epoch - 1),
                'value': float(eval_results['top_1_accuracy'])
            })
        mlperf_log.resnet_print(
            key=mlperf_log.EVAL_TARGET, value=FLAGS.stop_threshold)
        tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                        eval_results)
        if eval_results['top_1_accuracy'] >= FLAGS.stop_threshold:
          success = True
          mlperf_log.resnet_print(
              key=mlperf_log.RUN_STOP, value={'success': 'true'})
          break

      elapsed_time = int(time.time() - start_timestamp)
      if not success:
        mlperf_log.resnet_print(
            key=mlperf_log.RUN_STOP, value={'success': 'false'})
      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                      FLAGS.train_steps, elapsed_time)

    if FLAGS.export_dir is not None:
      # The guide to serve an exported TensorFlow model is at:
      # https://www.tensorflow.org/serving/serving_basic
      tf.logging.info('Starting to export model.')
      resnet_classifier.export_savedmodel(
          export_dir_base=FLAGS.export_dir,
          serving_input_receiver_fn=imagenet_input.image_serving_input_fn)

  if FLAGS.use_train_runner and FLAGS.mode == 'train':
    trunner.shutdown()

  if FLAGS.mode != 'train':
    mlperf_log.resnet_print(key=mlperf_log.RUN_FINAL)
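
# Hedged invocation sketch: the flag names come from the FLAGS references in
# main() above, but the script name and values are assumptions.
#
#   python resnet_main.py \
#       --mode=train_and_eval \
#       --model_dir=gs://my-bucket/resnet \
#       --train_batch_size=1024 --eval_batch_size=1024 \
#       --train_steps=112590 --steps_per_eval=1251 \
#       --use_tpu=True --precision=bfloat16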
def main(unused_argv):
  input_image_size = FLAGS.input_image_size
  if not input_image_size:
    if FLAGS.model_name.startswith('efficientnet-edgetpu'):
      _, _, input_image_size, _ = (
          efficientnet_edgetpu_builder.efficientnet_edgetpu_params(
              FLAGS.model_name))
    elif FLAGS.model_name.startswith('efficientnet'):
      _, _, input_image_size, _ = efficientnet_builder.efficientnet_params(
          FLAGS.model_name)
    else:
      raise ValueError('input_image_size must be set except for EfficientNet')

  # For the ImageNet dataset, include the background label if the number of
  # output classes is 1001.
  include_background_label = (FLAGS.num_label_classes == 1001)

  if FLAGS.tpu or FLAGS.use_tpu:
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
  else:
    tpu_cluster_resolver = None

  if FLAGS.use_async_checkpointing:
    save_checkpoints_steps = None
  else:
    save_checkpoints_steps = max(100, FLAGS.iterations_per_loop)
  config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=save_checkpoints_steps,
      log_step_count_steps=FLAGS.log_step_count_steps,
      session_config=tf.ConfigProto(
          graph_options=tf.GraphOptions(
              rewrite_options=rewriter_config_pb2.RewriterConfig(
                  disable_meta_optimizer=True))),
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
          .PER_HOST_V2))  # pylint: disable=line-too-long

  # Initializes model parameters.
  params = dict(
      steps_per_epoch=FLAGS.num_train_images / FLAGS.train_batch_size,
      use_bfloat16=FLAGS.use_bfloat16)
  est = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      export_to_tpu=FLAGS.export_to_tpu,
      params=params)

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  def build_imagenet_input(is_training):
    """Generate ImageNetInput for training and eval."""
    if FLAGS.bigtable_instance:
      tf.logging.info('Using Bigtable dataset, table %s',
                      FLAGS.bigtable_table)
      select_train, select_eval = _select_tables_from_flags()
      return imagenet_input.ImageNetBigtableInput(
          is_training=is_training,
          use_bfloat16=FLAGS.use_bfloat16,
          transpose_input=FLAGS.transpose_input,
          selection=select_train if is_training else select_eval,
          include_background_label=include_background_label,
          autoaugment_name=FLAGS.autoaugment_name)
    else:
      if FLAGS.data_dir == FAKE_DATA_DIR:
        tf.logging.info('Using fake dataset.')
      else:
        tf.logging.info('Using dataset: %s', FLAGS.data_dir)
      return imagenet_input.ImageNetInput(
          is_training=is_training,
          data_dir=FLAGS.data_dir,
          transpose_input=FLAGS.transpose_input,
          cache=FLAGS.use_cache and is_training,
          image_size=input_image_size,
          num_parallel_calls=FLAGS.num_parallel_calls,
          use_bfloat16=FLAGS.use_bfloat16,
          include_background_label=include_background_label,
          autoaugment_name=FLAGS.autoaugment_name)

  imagenet_train = build_imagenet_input(is_training=True)
  imagenet_eval = build_imagenet_input(is_training=False)

  if FLAGS.mode == 'eval':
    eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
    # Run evaluation when there's a new checkpoint.
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = est.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results,
                        elapsed_time)
        utils.archive_ckpt(eval_results, eval_results['top_1_accuracy'], ckpt)

        # Terminate eval job when final checkpoint is reached.
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info(
              'Evaluation finished after training step %d', current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint', ckpt)

    if FLAGS.export_dir:
      export(est, FLAGS.export_dir, input_image_size)
  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator._load_global_step_from_checkpoint_dir(
        FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long

    tf.logging.info(
        'Training for %d steps (%.2f epochs in total). Current'
        ' step %d.', FLAGS.train_steps,
        FLAGS.train_steps / params['steps_per_epoch'], current_step)

    start_timestamp = time.time()  # This time will include compilation time

    if FLAGS.mode == 'train':
      hooks = []
      if FLAGS.use_async_checkpointing:
        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(100, FLAGS.iterations_per_loop)))
      est.train(
          input_fn=imagenet_train.input_fn,
          max_steps=FLAGS.train_steps,
          hooks=hooks)
    else:
      assert FLAGS.mode == 'train_and_eval'
      while current_step < FLAGS.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              FLAGS.train_steps)
        est.train(input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint
        tf.logging.info(
            'Finished training up to step %d. Elapsed seconds %d.',
            next_checkpoint, int(time.time() - start_timestamp))

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be excluded modulo the batch size. As long as the batch size is
        # consistent, the evaluated images are also consistent.
        tf.logging.info('Starting to evaluate.')
        eval_results = est.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=FLAGS.num_eval_images // FLAGS.eval_batch_size)
        tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                        eval_results)
        ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
        utils.archive_ckpt(eval_results, eval_results['top_1_accuracy'], ckpt)

      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                      FLAGS.train_steps, elapsed_time)

    if FLAGS.export_dir:
      export(est, FLAGS.export_dir, input_image_size)
def main(unused_argv):
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu if (FLAGS.tpu or FLAGS.use_tpu) else '',
      zone=FLAGS.tpu_zone,
      project=FLAGS.gcp_project)

  if FLAGS.use_async_checkpointing:
    save_checkpoints_steps = None
  else:
    save_checkpoints_steps = max(100, FLAGS.iterations_per_loop)
  config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=save_checkpoints_steps,
      log_step_count_steps=FLAGS.log_step_count_steps,
      session_config=tf.ConfigProto(
          graph_options=tf.GraphOptions(
              rewrite_options=rewriter_config_pb2.RewriterConfig(
                  disable_meta_optimizer=True))),
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_cores,
          per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
          .PER_HOST_V2))  # pylint: disable=line-too-long

  resnet_classifier = tf.contrib.tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=resnet_model_fn,
      config=config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      # predict_batch_size=PREDICT_BATCH_SIZE,
      export_to_tpu=FLAGS.export_to_tpu)

  assert FLAGS.precision == 'bfloat16' or FLAGS.precision == 'float32', (
      'Invalid value for --precision flag; must be bfloat16 or float32.')
  tf.logging.info('Precision: %s', FLAGS.precision)
  use_bfloat16 = FLAGS.precision == 'bfloat16'

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  if FLAGS.bigtable_instance:
    tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table)
    select_train, select_eval = _select_tables_from_flags()
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetBigtableInput(
            is_training=is_training,
            use_bfloat16=use_bfloat16,
            transpose_input=FLAGS.transpose_input,
            selection=selection)
        for (is_training, selection) in [(True, select_train),
                                         (False, select_eval)]
    ]
  else:
    if FLAGS.data_dir == FAKE_DATA_DIR:
      tf.logging.info('Using fake dataset.')
    else:
      tf.logging.info('Using dataset: %s', FLAGS.data_dir)
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            prices_dir=FLAGS.prices_dir,
            predict_dir=FLAGS.predict_dir,
            transpose_input=FLAGS.transpose_input,
            cache=FLAGS.use_cache and is_training,
            price_count=PRICE_COUNT,
            num_parallel_calls=FLAGS.num_parallel_calls,
            use_bfloat16=use_bfloat16) for is_training in [True, False]
    ]

  steps_per_epoch = FLAGS.num_train_images // FLAGS.train_batch_size
  eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size

  if FLAGS.mode == 'eval':
    # Run evaluation when there's a new checkpoint.
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results,
                        elapsed_time)

        # Terminate eval job when final checkpoint is reached.
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info(
              'Evaluation finished after training step %d', current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating.
        # In this case, the checkpoint file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint', ckpt)
  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator._load_global_step_from_checkpoint_dir(
        FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
    tf.logging.info('model_dir=%s, steps=%d', FLAGS.model_dir, current_step)
    steps_per_epoch = FLAGS.num_train_images // FLAGS.train_batch_size
    tf.logging.info(
        'Training for %d steps (%.2f epochs in total). Current'
        ' step %d.', FLAGS.train_steps,
        FLAGS.train_steps / steps_per_epoch, current_step)
    start_timestamp = time.time()  # This time will include compilation time

    if FLAGS.mode == 'train':
      hooks = []
      if FLAGS.use_async_checkpointing:
        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(100, FLAGS.iterations_per_loop)))
      if FLAGS.profile_every_n_steps > 0:
        hooks.append(
            tpu_profiler_hook.TPUProfilerHook(
                save_steps=FLAGS.profile_every_n_steps,
                output_dir=FLAGS.model_dir,
                tpu=FLAGS.tpu))
      resnet_classifier.train(
          input_fn=imagenet_train.input_fn,
          max_steps=FLAGS.train_steps,
          hooks=hooks)
    elif FLAGS.mode == 'train_and_eval':
      while current_step < FLAGS.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              FLAGS.train_steps)
        resnet_classifier.train(
            input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint
        tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                        next_checkpoint, int(time.time() - start_timestamp))

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be excluded modulo the batch size. As long as the batch size is
        # consistent, the evaluated images are also consistent.
        tf.logging.info('Starting to evaluate.')
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=FLAGS.num_eval_images // FLAGS.eval_batch_size)
        tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                        eval_results)

      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                      FLAGS.train_steps, elapsed_time)
    else:  # FLAGS.mode == 'predict'
      price_file_pattern = os.path.join(FLAGS.prices_dir, 'price-*.csv')
      while True:
        time.sleep(1)
        price_files = glob.glob(price_file_pattern)
        if not price_files:
          continue
        tf.logging.info('Starting to predict.')
        for price_file_item in price_files:
          with open(price_file_item, "r") as fcsv:
            csvreader = csv.reader(fcsv, delimiter=",")
            price_batch_size = len(list(csvreader))
            # price_batch_size = PREDICT_BATCH_SIZE
          if price_batch_size == 0:
            continue
          # predictions = next(resnet_classifier.predict(
          #     input_fn=lambda params: imagenet_eval.predict_input_fn(
          #         params, price_batch_size)), None)
          predictions = resnet_classifier.predict(
              input_fn=lambda params: imagenet_eval.predict_input_fn(
                  params, price_batch_size,
                  os.path.basename(price_file_item)))
          tf.logging.info("predictions2 = %s", predictions)

          # Output predictions to predict-0001.csv. BorisTown
          predict_filename_part = os.path.join(FLAGS.predict_dir,
                                               'part-0001.part')
          predict_filename_csv = os.path.join(FLAGS.predict_dir,
                                              'predict-0001.csv')
          if len(price_files) > 1:
            dirname = re.findall(r"price-(.+?)\.csv", price_file_item)[0]
            dirpath = os.path.join(FLAGS.predict_dir, dirname)
            if not os.path.exists(dirpath):
              os.makedirs(dirpath)
            predict_filename_part = os.path.join(dirpath, 'part-0001.part')
            predict_filename_csv = os.path.join(dirpath, 'predict-0001.csv')
          predict_file = open(predict_filename_part, "w")
          predict_file.truncate()
          predict_line = ''
          # outarray = np.zeros([price_batch_size, MAX_CASE * LABEL_COUNT])
          outarray = np.zeros([price_batch_size, LABEL_COUNT])
          for case_index, pred_item in enumerate(predictions):
            for batch_index, pred_operation in enumerate(
                pred_item['probabilities']):
              for label_index in range(LABEL_COUNT):
                outarray[batch_index][case_index * LABEL_COUNT +
                                      label_index] = (
                                          pred_operation[label_index])
          for pred_row in outarray:
            predict_line = ''
            for pred_col in pred_row:
              if predict_line != '':
                predict_line += ','
              predict_line += str(pred_col)
            predict_file.write(predict_line + '\n')
            tf.logging.info('%s', predict_line)
          predict_file.close()
          os.rename(predict_filename_part, predict_filename_csv)
          if predict_line != '':
            tf.logging.info('Removing ' + price_file_item)
            price_file_new = price_file_item.replace("price-", "backup-")
            os.rename(price_file_item, price_file_new)

  if FLAGS.export_dir is not None and FLAGS.mode != 'predict':
    # The guide to serve an exported TensorFlow model is at:
    # https://www.tensorflow.org/serving/serving_basic
    tf.logging.info('Starting to export model.')
    resnet_classifier.export_saved_model(
        export_dir_base=FLAGS.export_dir,
        serving_input_receiver_fn=imagenet_input.image_serving_input_fn)
def initialize_session(self):
  """Initializes a tf Session."""
  if ENABLE_TF_OPTIMIZATIONS:
    self.sess = tf.Session()
  else:
    rewriter_config = rewriter_config_pb2.RewriterConfig(
        disable_model_pruning=True,
        constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
        arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
        remapping=rewriter_config_pb2.RewriterConfig.OFF,
        shape_optimization=rewriter_config_pb2.RewriterConfig.OFF,
        dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF,
        function_optimization=rewriter_config_pb2.RewriterConfig.OFF,
        layout_optimizer=rewriter_config_pb2.RewriterConfig.OFF,
        loop_optimization=rewriter_config_pb2.RewriterConfig.OFF,
        memory_optimization=rewriter_config_pb2.RewriterConfig.NO_MEM_OPT)
    graph_options = tf.GraphOptions(rewrite_options=rewriter_config)
    session_config = tf.ConfigProto(graph_options=graph_options)
    self.sess = tf.Session(config=session_config)

  # Restore or initialize the variables.
  self.sess.run(tf.global_variables_initializer())
  self.sess.run(tf.local_variables_initializer())
  if self.learner_config.checkpoint_for_eval:
    # Requested a specific checkpoint.
    self.saver.restore(self.sess, self.learner_config.checkpoint_for_eval)
    tf.logging.info(
        'Restored checkpoint: %s' % self.learner_config.checkpoint_for_eval)
  else:
    # Continue from the latest checkpoint if one exists.
    # This handles fault-tolerance.
    latest_checkpoint = None
    if self.checkpoint_dir is not None:
      latest_checkpoint = tf.train.latest_checkpoint(self.checkpoint_dir)
    if latest_checkpoint:
      self.saver.restore(self.sess, latest_checkpoint)
      tf.logging.info('Restored checkpoint: %s' % latest_checkpoint)
    else:
      tf.logging.info('No previous checkpoint.')
      self.sess.run(tf.global_variables_initializer())
      self.sess.run(tf.local_variables_initializer())

  # For episodic models, potentially use pretrained weights at the start of
  # training. If this happens it will overwrite the embedding weights, but
  # taking care to not restore the Adam parameters.
  if self.learner_config.pretrained_checkpoint and not self.sess.run(
      tf.train.get_global_step()):
    self.saver.restore(self.sess, self.learner_config.pretrained_checkpoint)
    tf.logging.info(
        'Restored checkpoint: %s' % self.learner_config.pretrained_checkpoint)
    # We only want the embedding weights of the checkpoint we just restored.
    # So we re-initialize everything that's not an embedding weight. Also,
    # since this episodic finetuning procedure is a different optimization
    # problem than the original training of the baseline whose embedding
    # weights are re-used, we do not reload ADAM's variables and instead learn
    # them from scratch.
    vars_to_reinit, embedding_var_names, vars_to_reinit_names = [], [], []
    for var in tf.global_variables():
      if (any(keyword in var.name for keyword in EMBEDDING_KEYWORDS) and
          'adam' not in var.name.lower()):
        embedding_var_names.append(var.name)
        continue
      vars_to_reinit.append(var)
      vars_to_reinit_names.append(var.name)
    tf.logging.info(
        'Initializing all variables except for %s.' % embedding_var_names)
    self.sess.run(tf.variables_initializer(vars_to_reinit))
    tf.logging.info('Re-initialized vars %s.' % vars_to_reinit_names)
dataset = dataset.ImagenetData(data_location)
preprocessor = image_preprocessing.ImagePreprocessor(
    input_height,
    input_width,
    batch_size,
    1,  # device count
    tf.float32,  # data_type for input fed to the graph
    train=False,  # doing inference
    resize_method='crop')
images, labels = preprocessor.minibatch(dataset, subset='validation')
graph = load_graph(model_file)
input_tensor = graph.get_tensor_by_name(input_layer + ":0")
output_tensor = graph.get_tensor_by_name(output_layer + ":0")

rewrite_options = rewriter_config_pb2.RewriterConfig(
    layout_optimizer=rewriter_config_pb2.RewriterConfig.ON)

config = tf.compat.v1.ConfigProto()
config.inter_op_parallelism_threads = num_inter_threads
config.intra_op_parallelism_threads = num_intra_threads
config.graph_options.rewrite_options.remapping = (
    rewriter_config_pb2.RewriterConfig.OFF)

total_accuracy1, total_accuracy5 = (0.0, 0.0)
num_processed_images = 0
num_remaining_images = dataset.num_examples_per_epoch(subset='validation') \
    - num_processed_images
top1 = 0

with tf.compat.v1.Session(graph=data_graph) as sess:
  # sess_graph = tf.compat.v1.Session(graph=graph, config=config)
accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

with tf.name_scope("init_and_save"):
  init = tf.global_variables_initializer()
  saver = tf.train.Saver()

from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/MNIST_data/data/")

from tensorflow.core.protobuf import rewriter_config_pb2
rewrite_options = rewriter_config_pb2.RewriterConfig(
    disable_model_pruning=True,
    constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
    dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF,
    layout_optimizer=rewriter_config_pb2.RewriterConfig.OFF,
    arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
    min_graph_nodes=-1,
    memory_optimization=rewriter_config_pb2.RewriterConfig.SWAPPING_HEURISTICS)
graph_options = tf.GraphOptions(
    rewrite_options=rewrite_options)  # , infer_shapes=True
config = tf.ConfigProto(
    graph_options=graph_options,
    allow_soft_placement=True,
    log_device_placement=True)
config.gpu_options.allow_growth = True
# run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
# run_metadata = tf.RunMetadata()
# graph = tf.get_default_graph()
def _no_rewrite_session_config(self):
  rewriter_config = rewriter_config_pb2.RewriterConfig(
      pin_to_host_optimization=rewriter_config_pb2.RewriterConfig.OFF)
  graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
  return config_pb2.ConfigProto(graph_options=graph_options)
def _no_rewrite_session_config(self):
  rewriter_config = rewriter_config_pb2.RewriterConfig(
      disable_model_pruning=True)
  graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
  return config_pb2.ConfigProto(graph_options=graph_options)
tf.get_variable_scope().reuse_variables()
correct_prediction = tf.equal(tf.argmax(logit, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(
    tf.cast(correct_prediction, tf.float32), name='accuracy')

# Training algorithm.
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)

run_metadata = tf.RunMetadata()
mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
rewrite_options = rewriter_config_pb2.RewriterConfig(
    memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
graph_options = config_pb2.GraphOptions(rewrite_options=rewrite_options)
graph = tf_optimizer.OptimizeGraph(rewrite_options, mg)
session_config = config_pb2.ConfigProto(graph_options=graph_options)

# Training steps.
with tf.Session(config=session_config) as sess:
  sess.run(tf.global_variables_initializer())
  max_steps = 10
  latency = []
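
# Note (hedged, added for context): RewriterConfig.MANUAL memory optimization
# only swaps tensors that carry an explicit annotation; in the TF1 Grappler
# sources this is the '_swap_to_host' node attribute naming which input(s) to
# spill to host memory. A minimal sketch, assuming a tensor `big_activation`
# already exists in the graph above:
from tensorflow.core.framework import attr_value_pb2

big_activation.op._set_attr(  # pylint: disable=protected-access
    '_swap_to_host', attr_value_pb2.AttrValue(i=0))  # swap input 0 to host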
def main(unused_argv):
  # tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
  #     FLAGS.tpu if (FLAGS.tpu or FLAGS.use_tpu) else '',
  #     zone=FLAGS.tpu_zone,
  #     project=FLAGS.gcp_project)
  if FLAGS.use_async_checkpointing:
    save_checkpoints_steps = None
  else:
    save_checkpoints_steps = max(100, FLAGS.iterations_per_loop)

  NUM_GPUS = len(get_available_gpus())
  distribution = tf.contrib.distribute.MirroredStrategy(num_gpus=NUM_GPUS)
  gpu_options = tf.GPUOptions(allow_growth=True)

  # config = tf.contrib.tpu.RunConfig(
  #     cluster=tpu_cluster_resolver,
  #     model_dir=FLAGS.model_dir,
  #     save_checkpoints_steps=save_checkpoints_steps,
  #     log_step_count_steps=FLAGS.log_step_count_steps,
  #     session_config=tf.ConfigProto(
  #         graph_options=tf.GraphOptions(
  #             rewrite_options=rewriter_config_pb2.RewriterConfig(
  #                 disable_meta_optimizer=True)),
  #         gpu_options=gpu_options),
  #     train_distribute=distribution,
  #     tpu_config=tf.contrib.tpu.TPUConfig(
  #         iterations_per_loop=FLAGS.iterations_per_loop,
  #         per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
  #         .PER_HOST_V2))  # pylint: disable=line-too-long
  config = tf.estimator.RunConfig(
      # cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=save_checkpoints_steps,
      log_step_count_steps=FLAGS.log_step_count_steps,
      session_config=tf.ConfigProto(
          allow_soft_placement=True,
          graph_options=tf.GraphOptions(
              rewrite_options=rewriter_config_pb2.RewriterConfig(
                  disable_meta_optimizer=True)),
          gpu_options=gpu_options),
      train_distribute=distribution,
      # log_step_count_steps=None,
      # save_summary_steps=None,
      # tpu_config=tf.contrib.tpu.TPUConfig(
      #     iterations_per_loop=FLAGS.iterations_per_loop,
      #     per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
      #     .PER_HOST_V2)
  )  # pylint: disable=line-too-long

  # Initializes model parameters.
  # params = dict(steps_per_epoch=FLAGS.num_train_images / FLAGS.train_batch_size)
  # nas_est = tf.contrib.tpu.TPUEstimator(
  #     use_tpu=FLAGS.use_tpu,
  #     model_fn=nas_model_fn,
  #     config=config,
  #     train_batch_size=FLAGS.train_batch_size,
  #     eval_batch_size=FLAGS.eval_batch_size,
  #     export_to_tpu=FLAGS.export_to_tpu,
  #     params=params)
  params = dict(
      steps_per_epoch=FLAGS.num_train_images / FLAGS.train_batch_size,
      batch_size=FLAGS.train_batch_size)
  nas_est = tf.estimator.Estimator(
      model_fn=nas_model_fn, config=config, params=params)

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  if FLAGS.bigtable_instance:
    tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table)
    select_train, select_eval = _select_tables_from_flags()
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetBigtableInput(
            is_training=is_training,
            use_bfloat16=False,
            transpose_input=FLAGS.transpose_input,
            selection=selection)
        for (is_training, selection) in [(True, select_train),
                                         (False, select_eval)]
    ]
  else:
    if FLAGS.data_dir == FAKE_DATA_DIR:
      tf.logging.info('Using fake dataset.')
    else:
      tf.logging.info('Using dataset: %s', FLAGS.data_dir)
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            transpose_input=FLAGS.transpose_input,
            cache=FLAGS.use_cache and is_training,
            image_size=FLAGS.input_image_size,
            num_parallel_calls=FLAGS.num_parallel_calls,
            use_bfloat16=False) for is_training in [True, False]
    ]

  if FLAGS.mode == 'eval':
    eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
    # Run evaluation when there's a new checkpoint.
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = nas_est.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results,
                        elapsed_time)

        # Terminate eval job when final checkpoint is reached.
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= FLAGS.train_steps:
          tf.logging.info(
              'Evaluation finished after training step %d', current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint', ckpt)

    if FLAGS.export_dir:
      export(nas_est, FLAGS.export_dir, FLAGS.post_quantize)
  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator._load_global_step_from_checkpoint_dir(
        FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
    tf.logging.info(
        'Training for %d steps (%.2f epochs in total). Current'
        ' step %d.', FLAGS.train_steps,
        FLAGS.train_steps / params['steps_per_epoch'], current_step)

    start_timestamp = time.time()  # This time will include compilation time

    if FLAGS.mode == 'train':
      hooks = []
      if FLAGS.use_async_checkpointing:
        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(100, FLAGS.iterations_per_loop)))
      nas_est.train(
          input_fn=imagenet_train.input_fn,
          max_steps=FLAGS.train_steps,
          hooks=hooks)
    else:
      assert FLAGS.mode == 'train_and_eval'
      while current_step < FLAGS.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              FLAGS.train_steps)
        nas_est.train(
            input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint
        tf.logging.info(
            'Finished training up to step %d. Elapsed seconds %d.',
            next_checkpoint, int(time.time() - start_timestamp))

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be excluded modulo the batch size. As long as the batch size is
        # consistent, the evaluated images are also consistent.
        tf.logging.info('Starting to evaluate.')
        eval_results = nas_est.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=FLAGS.num_eval_images // FLAGS.eval_batch_size)
        tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                        eval_results)

      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                      FLAGS.train_steps, elapsed_time)

    if FLAGS.export_dir:
      export(nas_est, FLAGS.export_dir, FLAGS.post_quantize)
def main(unused_argv):
  params = hyperparameters.get_hyperparameters(FLAGS.default_hparams_file,
                                               FLAGS.hparams_file, FLAGS,
                                               FLAGS.hparams)
  tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
      FLAGS.tpu if (FLAGS.tpu or params['use_tpu']) else '',
      zone=FLAGS.tpu_zone,
      project=FLAGS.gcp_project)

  if params['use_async_checkpointing']:
    save_checkpoints_steps = None
  else:
    save_checkpoints_steps = max(2500, params['iterations_per_loop'])
  config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=get_model_dir(params),
      save_checkpoints_steps=save_checkpoints_steps,
      keep_checkpoint_max=None,  # Keep all checkpoints.
      log_step_count_steps=FLAGS.log_step_count_steps,
      session_config=tf.ConfigProto(
          graph_options=tf.GraphOptions(
              rewrite_options=rewriter_config_pb2.RewriterConfig(
                  disable_meta_optimizer=True))),
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=params['iterations_per_loop'],
          num_shards=params['num_cores'],
          # copybara:strip_begin
          tpu_job_name=FLAGS.tpu_job_name,
          # copybara:strip_end
          per_host_input_for_training=contrib_tpu.InputPipelineConfig
          .PER_HOST_V2))  # pylint: disable=line-too-long

  resnet_classifier = contrib_tpu.TPUEstimator(
      use_tpu=params['use_tpu'],
      model_fn=resnet_model_fn,
      config=config,
      params=params,
      train_batch_size=params['train_batch_size'],
      eval_batch_size=params['eval_batch_size'],
      export_to_tpu=FLAGS.export_to_tpu)
  # copybara:strip_begin
  if FLAGS.xla_compile:
    resnet_classifier = contrib_tpu.TPUEstimator(
        use_tpu=params['use_tpu'],
        model_fn=xla.estimator_model_fn(resnet_model_fn),
        config=config,
        params=params,
        train_batch_size=params['train_batch_size'],
        eval_batch_size=params['eval_batch_size'],
        export_to_tpu=FLAGS.export_to_tpu)
  # copybara:strip_end

  assert (params['precision'] == 'bfloat16' or
          params['precision'] == 'float32'), (
              'Invalid value for precision parameter; '
              'must be bfloat16 or float32.')
  tf.logging.info('Precision: %s', params['precision'])
  use_bfloat16 = params['precision'] == 'bfloat16'

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  if FLAGS.bigtable_instance:
    tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table)
    select_train, select_eval = _select_tables_from_flags()
    imagenet_train = imagenet_input.ImageNetBigtableInput(
        is_training=True,
        use_bfloat16=use_bfloat16,
        transpose_input=params['transpose_input'],
        selection=select_train)
    imagenet_eval = imagenet_input.ImageNetBigtableInput(
        is_training=False,
        use_bfloat16=use_bfloat16,
        transpose_input=params['transpose_input'],
        selection=select_eval)
  else:
    if FLAGS.data_dir == FAKE_DATA_DIR:
      tf.logging.info('Using fake dataset.')
    else:
      tf.logging.info('Using dataset: %s', FLAGS.data_dir)
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            transpose_input=params['transpose_input'],
            cache=params['use_cache'] and is_training,
            image_size=params['image_size'],
            num_parallel_calls=params['num_parallel_calls'],
            use_bfloat16=use_bfloat16) for is_training in [True, False]
    ]

  steps_per_epoch = params['num_train_images'] // params['train_batch_size']
  eval_steps = params['num_eval_images'] // params['eval_batch_size']

  if FLAGS.mode == 'eval':
    # Run evaluation when there's a new checkpoint.
    for ckpt in evaluation.checkpoints_iterator(
        get_model_dir(params), timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results,
                        elapsed_time)

        # Terminate eval job when final checkpoint is reached.
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= params['train_steps']:
          tf.logging.info(
              'Evaluation finished after training step %d', current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint', ckpt)
  elif FLAGS.mode == 'eval_igt':
    # IGT evaluation mode. Evaluate metrics for the desired parameters
    # (true or shifted) on the desired dataset (train or eval). Note that
    # train is still with data augmentation.

    # Get checkpoint file names.
    index_files = tf.gfile.Glob(
        os.path.join(get_model_dir(params), 'model.ckpt-*.index'))
    checkpoints = [fn[:-len('.index')] for fn in index_files]
    # Need to sort them to get proper tensorboard plotting (increasing event
    # timestamps correspond to increasing steps).
    checkpoint_steps = []
    for ckpt in checkpoints:
      tf.logging.info(ckpt)
      step_match = re.match(r'.*model.ckpt-([0-9]*)', ckpt)
      checkpoint_steps.append(int(step_match.group(1)))
    checkpoints = [
        ckpt for _, ckpt in sorted(zip(checkpoint_steps, checkpoints))
    ]
    tf.logging.info('There are {} checkpoints'.format(len(checkpoints)))
    tf.logging.info(', '.join(checkpoints))

    # Keep track of the last processed checkpoint (fault tolerance).
    analysis_state_path = os.path.join(
        get_model_dir(params),
        'analysis_state_' + FLAGS.igt_eval_set + '_' + FLAGS.igt_eval_mode)
    next_analysis_index = 0
    if tf.gfile.Exists(analysis_state_path):
      with tf.gfile.Open(analysis_state_path) as fd:
        next_analysis_index = int(fd.read())

    # Process each checkpoint.
while next_analysis_index < len(checkpoints): tf.logging.info( 'Next analysis index: {}'.format(next_analysis_index)) ckpt_path = checkpoints[next_analysis_index] tf.logging.info('Starting to evaluate: {}.'.format(ckpt_path)) start_timestamp = time.time( ) # This time will include compilation time if FLAGS.igt_eval_set == 'train': the_input_fn = imagenet_train.input_fn the_steps = steps_per_epoch elif FLAGS.igt_eval_set == 'eval': the_input_fn = imagenet_eval.input_fn the_steps = eval_steps else: raise ValueError('Unsupported igt_eval_set') eval_results = resnet_classifier.evaluate( input_fn=the_input_fn, steps=the_steps, checkpoint_path=ckpt_path, name=FLAGS.igt_eval_set + '_' + FLAGS.igt_eval_mode) elapsed_time = int(time.time() - start_timestamp) tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results, elapsed_time) next_analysis_index += 1 file_io.atomic_write_string_to_file(analysis_state_path, str(next_analysis_index)) else: # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval' current_step = estimator._load_global_step_from_checkpoint_dir( get_model_dir(params)) # pylint:disable=protected-access,g-line-too-long steps_per_epoch = params['num_train_images'] // params[ 'train_batch_size'] tf.logging.info( 'Training for %d steps (%.2f epochs in total). Current' ' step %d.', params['train_steps'], params['train_steps'] / steps_per_epoch, current_step) start_timestamp = time.time( ) # This time will include compilation time if FLAGS.mode == 'train': hooks = [] if params['use_async_checkpointing']: hooks.append( async_checkpoint.AsyncCheckpointSaverHook( checkpoint_dir=get_model_dir(params), save_steps=max(2500, params['iterations_per_loop']))) resnet_classifier.train(input_fn=imagenet_train.input_fn, max_steps=params['train_steps'], hooks=hooks) else: assert FLAGS.mode == 'train_and_eval' while current_step < params['train_steps']: # Train for up to steps_per_eval number of steps. # At the end of training, a checkpoint will be written to --model_dir. next_checkpoint = min(current_step + FLAGS.steps_per_eval, params['train_steps']) resnet_classifier.train(input_fn=imagenet_train.input_fn, max_steps=next_checkpoint) current_step = next_checkpoint tf.logging.info( 'Finished training up to step %d. Elapsed seconds %d.', next_checkpoint, int(time.time() - start_timestamp)) # Evaluate the model on the most recent model in --model_dir. # Since evaluation happens in batches of --eval_batch_size, some images # may be excluded modulo the batch size. As long as the batch size is # consistent, the evaluated images are also consistent. tf.logging.info('Starting to evaluate.') eval_results = resnet_classifier.evaluate( input_fn=imagenet_eval.input_fn, steps=params['num_eval_images'] // params['eval_batch_size']) tf.logging.info('Eval results at step %d: %s', next_checkpoint, eval_results) elapsed_time = int(time.time() - start_timestamp) tf.logging.info( 'Finished training up to step %d. Elapsed seconds %d.', params['train_steps'], elapsed_time) if FLAGS.export_dir is not None: # The guide to serve a exported TensorFlow model is at: # https://www.tensorflow.org/serving/serving_basic tf.logging.info('Starting to export model.') unused_export_path = resnet_classifier.export_saved_model( export_dir_base=FLAGS.export_dir, serving_input_receiver_fn=imagenet_input.image_serving_input_fn )
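# Hedged sketch (added for illustration, not in the original file): the
# eval_igt branch above persists a resume index to a state file so a
# preempted job can pick up where it left off. The same fault-tolerance
# pattern in isolation, using only APIs the file already relies on
# (tf.gfile and file_io); the helper name is hypothetical.
def _process_with_resume(items, state_path, process_fn):
  next_index = 0
  if tf.gfile.Exists(state_path):
    with tf.gfile.Open(state_path) as fd:
      next_index = int(fd.read())
  while next_index < len(items):
    process_fn(items[next_index])
    # Persist progress only after the item has been fully processed.
    next_index += 1
    file_io.atomic_write_string_to_file(state_path, str(next_index))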
def no_rewrite_session_config():
  rewriter_config = rewriter_config_pb2.RewriterConfig(
      disable_model_pruning=True,
      constant_folding=rewriter_config_pb2.RewriterConfig.OFF)
  graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_config)
  return config_pb2.ConfigProto(graph_options=graph_options)
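# Hedged usage sketch (illustrative, not part of the original code; assumes
# TF 1.x with `tf` imported): run a trivially foldable graph under the
# no-rewrite config so Grappler leaves the constant subgraph intact.
def _demo_no_rewrite_session():
  with tf.Graph().as_default():
    product = tf.constant(2.0) * tf.constant(3.0)  # normally constant-folded
    with tf.Session(config=no_rewrite_session_config()) as sess:
      assert sess.run(product) == 6.0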
def main():
  np.random.seed(0)
  tf.set_random_seed(0)

  dtype = np.float32
  # 64-bit doesn't help much, search for 64-bit in
  # https://www.wolframcloud.com/objects/5f297f41-30f7-4b1b-972c-cac8d1f8d8e4
  u.default_dtype = dtype
  machine_epsilon = np.finfo(dtype).eps  # 1e-7 or 1e-16

  train_images = load_MNIST.load_MNIST_images('data/train-images-idx3-ubyte')
  dsize = 10000
  patches = train_images[:, :dsize]
  fs = [dsize, 28 * 28, 196, 28 * 28]

  # values from deeplearning.stanford.edu/wiki/index.php/UFLDL_Tutorial
  X0 = patches
  lambda_ = 3e-3
  rho = tf.constant(0.1, dtype=dtype)
  beta = 3
  W0f = W_uniform(fs[2], fs[3])

  def f(i):
    return fs[i + 1]  # W[i] has shape f[i] x f[i-1]

  dsize = f(-1)
  n = len(fs) - 2

  # helper to create variables with numpy or TF initial value
  init_dict = {}  # {var_placeholder: init_value}
  vard = {}  # {var: util.VarInfo}

  def init_var(val, name, trainable=False, noinit=False):
    if isinstance(val, tf.Tensor):
      collections = [] if noinit else None
      var = tf.Variable(val, name=name, collections=collections)
    else:
      val = np.array(val)
      assert u.is_numeric(val), "Unknown type"
      holder = tf.placeholder(dtype, shape=val.shape, name=name + "_holder")
      var = tf.Variable(holder, name=name, trainable=trainable)
      init_dict[holder] = val
    var_p = tf.placeholder(var.dtype, var.shape)
    var_setter = var.assign(var_p)
    vard[var] = u.VarInfo(var_setter, var_p)
    return var

  lr = init_var(0.2, "lr")
  if purely_linear:  # need lower LR without sigmoids
    lr = init_var(.02, "lr")

  Wf = init_var(W0f, "Wf", True)
  Wf_copy = init_var(W0f, "Wf_copy", True)
  W = u.unflatten(Wf, fs[1:])  # perftodo: this creates transposes
  X = init_var(X0, "X")
  W.insert(0, X)

  def sigmoid(x):
    if not purely_linear:
      return tf.sigmoid(x)
    else:
      return tf.identity(x)

  def d_sigmoid(y):
    if not purely_linear:
      return y * (1 - y)
    else:
      return 1

  def kl(x, y):
    return x * tf.log(x / y) + (1 - x) * tf.log((1 - x) / (1 - y))

  def d_kl(x, y):
    return (1 - x) / (1 - y) - x / y

  # A[i] = activations needed to compute gradient of W[i]
  # A[n+1] = network output
  A = [None] * (n + 2)

  # A[0] is just for shape checks, assert fail on run
  # tf.assert always fails because of static assert
  # fail_node = tf.assert_equal(1, 0, message="too huge")
  fail_node = tf.Print(0, [0], "fail, this must never run")
  with tf.control_dependencies([fail_node]):
    A[0] = u.Identity(dsize, dtype=dtype)
  A[1] = W[0]
  for i in range(1, n + 1):
    A[i + 1] = sigmoid(W[i] @ A[i])

  # reconstruction error and sparsity error
  err = (A[3] - A[1])
  rho_hat = tf.reduce_sum(A[2], axis=1, keep_dims=True) / dsize

  # B[i] = backprops needed to compute gradient of W[i]
  # B2[i] = backprops from sampled labels needed for natural gradient
  B = [None] * (n + 1)
  B2 = [None] * (n + 1)
  B[n] = err * d_sigmoid(A[n + 1])
  sampled_labels_live = tf.random_normal((f(n), f(-1)), dtype=dtype, seed=0)
  sampled_labels = init_var(sampled_labels_live, "sampled_labels", noinit=True)
  B2[n] = sampled_labels * d_sigmoid(A[n + 1])
  for i in range(n - 1, -1, -1):
    backprop = t(W[i + 1]) @ B[i + 1]
    backprop2 = t(W[i + 1]) @ B2[i + 1]
    if i == 1 and not drop_sparsity:
      backprop += beta * d_kl(rho, rho_hat)
      backprop2 += beta * d_kl(rho, rho_hat)
    B[i] = backprop * d_sigmoid(A[i + 1])
    B2[i] = backprop2 * d_sigmoid(A[i + 1])

  # dW[i] = gradient of W[i]
  dW = [None] * (n + 1)
  pre_dW = [None] * (n + 1)  # preconditioned dW
  pre_dW_stable = [None] * (n + 1)  # preconditioned stable dW

  cov_A = [None] * (n + 1)  # covariance of activations[i]
  cov_B2 = [None] * (n + 1)  # covariance of synthetic backprops[i]
  vars_svd_A = [None] * (n + 1)
  vars_svd_B2 = [None] * (n + 1)
  for i in range(1, n + 1):
    cov_A[i] = init_var(A[i] @ t(A[i]) / dsize, "cov_A%d" % (i,))
    cov_B2[i] = init_var(B2[i] @ t(B2[i]) / dsize, "cov_B2%d" % (i,))
    vars_svd_A[i] = u.SvdWrapper(cov_A[i], "svd_A_%d" % (i,))
    vars_svd_B2[i] = u.SvdWrapper(cov_B2[i], "svd_B2_%d" % (i,))
    if use_tikhonov:
      whitened_A = u.regularized_inverse2(vars_svd_A[i], L=Lambda) @ A[i]
    else:
      whitened_A = u.pseudo_inverse2(vars_svd_A[i]) @ A[i]
    if use_tikhonov:
      whitened_B2 = u.regularized_inverse2(vars_svd_B2[i], L=Lambda) @ B[i]
    else:
      whitened_B2 = u.pseudo_inverse2(vars_svd_B2[i]) @ B[i]
    whitened_A_stable = u.pseudo_inverse_sqrt2(vars_svd_A[i]) @ A[i]
    whitened_B2_stable = u.pseudo_inverse_sqrt2(vars_svd_B2[i]) @ B[i]
    pre_dW[i] = (whitened_B2 @ t(whitened_A)) / dsize
    pre_dW_stable[i] = (whitened_B2_stable @ t(whitened_A_stable)) / dsize
    dW[i] = (B[i] @ t(A[i])) / dsize

  # Loss function
  reconstruction = u.L2(err) / (2 * dsize)
  sparsity = beta * tf.reduce_sum(kl(rho, rho_hat))
  L2 = (lambda_ / 2) * (u.L2(W[1]) + u.L2(W[2]))

  loss = reconstruction
  if not drop_l2:
    loss = loss + L2
  if not drop_sparsity:
    loss = loss + sparsity

  grad_live = u.flatten(dW[1:])
  pre_grad_live = u.flatten(pre_dW[1:])  # fisher preconditioned gradient
  pre_grad_stable_live = u.flatten(
      pre_dW_stable[1:])  # sqrt fisher preconditioned grad
  grad = init_var(grad_live, "grad")
  pre_grad = init_var(pre_grad_live, "pre_grad")
  pre_grad_stable = init_var(pre_grad_stable_live, "pre_grad_stable")

  update_params_op = Wf.assign(Wf - lr * pre_grad).op
  update_params_stable_op = Wf.assign(Wf - lr * pre_grad_stable).op
  save_params_op = Wf_copy.assign(Wf).op
  pre_grad_dot_grad = tf.reduce_sum(pre_grad * grad)
  pre_grad_stable_dot_grad = tf.reduce_sum(pre_grad_stable * grad)
  grad_norm = tf.reduce_sum(grad * grad)
  pre_grad_norm = u.L2(pre_grad)
  pre_grad_stable_norm = u.L2(pre_grad_stable)

  def dump_svd_info(step):
    """Dump singular values and gradient values in those coordinates."""
    for i in range(1, n + 1):
      svd = vars_svd_A[i]
      s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
      util.dump(s0, "A_%d_%d" % (i, step))
      A0 = A[i].eval()
      At0 = v0.T @ A0
      util.dump(A0 @ A0.T, "Acov_%d_%d" % (i, step))
      util.dump(At0 @ At0.T, "Atcov_%d_%d" % (i, step))
      util.dump(s0, "As_%d_%d" % (i, step))

    for i in range(1, n + 1):
      svd = vars_svd_B2[i]
      s0, u0, v0 = sess.run([svd.s, svd.u, svd.v])
      util.dump(s0, "B2_%d_%d" % (i, step))
      B0 = B[i].eval()
      Bt0 = v0.T @ B0
      util.dump(B0 @ B0.T, "Bcov_%d_%d" % (i, step))
      util.dump(Bt0 @ Bt0.T, "Btcov_%d_%d" % (i, step))
      util.dump(s0, "Bs_%d_%d" % (i, step))

  def advance_batch():
    sess.run(sampled_labels.initializer)  # new labels for next call

  def update_covariances():
    ops_A = [cov_A[i].initializer for i in range(1, n + 1)]
    ops_B2 = [cov_B2[i].initializer for i in range(1, n + 1)]
    sess.run(ops_A + ops_B2)

  def update_svds():
    if whitening_mode > 1:
      vars_svd_A[2].update()
    if whitening_mode > 2:
      vars_svd_B2[2].update()
    if whitening_mode > 3:
      vars_svd_B2[1].update()

  def init_svds():
    """Initialize our SVD to identity matrices."""
    ops = []
    for i in range(1, n + 1):
      ops.extend(vars_svd_A[i].init_ops)
      ops.extend(vars_svd_B2[i].init_ops)
    sess = tf.get_default_session()
    sess.run(ops)

  init_op = tf.global_variables_initializer()
  # tf.get_default_graph().finalize()

  from tensorflow.core.protobuf import rewriter_config_pb2
  rewrite_options = rewriter_config_pb2.RewriterConfig(
      disable_model_pruning=True,
      constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
      memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
  optimizer_options = tf.OptimizerOptions(opt_level=tf.OptimizerOptions.L0)
  graph_options = tf.GraphOptions(
      optimizer_options=optimizer_options, rewrite_options=rewrite_options)
  config = tf.ConfigProto(graph_options=graph_options)
  # sess = tf.Session(config=config)
  sess = tf.InteractiveSession(config=config)
  sess.run(Wf.initializer, feed_dict=init_dict)
  sess.run(X.initializer, feed_dict=init_dict)
  advance_batch()
  update_covariances()
  init_svds()
  sess.run(init_op, feed_dict=init_dict)  # initialize everything else

  print("Running training.")
  u.reset_time()

  step_lengths = []  # keep track of learning rates
  losses = []
  ratios = []  # actual loss decrease / expected decrease
  grad_norms = []
  pre_grad_norms = []  # preconditioned grad norm squared
  pre_grad_stable_norms = []  # sqrt preconditioned grad norms squared
  target_delta_list = []  # predicted decrease, linear approximation
  target_delta2_list = []  # predicted decrease, quadratic approximation
  actual_delta_list = []  # actual decrease

  # adaptive line search parameters
  alpha = 0.3  # acceptable fraction of predicted decrease
  beta = 0.8  # how much to shrink on violation
  growth_rate = 1.05  # how much to grow when too conservative

  def update_cov_A(i):
    sess.run(cov_A[i].initializer)

  def update_cov_B2(i):
    sess.run(cov_B2[i].initializer)

  # only update whitening matrix of input activations in the beginning
  if whitening_mode > 0:
    vars_svd_A[1].update()

  # compute t(delta).H.delta/2
  def hessian_quadratic(delta):
    # update_covariances()
    W = u.unflatten(delta, fs[1:])
    W.insert(0, None)
    total = 0
    for l in range(1, n + 1):
      decrement = tf.trace(t(W[l]) @ cov_B2[l] @ W[l] @ cov_A[l])
      total += decrement
    return (total / 2).eval()

  # compute t(delta).H^-1.delta/2
  def hessian_quadratic_inv(delta):
    # update_covariances()
    W = u.unflatten(delta, fs[1:])
    W.insert(0, None)
    total = 0
    for l in range(1, n + 1):
      invB2 = u.pseudo_inverse2(vars_svd_B2[l])
      invA = u.pseudo_inverse2(vars_svd_A[l])
      decrement = tf.trace(t(W[l]) @ invB2 @ W[l] @ invA)
      total += decrement
    return (total / 2).eval()

  # do line search, dump values as csv
  def line_search(initial_value, direction, step, num_steps):
    saved_val = tf.Variable(Wf)
    sess.run(saved_val.initializer)
    pl = tf.placeholder(dtype, shape=(), name="linesearch_p")
    assign_op = Wf.assign(initial_value - direction * step * pl)
    vals = []
    for i in range(num_steps):
      sess.run(assign_op, feed_dict={pl: i})
      vals.append(loss.eval())
    sess.run(Wf.assign(saved_val))  # restore original value
    return vals

  for step in range(num_steps):
    update_covariances()
    if step % whiten_every_n_steps == 0:
      update_svds()

    sess.run(grad.initializer)
    sess.run(pre_grad.initializer)

    lr0, loss0 = sess.run([lr, loss])
    save_params_op.run()

    # regular inverse becomes unstable when grad norm exceeds 1
    stabilized_mode = grad_norm.eval() < 1
    if stabilized_mode and not use_tikhonov:
      update_params_stable_op.run()
    else:
      update_params_op.run()

    loss1 = loss.eval()
    advance_batch()

    # line search stuff
    target_slope = (-pre_grad_dot_grad.eval()
                    if stabilized_mode else -pre_grad_stable_dot_grad.eval())
    target_delta = lr0 * target_slope
    target_delta_list.append(target_delta)

    # second order prediction of target delta
    # TODO: the sign is wrong, debug this
    # https://www.wolframcloud.com/objects/8f287f2f-ceb7-42f7-a599-1c03fda18f28
    if local_quadratics:
      x0 = Wf_copy.eval()
      x_opt = x0 - pre_grad.eval()
      # computes t(x)@H^-1 @(x)/2
      y_opt = loss0 - hessian_quadratic_inv(grad)
      # computes t(x)@H @(x)/2
      y_expected = hessian_quadratic(Wf - x_opt) + y_opt
      target_delta2 = y_expected - loss0
      target_delta2_list.append(target_delta2)

    actual_delta = loss1 - loss0
    actual_slope = actual_delta / lr0
    slope_ratio = actual_slope / target_slope  # between 0 and 1.01
    actual_delta_list.append(actual_delta)

    if do_line_search:
      vals1 = line_search(Wf_copy, pre_grad, lr / 100, 40)
      vals2 = line_search(Wf_copy, grad, lr / 100, 40)
      u.dump(vals1, "line1-%d" % (step,))
      u.dump(vals2, "line2-%d" % (step,))

    losses.append(loss0)
    step_lengths.append(lr0)
    ratios.append(slope_ratio)
    grad_norms.append(grad_norm.eval())
    pre_grad_norms.append(pre_grad_norm.eval())
    pre_grad_stable_norms.append(pre_grad_stable_norm.eval())

    if step % report_frequency == 0:
      print("Step %d loss %.2f, target decrease %.3f, actual decrease %.3f, "
            "ratio %.2f grad norm: %.2f pregrad norm: %.2f" %
            (step, loss0, target_delta, actual_delta, slope_ratio,
             grad_norm.eval(), pre_grad_norm.eval()))

    if (adaptive_step_frequency and adaptive_step and
        step > adaptive_step_burn_in):
      # shrink if wrong prediction, don't shrink if prediction is tiny
      if slope_ratio < alpha and abs(target_delta) > 1e-6 and adaptive_step:
        print("%.2f %.2f %.2f" % (loss0, loss1, slope_ratio))
        print("Slope optimality %.2f, shrinking learning rate to %.2f" %
              (slope_ratio, lr0 * beta))
        sess.run(vard[lr].setter, feed_dict={vard[lr].p: lr0 * beta})
      # grow learning rate, slope_ratio .99 worked best for gradient
      elif (step > 0 and step % 50 == 0 and slope_ratio > 0.90 and
            adaptive_step):
        print("%.2f %.2f %.2f" % (loss0, loss1, slope_ratio))
        print("Growing learning rate to %.2f" % (lr0 * growth_rate))
        sess.run(vard[lr].setter, feed_dict={vard[lr].p: lr0 * growth_rate})

    u.record_time()

  # check against expected loss
  if 'Apple' in sys.version:
    pass  # u.dump(losses, "kfac_small_final_mac.csv")
    targets = np.loadtxt("data/kfac_small_final_mac.csv", delimiter=",")
  else:
    pass  # u.dump(losses, "kfac_small_final_linux.csv")
    targets = np.loadtxt("data/kfac_small_final_linux.csv", delimiter=",")
  u.check_equal(targets, losses[:len(targets)], rtol=1e-1)
  u.summarize_time()
  print("Test passed")
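# Hedged sanity check (added for illustration, not in the original file):
# d_kl above is the derivative of the KL sparsity penalty with respect to
# its second argument,
#   kl(x, y) = x*log(x/y) + (1-x)*log((1-x)/(1-y))
#   d/dy kl(x, y) = (1-x)/(1-y) - x/y
# verified here with a numpy central difference; all names are illustrative.
def _check_d_kl(x=0.1, y=0.3, eps=1e-6):
  kl_np = lambda a, b: a * np.log(a / b) + (1 - a) * np.log((1 - a) / (1 - b))
  d_kl_np = lambda a, b: (1 - a) / (1 - b) - a / b
  numeric = (kl_np(x, y + eps) - kl_np(x, y - eps)) / (2 * eps)
  assert abs(numeric - d_kl_np(x, y)) < 1e-4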
def main(unused_argv):
  params = params_dict.ParamsDict(resnet_config.RESNET_CFG,
                                  resnet_config.RESNET_RESTRICTIONS)
  params = params_dict.override_params_dict(
      params, FLAGS.config_file, is_strict=True)
  params = params_dict.override_params_dict(
      params, FLAGS.params_override, is_strict=True)
  params = flags_to_params.override_params_from_input_flags(params, FLAGS)

  params.validate()
  params.lock()

  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu if (FLAGS.tpu or params.use_tpu) else '',
      zone=FLAGS.tpu_zone,
      project=FLAGS.gcp_project)

  if params.use_async_checkpointing:
    save_checkpoints_steps = None
  else:
    save_checkpoints_steps = max(5000, params.iterations_per_loop)
  config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=save_checkpoints_steps,
      log_step_count_steps=FLAGS.log_step_count_steps,
      keep_checkpoint_max=1000,
      session_config=tf.ConfigProto(
          graph_options=tf.GraphOptions(
              rewrite_options=rewriter_config_pb2.RewriterConfig(
                  disable_meta_optimizer=True))),
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=params.iterations_per_loop,
          num_shards=params.num_cores,
          per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
          .PER_HOST_V2))  # pylint: disable=line-too-long

  resnet_classifier = tf.contrib.tpu.TPUEstimator(
      use_tpu=params.use_tpu,
      model_fn=resnet_model_fn,
      config=config,
      params=params.as_dict(),
      train_batch_size=params.train_batch_size,
      eval_batch_size=params.eval_batch_size,
      export_to_tpu=FLAGS.export_to_tpu)

  assert (params.precision == 'bfloat16' or
          params.precision == 'float32'), (
              'Invalid value for precision parameter; '
              'must be bfloat16 or float32.')
  tf.logging.info('Precision: %s', params.precision)
  use_bfloat16 = params.precision == 'bfloat16'

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  if FLAGS.bigtable_instance:
    tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table)
    select_train, select_eval = _select_tables_from_flags()
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetBigtableInput(
            is_training=is_training,
            use_bfloat16=use_bfloat16,
            transpose_input=params.transpose_input,
            selection=selection)
        for (is_training, selection) in [(True, select_train),
                                         (False, select_eval)]
    ]
  else:
    if FLAGS.data_dir == FAKE_DATA_DIR:
      tf.logging.info('Using fake dataset.')
    else:
      tf.logging.info('Using dataset: %s', FLAGS.data_dir)
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            transpose_input=params.transpose_input,
            cache=params.use_cache and is_training,
            image_size=params.image_size,
            num_parallel_calls=params.num_parallel_calls,
            include_background_label=(params.num_label_classes == 1001),
            use_bfloat16=use_bfloat16) for is_training in [True, False]
    ]

  steps_per_epoch = params.num_train_images // params.train_batch_size
  eval_steps = params.num_eval_images // params.eval_batch_size

  if FLAGS.mode == 'eval':
    # Run evaluation when there's a new checkpoint.
    for ckpt in evaluation.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results,
                        elapsed_time)

        # Terminate eval job when final checkpoint is reached.
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= params.train_steps:
          tf.logging.info('Evaluation finished after training step %d',
                          current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                        ckpt)
  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    current_step = estimator._load_global_step_from_checkpoint_dir(
        FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
    steps_per_epoch = params.num_train_images // params.train_batch_size
    tf.logging.info(
        'Training for %d steps (%.2f epochs in total). Current'
        ' step %d.', params.train_steps,
        params.train_steps / steps_per_epoch, current_step)

    start_timestamp = time.time()  # This time will include compilation time

    if FLAGS.mode == 'train':
      hooks = []
      if params.use_async_checkpointing:
        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(5000, params.iterations_per_loop)))
      if FLAGS.profile_every_n_steps > 0:
        hooks.append(
            tpu_profiler_hook.TPUProfilerHook(
                save_steps=FLAGS.profile_every_n_steps,
                output_dir=FLAGS.model_dir,
                tpu=FLAGS.tpu))
      resnet_classifier.train(
          input_fn=imagenet_train.input_fn,
          max_steps=params.train_steps,
          hooks=hooks)
    else:
      assert FLAGS.mode == 'train_and_eval'
      while current_step < params.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              params.train_steps)
        resnet_classifier.train(
            input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint
        tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                        next_checkpoint, int(time.time() - start_timestamp))

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be excluded modulo the batch size. As long as the batch size is
        # consistent, the evaluated images are also consistent.
        tf.logging.info('Starting to evaluate.')
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=params.num_eval_images // params.eval_batch_size)
        tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                        eval_results)

      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                      params.train_steps, elapsed_time)

    if FLAGS.export_dir is not None:
      # The guide to serve an exported TensorFlow model is at:
      # https://www.tensorflow.org/serving/serving_basic
      tf.logging.info('Starting to export model.')
      export_path = resnet_classifier.export_saved_model(
          export_dir_base=FLAGS.export_dir,
          serving_input_receiver_fn=imagenet_input.image_serving_input_fn)
      if FLAGS.add_warmup_requests:
        inference_warmup.write_warmup_requests(
            export_path,
            FLAGS.model_name,
            params.image_size,
            batch_sizes=FLAGS.inference_batch_sizes,
            image_format='JPEG')
def _GetMemoryOptimizerSessionConfig(self):
  rewrite_options = rewriter_config_pb2.RewriterConfig(
      disable_model_pruning=True,
      memory_optimization=rewriter_config_pb2.RewriterConfig.HEURISTICS)
  graph_options = config_pb2.GraphOptions(rewrite_options=rewrite_options)
  return config_pb2.ConfigProto(graph_options=graph_options)
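# Hedged usage sketch (a hypothetical test method, assuming the
# unittest-style class the helper above belongs to): the returned
# ConfigProto can be inspected directly, or passed to a session, to confirm
# which rewriter options were set.
def testMemoryOptimizerSessionConfig(self):
  config = self._GetMemoryOptimizerSessionConfig()
  rewrite_options = config.graph_options.rewrite_options
  self.assertTrue(rewrite_options.disable_model_pruning)
  self.assertEqual(rewrite_options.memory_optimization,
                   rewriter_config_pb2.RewriterConfig.HEURISTICS)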
def test_unifiedRNN_with_cond(self):
  # This test demonstrates the grappler plugin's graph rewrite when the
  # branch functions return different numbers of internal states.
  rewrites = rewriter_config_pb2.RewriterConfig()
  rewrites.function_optimization = rewriter_config_pb2.RewriterConfig.OFF
  custom_optimizer = rewrites.custom_optimizers.add()
  custom_optimizer.name = 'ExperimentalImplementationSelector'
  rewrites.min_graph_nodes = -1
  graph_options = config_pb2.GraphOptions(rewrite_options=rewrites)
  config = config_pb2.ConfigProto(graph_options=graph_options)

  input_shape = 10
  rnn_state_size = 8
  output_shape = 8
  timestep = 4
  batch = 100
  epoch = 1

  with ops.Graph().as_default(), session.Session(config=config) as sess:
    (x_train, y_train), _ = testing_utils.get_test_data(
        train_samples=batch,
        test_samples=0,
        input_shape=(timestep, input_shape),
        num_classes=output_shape)
    y_train = keras.utils.to_categorical(y_train)

    layer = UnifiedLSTM(rnn_state_size)

    inputs = array_ops.placeholder(
        dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
    predict = array_ops.placeholder(
        dtypes.float32, shape=(None, output_shape), name='predict')

    zeros = array_ops.zeros([batch, output_shape])
    dummy_runtime = constant_op.constant(
        'unknown', dtype=dtypes.string, name='runtime')
    a = constant_op.constant(0)
    b = constant_op.constant(1)
    # Will always run the lstm layer.
    outputs, runtime = control_flow_ops.cond(
        gen_math_ops.less(a, b), lambda: layer(inputs),
        lambda: (zeros, dummy_runtime))
    loss = losses.softmax_cross_entropy(predict, outputs)
    optimizer = gradient_descent.GradientDescentOptimizer(0.001)
    train_op = optimizer.minimize(loss)

    sess.run([variables.global_variables_initializer()])
    existing_loss = 0
    for _ in range(epoch):
      loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
          inputs: x_train,
          predict: y_train
      })
      if test.is_gpu_available():
        self.assertEqual(runtime_value, b'cudnn')
      else:
        self.assertEqual(runtime_value, b'cpu')
      # Make sure the loss is updated for every epoch
      # (layer weights properly updated).
      self.assertNotEqual(existing_loss, loss_value)
      existing_loss = loss_value
def main(unused_argv):
  params = params_dict.ParamsDict(mnasnet_config.MNASNET_CFG,
                                  mnasnet_config.MNASNET_RESTRICTIONS)
  params = params_dict.override_params_dict(
      params, FLAGS.config_file, is_strict=True)
  params = params_dict.override_params_dict(
      params, FLAGS.params_override, is_strict=True)
  params = flags_to_params.override_params_from_input_flags(params, FLAGS)

  additional_params = {
      'steps_per_epoch': params.num_train_images / params.train_batch_size,
      'quantized_training': FLAGS.quantized_training,
  }
  params = params_dict.override_params_dict(
      params, additional_params, is_strict=False)

  params.validate()
  params.lock()

  if FLAGS.tpu or params.use_tpu:
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
  else:
    tpu_cluster_resolver = None

  if params.use_async_checkpointing:
    save_checkpoints_steps = None
  else:
    save_checkpoints_steps = max(100, params.iterations_per_loop)
  config = tf.estimator.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=save_checkpoints_steps,
      log_step_count_steps=FLAGS.log_step_count_steps,
      session_config=tf.ConfigProto(
          graph_options=tf.GraphOptions(
              rewrite_options=rewriter_config_pb2.RewriterConfig(
                  disable_meta_optimizer=True))),
      tpu_config=tf.estimator.tpu.TPUConfig(
          iterations_per_loop=params.iterations_per_loop,
          per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig
          .PER_HOST_V2))  # pylint: disable=line-too-long

  # Validates Flags.
  if params.precision == 'bfloat16' and params.use_keras:
    raise ValueError(
        'Keras layers do not fully support bfloat16 activation training.'
        ' You have set precision as %s and use_keras as %s' %
        (params.precision, params.use_keras))

  # Initializes model parameters.
  mnasnet_est = tf.estimator.tpu.TPUEstimator(
      use_tpu=params.use_tpu,
      model_fn=build_model_fn,
      config=config,
      train_batch_size=params.train_batch_size,
      eval_batch_size=params.eval_batch_size,
      export_to_tpu=FLAGS.export_to_tpu,
      params=params.as_dict())

  if FLAGS.mode == 'export_only':
    export(mnasnet_est, FLAGS.export_dir, params, FLAGS.post_quantize)
    return

  # Input pipelines are slightly different (with regards to shuffling and
  # preprocessing) between training and evaluation.
  if FLAGS.bigtable_instance:
    tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table)
    select_train, select_eval = _select_tables_from_flags()
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetBigtableInput(
            is_training=is_training,
            use_bfloat16=False,
            transpose_input=params.transpose_input,
            selection=selection)
        for (is_training, selection) in [(True, select_train),
                                         (False, select_eval)]
    ]
  else:
    if FLAGS.data_dir == FAKE_DATA_DIR:
      tf.logging.info('Using fake dataset.')
    else:
      tf.logging.info('Using dataset: %s', FLAGS.data_dir)
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            transpose_input=params.transpose_input,
            cache=params.use_cache and is_training,
            image_size=params.input_image_size,
            num_parallel_calls=params.num_parallel_calls,
            use_bfloat16=(params.precision == 'bfloat16'))
        for is_training in [True, False]
    ]

  if FLAGS.mode == 'eval':
    eval_steps = params.num_eval_images // params.eval_batch_size
    # Run evaluation when there's a new checkpoint.
    for ckpt in tf.train.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = mnasnet_est.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results,
                        elapsed_time)
        mnas_utils.archive_ckpt(eval_results, eval_results['top_1_accuracy'],
                                ckpt)

        # Terminate eval job when final checkpoint is reached.
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= params.train_steps:
          tf.logging.info('Evaluation finished after training step %d',
                          current_step)
          break
      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long
        # after the CPU job tells it to start evaluating. In this case, the
        # checkpoint file could have been deleted already.
        tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                        ckpt)

    if FLAGS.export_dir:
      export(mnasnet_est, FLAGS.export_dir, params, FLAGS.post_quantize)
  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    try:
      current_step = tf.train.load_variable(FLAGS.model_dir,
                                            tf.GraphKeys.GLOBAL_STEP)
    except (TypeError, ValueError, tf.errors.NotFoundError):
      current_step = 0

    tf.logging.info(
        'Training for %d steps (%.2f epochs in total). Current'
        ' step %d.', params.train_steps,
        params.train_steps / params.steps_per_epoch, current_step)

    start_timestamp = time.time()  # This time will include compilation time

    if FLAGS.mode == 'train':
      hooks = []
      if params.use_async_checkpointing:
        try:
          from tensorflow.contrib.tpu.python.tpu import async_checkpoint  # pylint: disable=g-import-not-at-top
        except ImportError as e:
          logging.exception(
              'Async checkpointing is not supported in TensorFlow 2.x')
          raise e
        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(100, params.iterations_per_loop)))
      mnasnet_est.train(
          input_fn=imagenet_train.input_fn,
          max_steps=params.train_steps,
          hooks=hooks)
    else:
      assert FLAGS.mode == 'train_and_eval'
      while current_step < params.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              params.train_steps)
        mnasnet_est.train(
            input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint
        tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                        next_checkpoint, int(time.time() - start_timestamp))

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be excluded modulo the batch size. As long as the batch size is
        # consistent, the evaluated images are also consistent.
        tf.logging.info('Starting to evaluate.')
        eval_results = mnasnet_est.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=params.num_eval_images // params.eval_batch_size)
        tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                        eval_results)
        ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
        mnas_utils.archive_ckpt(eval_results, eval_results['top_1_accuracy'],
                                ckpt)

      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                      params.train_steps, elapsed_time)

    if FLAGS.export_dir:
      export(mnasnet_est, FLAGS.export_dir, params, FLAGS.post_quantize)
from absl.testing import parameterized

from tensorflow.core.protobuf import config_pb2
from tensorflow.core.protobuf import rewriter_config_pb2
from tensorflow.python import keras
from tensorflow.python.framework import random_seed
from tensorflow.python.framework import test_util
from tensorflow.python.keras import keras_parameterized
from tensorflow.python.keras import testing_utils
from tensorflow.python.keras.layers import recurrent as rnn_v1
from tensorflow.python.keras.layers import recurrent_v2 as rnn
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import gen_math_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import random_ops
from tensorflow.python.platform import test
from tensorflow.python.training import gradient_descent


# Global config for grappler setting that is used for graph mode test.
_rewrites = rewriter_config_pb2.RewriterConfig()
_rewrites.implementation_selector = rewriter_config_pb2.RewriterConfig.ON
_rewrites.min_graph_nodes = -1
_graph_options = config_pb2.GraphOptions(rewrite_options=_rewrites)
_config = config_pb2.ConfigProto(graph_options=_graph_options)


@keras_parameterized.run_all_keras_modes(config=_config)
class GRUV2Test(keras_parameterized.TestCase):

  @parameterized.named_parameters(
      ('non_tan_activation', 'relu', 'sigmoid', 0, False, True, True),
      ('non_sigmoid_recur_activation', 'tanh', 'relu', 0, False, True, True),
      ('use_recurrent_dropout', 'tanh', 'sigmoid', 0.1, False, True, True),
      ('unroll', 'tanh', 'sigmoid', 0, True, True, True),
      ('not_use_bias', 'tanh', 'sigmoid', 0, False, False, True),
      ('not_reset_after', 'tanh', 'sigmoid', 0, False, True, False))
def PlaceGraph(metagraph,
               cluster=None,
               allotted_time=3600,
               hparams=None,
               verbose=False):
  """Place the provided metagraph.

  Args:
    metagraph: the metagraph to place.
    cluster: an optional set of hardware resources to optimize the placement
      for. If none is specified, we'll optimize the placement for the hardware
      available on the local machine.
    allotted_time: the maximum amount of time in seconds to spend optimizing
      the placement.
    hparams: hyperparameters used to fine tune the placer.
    verbose: prints debug information if True.

  Returns:
    The placed metagraph.
  """
  if cluster is None:
    cluster = gcluster.Cluster()

  if hparams is None:
    hparams = hierarchical_controller.hierarchical_controller_hparams()
  # We run with a single child.
  hparams.num_children = 1

  # Optimize the metagraph to speed up the placement.
  rewriter_config = rewriter_config_pb2.RewriterConfig()
  rewriter_config.optimizers.append("pruning")
  rewriter_config.optimizers.append("constfold")
  rewriter_config.optimizers.append("arithmetic")
  rewriter_config.optimizers.append("dependency")
  rewriter_config.optimizers.append("pruning")
  optimized_graph = tf_optimizer.OptimizeGraph(
      rewriter_config, metagraph, verbose=verbose, cluster=cluster)
  optimized_metagraph = meta_graph_pb2.MetaGraphDef()
  optimized_metagraph.CopyFrom(metagraph)
  optimized_metagraph.graph_def.CopyFrom(optimized_graph)

  item = gitem.Item(optimized_metagraph)

  # Measure the runtime achievable with the original placement.
  try:
    _, original_run_time, _ = cluster.MeasureCosts(item)
    if verbose:
      print("Runtime for original placement: " + str(original_run_time))
  except errors.OpError as e:
    if verbose:
      print("Original placement isn't feasible: " + str(e))
    original_run_time = hparams.failing_signal

  with tf_ops.Graph().as_default():
    # Place all the nodes of the controller on the CPU. We don't want them to
    # fight for accelerator memory with the model to optimize.
    with tf_ops.device("/device:CPU:0"):
      model = hierarchical_controller.HierarchicalController(
          hparams, item, cluster)
      ops = model.build_controller()
      session_creator = training.ChiefSessionCreator()
      with training.MonitoredSession(session_creator=session_creator) as sess:
        start_time = time.time()
        current_time = start_time
        while current_time - start_time < allotted_time:
          grouping_actions = model.generate_grouping(sess)
          input_to_seq2seq = model.create_group_embeddings(
              grouping_actions, verbose=verbose)
          model.generate_placement(input_to_seq2seq, sess)
          try:
            run_time = model.eval_placement(sess, verbose=verbose)
          except errors.OpError as e:
            if verbose:
              print("Failed to run graph: " + str(e))
            run_time = hparams.failing_signal
          updated = model.update_reward(sess, run_time, verbose=verbose)
          if updated and run_time < original_run_time:
            if verbose:
              print("Found better placement, with runtime " + str(run_time))
            model.export_placement(metagraph)

          model.process_reward(sess)

          current_time = time.time()
  return metagraph
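# Hedged usage sketch (illustrative only; assumes `import tensorflow as tf`
# in addition to the modules this file already uses, and that the local
# machine's devices are acceptable as the default cluster): export a
# MetaGraphDef for a small graph and ask PlaceGraph for a placement. The
# node names and matrix sizes are arbitrary.
def _demo_place_graph():
  with tf_ops.Graph().as_default():
    a = tf.random_uniform([1024, 1024])
    b = tf.random_uniform([1024, 1024])
    _ = tf.matmul(a, b, name="product")
    metagraph = tf.train.export_meta_graph()
  # Spend up to 10 minutes searching for a better placement.
  return PlaceGraph(metagraph, allotted_time=600, verbose=True)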