def get_npu_classifier(self):
    if self.config.chip == 'npu':
        from npu_bridge.estimator.npu.npu_config import NPURunConfig
        from npu_bridge.estimator.npu.npu_estimator import NPUEstimator
        # Reference links:
        # https://support.huaweicloud.com/tensorflowdevg-cann330alphaXtraining/atlasmprtg_13_0032.html
        # https://support.huaweicloud.com/tensorflowdevg-cann330alphaXtraining/atlastfadapi_07_0004.html
        if self.config.npu_profiling:
            from npu_bridge.estimator.npu.npu_config import ProfilingConfig
            work_dir = os.getcwd()
            profiling_dir = os.path.join(work_dir, "npu_profiling")
            if not os.path.exists(profiling_dir):
                os.makedirs(profiling_dir)
            profiling_options = '{"output":"%s","task_trace": "on","aicpu": "on"}' % (
                profiling_dir)
            profiling_config = ProfilingConfig(
                enable_profiling=True,
                profiling_options=profiling_options)
            run_config = NPURunConfig(
                profiling_config=profiling_config,  # configure profiling options
                save_checkpoints_steps=112590,
                session_config=self.sess.estimator_config,
                model_dir=self.config.log_dir,
                keep_checkpoint_max=self.config.max_checkpoint_to_save)
        else:
            run_config = NPURunConfig(
                save_checkpoints_steps=112590,
                session_config=self.sess.estimator_config,
                model_dir=self.config.log_dir,
                keep_checkpoint_max=self.config.max_checkpoint_to_save)
        classifier = NPUEstimator(
            model_fn=self.model.get_estimator_model_func,
            config=run_config)
    elif self.config.chip in ['gpu', 'cpu']:
        run_config = tf.estimator.RunConfig(
            save_checkpoints_steps=112590,
            session_config=self.sess.estimator_config,
            model_dir=self.config.log_dir,
            keep_checkpoint_max=self.config.max_checkpoint_to_save)
        classifier = tf.estimator.Estimator(
            model_fn=self.model.get_estimator_model_func,
            config=run_config)

    training_hooks = []
    training_hooks.append(self.logger)

    return classifier, training_hooks
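# A minimal sketch of the model_fn that the NPUEstimator / tf.estimator.Estimator
# above expects from self.model.get_estimator_model_func. The dense layer and
# optimizer below are illustrative assumptions, not this project's actual model;
# only the (features, labels, mode) signature and the EstimatorSpec return value
# are required by the Estimator API.
import tensorflow as tf

def example_model_fn(features, labels, mode):
    logits = tf.layers.dense(tf.layers.flatten(features), 10)
    predictions = {"classes": tf.argmax(logits, axis=1),
                   "probabilities": tf.nn.softmax(logits, name="softmax_tensor")}
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
    eval_metric_ops = {"accuracy": tf.metrics.accuracy(
        labels=labels, predictions=predictions["classes"])}
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss,
                                      eval_metric_ops=eval_metric_ops)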
def _init_npu_estimator(self, sess_config):
    model_dir = self.get_local_worker_path()
    config = NPURunConfig(model_dir=model_dir,
                          save_checkpoints_steps=self.config.save_steps,
                          log_step_count_steps=self.config.report_freq,
                          session_config=sess_config,
                          enable_data_pre_proc=True,
                          iterations_per_loop=1)
    self.estimator = NPUEstimator(model_fn=self.model_fn, config=config)
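# A sketch of how the sess_config argument above might be built. This helper is an
# assumption, not taken from this project: it follows the commonly documented CANN /
# TF Adapter pattern of registering the "NpuOptimizer" custom graph optimizer and
# disabling remapping; verify the parameter names against the adapter version in use.
import tensorflow as tf
from tensorflow.core.protobuf.rewriter_config_pb2 import RewriterConfig

def build_npu_session_config():
    sess_config = tf.ConfigProto()
    custom_op = sess_config.graph_options.rewrite_options.custom_optimizers.add()
    custom_op.name = "NpuOptimizer"
    custom_op.parameter_map["use_off_line"].b = True  # run graph ops on the Ascend NPU
    sess_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
    return sess_config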
def _refresh_model_dir_and_session_config(config, model_dir):
    """Overwrite estimator config by `model_dir` and `session_config` if needed.

    Args:
        config: Original estimator config.
        model_dir: Estimator model checkpoint directory.

    Returns:
        Overwritten estimator config.

    Raises:
        ValueError: Model directory inconsistent between `model_dir` and `config`.
    """
    if config is None or not isinstance(config, NPURunConfig):
        raise ValueError(
            'config must be an instance of `NPURunConfig`, but provided %s.' % config)

    if config.session_config is None:
        session_config = run_config.get_default_session_config()
        config = NPURunConfig.replace(config, session_config=session_config)

    model_dir = compat_internal.path_to_str(model_dir)
    if model_dir is not None:
        if (getattr(config, 'model_dir', None) is not None
                and config.model_dir != model_dir):
            raise ValueError(
                "`model_dir` are set both in constructor and `NPURunConfig`, but with "
                "different values. In constructor: '{}', in `NPURunConfig`: "
                "'{}' ".format(model_dir, config.model_dir))
    if model_dir:
        config = NPURunConfig.replace(config, model_dir=model_dir)
    elif getattr(config, 'model_dir', None) is None:
        model_dir = tempfile.mkdtemp()
        logging.warning('Using temporary folder as model directory: %s', model_dir)
        config = NPURunConfig.replace(config, model_dir=model_dir)

    return config
def main(unused_argv):
    # Set up logging
    set_log()

    # Load training and eval data
    mnist = input_data.read_data_sets(FLAGS.data_dir, False)
    train_data = mnist.train.images  # Returns np.array
    train_labels = np.asarray(mnist.train.labels, dtype=np.int32)
    eval_data = mnist.test.images  # Returns np.array
    eval_labels = np.asarray(mnist.test.labels, dtype=np.int32)

    # profiling_config = ProfilingConfig(enable_profiling=True,
    #                                    enable_options=["training_trace", "task_trace"])
    npu_config = NPURunConfig(iterations_per_loop=100,
                              save_checkpoints_steps=10,
                              model_dir=FLAGS.model_dir,
                              session_config=tf.ConfigProto(
                                  allow_soft_placement=True,
                                  log_device_placement=False)
                              # , profiling_config=profiling_config
                              )

    mnist_classifier = NPUEstimator(model_fn=cnn_model_fn,
                                    config=npu_config,
                                    params={},
                                    job_start_file=FLAGS.job_start_file)

    # Set up logging for predictions
    # Log the values in the "Softmax" tensor with label "probabilities"
    tensors_to_log = {"probabilities": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(
        tensors=tensors_to_log, every_n_iter=50)

    # Train the model
    # print("Train the model...")
    mnist_classifier.train(input_fn=train_input_fn(train_data, train_labels),
                           steps=FLAGS.train_steps,
                           hooks=[logging_hook])

    # Evaluate the model and print results
    print("Evaluate the model...")
    eval_results = mnist_classifier.evaluate(
        input_fn=eval_input_fn(eval_data, eval_labels),
        steps=FLAGS.eval_steps)
    print("eval_results: ", eval_results)
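# Hypothetical sketches of the train_input_fn / eval_input_fn helpers referenced in
# main() above; they are not defined in this file, so their exact shape is an
# assumption. Each returns a zero-argument input_fn, matching how they are called.
# A fixed batch size with drop_remainder=True keeps tensor shapes static, which NPU
# offline compilation generally requires.
import tensorflow as tf

def train_input_fn(data, labels, batch_size=100):
    def input_fn():
        dataset = tf.data.Dataset.from_tensor_slices((data, labels))
        dataset = dataset.shuffle(buffer_size=50000).repeat()
        dataset = dataset.batch(batch_size, drop_remainder=True)
        return dataset
    return input_fn

def eval_input_fn(data, labels, batch_size=100):
    def input_fn():
        dataset = tf.data.Dataset.from_tensor_slices((data, labels))
        dataset = dataset.batch(batch_size, drop_remainder=True)
        return dataset
    return input_fn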
def get_npu_classifier(self):
    from npu_bridge.estimator.npu.npu_config import NPURunConfig
    from npu_bridge.estimator.npu.npu_estimator import NPUEstimator

    run_config = NPURunConfig(
        hcom_parallel=True,
        precision_mode="allow_mix_precision",
        enable_data_pre_proc=True,
        save_checkpoints_steps=self.args.nsteps_per_epoch,
        session_config=self.sess.estimator_config,
        model_dir=self.args.log_dir,
        iterations_per_loop=self.args.iterations_per_loop,
        keep_checkpoint_max=5)

    classifier = NPUEstimator(model_fn=self.model.get_estimator_model_func,
                              config=run_config)

    training_hooks = []
    training_hooks.append(self.logger)

    return classifier, training_hooks
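def train_npu(self):
    # Hedged usage sketch for the classifier returned above. The self.train_input_fn
    # helper and self.args.num_epochs attribute are assumptions for illustration;
    # only nsteps_per_epoch and the (classifier, hooks) return value come from the
    # get_npu_classifier definitions in this file.
    classifier, training_hooks = self.get_npu_classifier()
    classifier.train(input_fn=self.train_input_fn,
                     max_steps=self.args.nsteps_per_epoch * self.args.num_epochs,
                     hooks=training_hooks)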