def _run_network(self, dataset_sink_mode=True):
    lenet = LeNet5()
    loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean")
    optim = Momentum(lenet.trainable_params(), learning_rate=0.1, momentum=0.9)
    model = Model(lenet, loss_fn=loss, optimizer=optim, metrics={'acc': Accuracy()})
    summary_dir = tempfile.mkdtemp(dir=self.base_summary_dir)
    summary_collector = SummaryCollector(summary_dir=summary_dir, collect_freq=1)

    ds_train = create_dataset(os.path.join(self.mnist_path, "train"))
    model.train(1, ds_train, callbacks=[summary_collector], dataset_sink_mode=dataset_sink_mode)

    ds_eval = create_dataset(os.path.join(self.mnist_path, "test"))
    model.eval(ds_eval, dataset_sink_mode=dataset_sink_mode, callbacks=[summary_collector])

    self._check_summary_result(summary_dir)

def _run_network(self, dataset_sink_mode=False, num_samples=2, **kwargs):
    lenet = LeNet5()
    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    optim = Momentum(lenet.trainable_params(), learning_rate=0.1, momentum=0.9)
    model = Model(lenet, loss_fn=loss, optimizer=optim, metrics={'loss': Loss()})
    summary_dir = tempfile.mkdtemp(dir=self.base_summary_dir)
    summary_collector = SummaryCollector(summary_dir=summary_dir, collect_freq=2, **kwargs)

    ds_train = create_dataset(os.path.join(self.mnist_path, "train"), num_samples=num_samples)
    model.train(1, ds_train, callbacks=[summary_collector], dataset_sink_mode=dataset_sink_mode)

    ds_eval = create_dataset(os.path.join(self.mnist_path, "test"))
    model.eval(ds_eval, dataset_sink_mode=dataset_sink_mode, callbacks=[summary_collector])
    return summary_dir

def test_compile_model_train_O2():
    dataset_types = (np.float32, np.float32)
    dataset_shapes = ((16, 16), (16, 16))
    dataset = MindDataSet(dataset_types, dataset_shapes)

    net = NetNoLoss(16, 16)
    loss = nn.MSELoss()
    optimizer = nn.Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)

    model = Model(net, loss_fn=loss, optimizer=optimizer, metrics={"acc"}, amp_level="O2")
    model.train(2, dataset, dataset_sink_mode=False)
    with pytest.raises(ValueError):
        # Not an actual run: the metrics step fails by design; this only checks that compilation succeeds.
        model.eval(dataset)

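# NetNoLoss and MindDataSet come from test helpers that are not defined in these
# snippets. Below is a minimal sketch of a plausible NetNoLoss (a bare network
# with no bundled loss); this is an assumption for illustration, not the actual
# helper implementation.
import mindspore.nn as nn

class NetNoLoss(nn.Cell):
    """A single dense layer; the loss is supplied separately to Model."""
    def __init__(self, in_features, out_features):
        super(NetNoLoss, self).__init__()
        self.dense = nn.Dense(in_features, out_features)

    def construct(self, x):
        return self.dense(x)
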
def mnist_train(epoch_size, batch_size, lr, momentum):
    mnist_path = "./MNIST_unzip/"
    ds = generate_mnist_dataset(os.path.join(mnist_path, "train"),
                                batch_size=batch_size, repeat_size=1)

    network = LeNet5()
    net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), lr, momentum)
    config_ck = CheckpointConfig(save_checkpoint_steps=1875, keep_checkpoint_max=10)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                 directory="./trained_ckpt_file/", config=config_ck)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    LOGGER.info(TAG, "============== Starting Training ==============")
    model.train(epoch_size, ds, callbacks=[ckpoint_cb, LossMonitor()], dataset_sink_mode=False)

    LOGGER.info(TAG, "============== Starting Testing ==============")
    ckpt_file_name = "trained_ckpt_file/checkpoint_lenet-10_1875.ckpt"
    param_dict = load_checkpoint(ckpt_file_name)
    load_param_into_net(network, param_dict)

    ds_eval = generate_mnist_dataset(os.path.join(mnist_path, "test"), batch_size=batch_size)
    acc = model.eval(ds_eval, dataset_sink_mode=False)
    LOGGER.info(TAG, "============== Accuracy: %s ==============", acc)

def train(data_dir, lr=0.01, momentum=0.9, num_epochs=2, ckpt_name="lenet"):
    dataset_sink = context.get_context('device_target') == 'Ascend'
    repeat = num_epochs if dataset_sink else 1
    ds_train = create_dataset(data_dir, repeat=repeat)
    ds_eval = create_dataset(data_dir, training=False)
    steps_per_epoch = ds_train.get_dataset_size()

    net = LeNet5()
    loss = nn.loss.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean')
    opt = nn.Momentum(net.trainable_params(), lr, momentum)

    ckpt_cfg = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=5)
    ckpt_cb = ModelCheckpoint(prefix=ckpt_name, directory='ckpt', config=ckpt_cfg)
    loss_cb = LossMonitor(steps_per_epoch)

    model = Model(net, loss, opt, metrics={'acc', 'loss'})
    model.train(num_epochs, ds_train, callbacks=[ckpt_cb, loss_cb], dataset_sink_mode=dataset_sink)
    metrics = model.eval(ds_eval, dataset_sink_mode=dataset_sink)
    print('Metrics:', metrics)

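# A hypothetical invocation of the train() function above. The data directory
# and the context setup are assumptions for illustration and are not part of
# the original script.
if __name__ == '__main__':
    context.set_context(mode=context.GRAPH_MODE, device_target='CPU')
    train('./datasets/MNIST', lr=0.01, momentum=0.9, num_epochs=2, ckpt_name='lenet')
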
def calibration():
    """Do calibration to generate the scale-offset record file."""
    dataset = create_dataset(
        dataset_path=ARGS_OPT.eval_dataset,
        do_train=False,
        batch_size=config.batch_size,  # pylint: disable=no-member
        target=ARGS_OPT.device_target)
    dataset = dataset.take(1)

    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    network = resnet(10)
    network.set_train(False)
    param_dict = load_checkpoint(ARGS_OPT.pre_trained)
    load_param_into_net(network, param_dict)

    input_data = np.random.uniform(0.0, 1.0, size=[32, 3, 224, 224]).astype(np.float32)
    config_file = os.path.join(CUR_DIR, './config.json')
    amct.create_quant_config(config_file, network, input_data)
    calibration_network = amct.quantize_model(config_file, network, input_data)

    model = Model(calibration_network, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'})
    _ = model.eval(dataset)
    amct.save_model('./resnet50_quant_calibration', calibration_network, input_data)

def eval_quant():
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    cfg = quant_cfg
    ds_eval = create_dataset(os.path.join(data_path, "test"), cfg.batch_size, 1)
    ckpt_path = './ckpt_lenet_quant-10_937.ckpt'
    # define fusion network
    network = LeNet5Fusion(cfg.num_classes)
    # convert fusion network to quantization aware network
    # (for per_channel and symmetric, the first element applies to weights, the second to activations)
    quantizer = QuantizationAwareTraining(quant_delay=0,
                                          bn_fold=False,
                                          freeze_bn=10000,
                                          per_channel=[True, False],
                                          symmetric=[True, False])
    network = quantizer.quantize(network)
    # define loss
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    # define network optimization
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    # callback and monitor
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
    # load quantization aware network checkpoint
    param_dict = load_checkpoint(ckpt_path)
    not_load_param = load_param_into_net(network, param_dict)
    if not_load_param:
        raise ValueError("Failed to load parameters into the network!")

    print("============== Starting Testing ==============")
    acc = model.eval(ds_eval, dataset_sink_mode=True)
    print("============== {} ==============".format(acc))
    assert acc['Accuracy'] > 0.98

def eval_alexnet():
    print("============== Starting Testing ==============")

    device_num = get_device_num()
    if device_num > 1:
        # context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
        context.set_context(mode=context.GRAPH_MODE, device_target='Davinci', save_graphs=False)
        if config.device_target == "Ascend":
            context.set_context(device_id=get_device_id())
            init()
        elif config.device_target == "GPU":
            init()

    if config.dataset_name == 'cifar10':
        network = AlexNet(config.num_classes, phase='test')
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
        opt = nn.Momentum(network.trainable_params(), config.learning_rate, config.momentum)
        ds_eval = create_dataset_cifar10(config.data_path, config.batch_size, status="test",
                                         target=config.device_target)
        param_dict = load_checkpoint(load_path)
        print("load checkpoint from [{}].".format(load_path))
        load_param_into_net(network, param_dict)
        network.set_train(False)
        model = Model(network, loss, opt, metrics={"Accuracy": Accuracy()})
    elif config.dataset_name == 'imagenet':
        network = AlexNet(config.num_classes, phase='test')
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
        ds_eval = create_dataset_imagenet(config.data_path, config.batch_size, training=False)
        param_dict = load_checkpoint(load_path)
        print("load checkpoint from [{}].".format(load_path))
        load_param_into_net(network, param_dict)
        network.set_train(False)
        model = Model(network, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'})
    else:
        raise ValueError("Unsupported dataset.")

    if ds_eval.get_dataset_size() == 0:
        raise ValueError("Please check dataset size > 0 and batch_size <= dataset size")

    result = model.eval(ds_eval, dataset_sink_mode=config.dataset_sink_mode)
    print("result : {}".format(result))

def train(data_dir, lr=0.01, momentum=0.9, num_epochs=3):
    ds_train = create_dataset(data_dir)
    ds_eval = create_dataset(data_dir, training=False)

    net = LeNet5()
    loss = nn.loss.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    opt = nn.Momentum(net.trainable_params(), lr, momentum)
    loss_cb = LossMonitor(per_print_times=ds_train.get_dataset_size())

    model = Model(net, loss, opt, metrics={'acc', 'loss'})
    # dataset_sink_mode can be True when using Ascend
    model.train(num_epochs, ds_train, callbacks=[loss_cb], dataset_sink_mode=False)
    metrics = model.eval(ds_eval, dataset_sink_mode=False)
    print('Metrics:', metrics)

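# create_dataset is referenced throughout these snippets but defined elsewhere.
# Below is a minimal sketch under the MindSpore 1.x dataset API, using the
# standard LeNet MNIST preprocessing (resize to 32x32, rescale, HWC->CHW). The
# exact signature and transforms are assumptions, not the original definition.
import mindspore.dataset as ds
import mindspore.dataset.vision.c_transforms as CV
import mindspore.dataset.transforms.c_transforms as C
from mindspore import dtype as mstype

def create_dataset(data_path, batch_size=32, repeat_size=1, training=True, num_samples=None):
    mnist_ds = ds.MnistDataset(data_path, num_samples=num_samples)
    # Cast labels to int32 and apply the usual LeNet image pipeline.
    mnist_ds = mnist_ds.map(operations=C.TypeCast(mstype.int32), input_columns="label")
    image_ops = [CV.Resize((32, 32)), CV.Rescale(1.0 / 255.0, 0.0), CV.HWC2CHW()]
    mnist_ds = mnist_ds.map(operations=image_ops, input_columns="image")
    if training:
        mnist_ds = mnist_ds.shuffle(buffer_size=10000)
    mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True)
    mnist_ds = mnist_ds.repeat(repeat_size)
    return mnist_ds
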
def test_train_and_eval_lenet():
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    network = LeNet5(10)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), 0.01, 0.9)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Training ==============")
    ds_train = create_dataset(os.path.join('/home/workspace/mindspore_dataset/mnist', "train"), 32, 1)
    model.train(1, ds_train, callbacks=[LossMonitor()], dataset_sink_mode=True)

    print("============== Starting Testing ==============")
    ds_eval = create_dataset(os.path.join('/home/workspace/mindspore_dataset/mnist', "test"), 32, 1)
    acc = model.eval(ds_eval, dataset_sink_mode=True)
    print("============== {} ==============".format(acc))

def eval_lenet5():
    """Evaluation of lenet5."""
    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
    network = LeNet5(config.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), config.lr, config.momentum)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Testing ==============")
    load_checkpoint(config.ckpt_path, network)
    ds_eval = create_lenet_dataset(os.path.join(config.data_path, "test"), config.batch_size, 1)
    if ds_eval.get_dataset_size() == 0:
        raise ValueError("Please check dataset size > 0 and batch_size <= dataset size")
    acc = model.eval(ds_eval)
    print("============== {} ==============".format(acc))

def test_original_resnet50():
    """Evaluate the original resnet50."""
    dataset = create_dataset(
        dataset_path=ARGS_OPT.eval_dataset,
        do_train=False,
        batch_size=config.batch_size,  # pylint: disable=no-member
        target=ARGS_OPT.device_target)

    network = resnet(10)
    network.set_train(False)
    param_dict = load_checkpoint(ARGS_OPT.pre_trained)
    load_param_into_net(network, param_dict)

    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    model = Model(network, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'})
    res = model.eval(dataset)
    print("result for original resnet50:", res, "ckpt=", ARGS_OPT.pre_trained)

def train(Net):
    ds_train, ds_test = create_dataset()
    # Build the network
    network = Net(cfg.num_classes)
    # Define the model's loss function and optimizer
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Adam(network.trainable_params(), cfg.lr)
    # Train the model
    model = Model(network, loss_fn=net_loss, optimizer=net_opt, metrics={'acc': Accuracy()})
    loss_cb = LossMonitor()
    print("============== Starting Training ==============")
    model.train(30, ds_train, callbacks=[loss_cb], dataset_sink_mode=True)
    # Evaluate
    metric = model.eval(ds_test)
    print(metric)
    return model

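# Hypothetical usage of train(Net) above, assuming a LeNet5 class whose
# constructor accepts cfg.num_classes is in scope (as in the other snippets);
# this call is illustrative, not part of the original script.
model = train(LeNet5)
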
def quant_resnet50(network, dataset, loss, input_data):
    """Quantize the resnet50."""
    # step 2: create the quant config json file
    create_quant_config('./config.json', network, input_data)

    # step 3: do some network modification and return the modified network
    calibration_network = quantize_model('./config.json', network, input_data)
    calibration_network.set_train(False)

    # step 4: run an evaluation of the network to perform activation calibration
    model = Model(calibration_network, loss_fn=loss, metrics={'top_1_accuracy', 'top_5_accuracy'})
    _ = model.eval(dataset, dataset_sink_mode=False)

    # step 5: export the air file
    save_model('results/resnet50_quant', calibration_network, input_data)
    print("[INFO] the quantized AIR file has been stored at: \n {}".format(
        'results/resnet50_quant.air'))

def eval_lenet():
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    cfg = nonquant_cfg
    ds_eval = create_dataset(os.path.join(data_path, "test"), cfg.batch_size, 1)
    ckpt_path = './ckpt_lenet_noquant-10_1875.ckpt'
    # define network
    network = LeNet5(cfg.num_classes)
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
    # callback and monitor
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
    # load the trained checkpoint of the non-quantized network
    param_dict = load_checkpoint(ckpt_path)
    not_load_param = load_param_into_net(network, param_dict)
    if not_load_param:
        raise ValueError("Failed to load parameters into the network!")

    print("============== Starting Testing ==============")
    acc = model.eval(ds_eval, dataset_sink_mode=True)
    print("============== {} ==============".format(acc))
    assert acc['Accuracy'] > 0.98

def mnist_train(epoch_size, batch_size, lr, momentum):
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
                        enable_mem_reuse=False)

    mnist_path = "./MNIST_unzip/"
    ds = generate_mnist_dataset(os.path.join(mnist_path, "train"),
                                batch_size=batch_size, repeat_size=1)

    network = LeNet5()
    network.set_train()
    net_loss = CrossEntropyLoss()
    net_opt = nn.Momentum(network.trainable_params(), lr, momentum)
    config_ck = CheckpointConfig(save_checkpoint_steps=1875, keep_checkpoint_max=10)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                 directory='./trained_ckpt_file/', config=config_ck)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    LOGGER.info(TAG, "============== Starting Training ==============")
    model.train(epoch_size, ds, callbacks=[ckpoint_cb, LossMonitor()],
                dataset_sink_mode=False)  # train

    LOGGER.info(TAG, "============== Starting Testing ==============")
    param_dict = load_checkpoint("trained_ckpt_file/checkpoint_lenet-10_1875.ckpt")
    load_param_into_net(network, param_dict)
    ds_eval = generate_mnist_dataset(os.path.join(mnist_path, "test"),
                                     batch_size=batch_size)
    acc = model.eval(ds_eval)
    LOGGER.info(TAG, "============== Accuracy: %s ==============", acc)

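# CrossEntropyLoss above is a project-local cell, not shown in the snippet.
# A minimal stand-in with the same interface, delegating to MindSpore's
# built-in sparse softmax cross-entropy (an assumption, not the actual
# project implementation):
import mindspore.nn as nn

class CrossEntropyLoss(nn.Cell):
    def __init__(self):
        super(CrossEntropyLoss, self).__init__()
        # Sparse labels, mean reduction, matching the other snippets' loss setup.
        self.loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")

    def construct(self, logits, label):
        return self.loss(logits, label)
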
class TrainerMs(TrainerBase):
    """Trainer mindspore class."""

    def build(self):
        """Build the trainer by assembling the necessary components."""
        super().build()
        if self.config.lr_scheduler.params:
            self.lr_scheduler = LrScheduler()
            dynamic_lr = self.lr_scheduler()(
                base_lr=self.config.optimizer.params["lr"],
                global_step=self.config.epochs * len(self.train_loader),
                total_epoch=self.config.epochs)
            self.optimizer = Optimizer()(model=self.model, dynamic_lr=dynamic_lr)
        else:
            self.optimizer = Optimizer()(model=self.model)
        if hasattr(self.model, 'add_loss'):
            loss_cls = Loss()()
            self.model.add_loss(loss_cls)
            self.loss = self.model.overall_loss()
        else:
            self.loss = Loss()()
        self.metric_name = self.config.metric.type
        # Some trainers use a train batch size that differs from the valid batch size.
        self.train_metrics = None
        self.valid_metrics = self._init_metrics()
        self.ms_metrics = self.valid_metrics() if isinstance(self.valid_metrics(), dict) \
            else {self.metric_name: self.valid_metrics()}
        self.ms_model = MsModel(network=self.model,
                                loss_fn=self.loss,
                                optimizer=self.optimizer,
                                metrics=self.ms_metrics)

    def _set_condition(self):
        self._init_ms_context()
        self._init_distributed_setting()

    def _train_epoch(self):
        config_ck = CheckpointConfig(save_checkpoint_steps=self.config.save_steps,
                                     keep_checkpoint_max=1)
        # save the network model and parameters for subsequent fine-tuning
        save_path = self.get_local_worker_path(self.step_name, self.worker_id)
        ckpoint_cb = ModelCheckpoint(config=config_ck, directory=save_path)
        loss_cb = LossMonitor(per_print_times=1)
        eval_cb = EvalCallBack(self.ms_model, self.valid_loader, self.dataset_sink_mode, self)
        callback_list = [ckpoint_cb, loss_cb] if self.config.mixup else \
            [ckpoint_cb, loss_cb, eval_cb]
        try:
            self.ms_model.train(epoch=self.epochs,
                                train_dataset=self.train_loader,
                                callbacks=callback_list,
                                dataset_sink_mode=self.dataset_sink_mode)
        except RuntimeError as e:
            logging.warning(f"failed to train the model, skip it, message: {str(e)}")

    def _valid_epoch(self):
        if self.config.mixup and self.config.loss.type == 'CrossEntropyLoss':
            from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
            loss_fn = SoftmaxCrossEntropyWithLogits(sparse=True)
            self.ms_model = MsModel(network=self.model,
                                    loss_fn=loss_fn,
                                    optimizer=self.optimizer,
                                    metrics=self.ms_metrics)
        self.callbacks.before_valid()
        try:
            eval_metrics = self.ms_model.eval(valid_dataset=self.valid_loader,
                                              dataset_sink_mode=self.dataset_sink_mode)
            self.valid_metrics.update(eval_metrics)
            valid_logs = dict()
            valid_logs['cur_valid_perfs'] = self.valid_metrics.results
            self.callbacks.after_valid(valid_logs)
        except RuntimeError as exc:
            logging.warning("RuntimeError occurred when evaluating the model; skipping eval.")
            logging.warning("The RuntimeError message is: {}.".format(exc))

    def _init_distributed_setting(self):
        if not self.distributed:
            return
        logging.info("init hccl ...")
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        hccl_init()

    def _init_ms_context(self):
        if hasattr(self.config, "execute_mode"):
            mode = context.PYNATIVE_MODE if self.config.execute_mode == "PYNATIVE_MODE" \
                else context.GRAPH_MODE
        else:
            mode = context.GRAPH_MODE
        if vega.is_npu_device():
            context.set_context(mode=mode, device_target="Ascend",
                                device_id=int(os.environ["DEVICE_ID"]))
        else:
            context.set_context(mode=mode, device_target="CPU")
        self.dataset_sink_mode = True if vega.is_npu_device() else False

                         batch_size=cfg.batch_size)
loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
opt = nn.Momentum(network.trainable_params(), cfg.learning_rate, cfg.momentum)
loss_cb = LossMonitor()
model = Model(network, loss, opt, {'acc': Accuracy()})

if args.mode == 'train':
    print("============== Starting Training ==============")
    ds_train = create_dataset(args.preprocess_path, cfg.batch_size, cfg.num_epochs, True)
    config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
                                 keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="lstm", directory=args.ckpt_path, config=config_ck)
    model.train(cfg.num_epochs, ds_train, callbacks=[ckpoint_cb, loss_cb])
elif args.mode == 'test':
    print("============== Starting Testing ==============")
    ds_eval = create_dataset(args.preprocess_path, cfg.batch_size, 1, False)
    param_dict = load_checkpoint(args.ckpt_path)
    load_param_into_net(network, param_dict)
    acc = model.eval(ds_eval)
    print("============== Accuracy:{} ==============".format(acc))
else:
    raise RuntimeError('mode should be train or test, rather than {}'.format(args.mode))

class Trainer(DistributedWorker):
    """Trainer class.

    :param model: input model, defaults to None
    :type model: tf model, optional
    :param id: id of the model, defaults to None
    :type id: int, optional
    :param hps: hyperparameters, defaults to None
    :type hps: dict, optional
    """

    config = TrainerConfig()

    def __init__(self, model=None, id=None, hps=None, load_ckpt_flag=False,
                 model_desc=None, lazy_build=True, **kwargs):
        super(Trainer, self).__init__()
        self.worker_type = WorkerTypes.TRAINER
        Trainer.__worker_id__ += 1
        if id is not None:
            self._worker_id = id
        else:
            self._worker_id = Trainer.__worker_id__

        # Data member list of Trainer
        self.is_chief = True
        self.use_cuda = self.config.cuda
        self.epochs = self.config.epochs
        self.do_validation = True
        self.auto_save_ckpt = True
        self.auto_save_perf = True
        self.skip_train = False
        self.valid_interval = self.config.valid_interval
        self.hps = hps
        self.model = model
        self.model_desc = model_desc
        self.optimizer = None
        self.lr_scheduler = None
        self.loss = None
        self.use_syncbn = self.config.syncbn
        self.use_amp = self.config.amp
        self.train_metrics = None
        self.valid_metrics = None
        self.call_metrics_on_train = self.config.call_metrics_on_train
        self.train_verbose = self.config.train_verbose
        self.valid_verbose = self.config.valid_verbose
        self.train_report_steps = self.config.train_report_steps
        self.valid_report_steps = self.config.valid_report_steps
        self.train_loader = None
        self.valid_loader = None
        self.train_step = None
        self.valid_step = None
        self.make_batch = None
        self.model_fn = None
        self.train_input_fn = None
        self.valid_input_fn = None
        self.callbacks = None
        self.performance = None
        self.runtime = None
        self.visual_data = {}
        self.load_ckpt_flag = load_ckpt_flag
        self.distributed = self.config.distributed
        # Used by TimmTrainerCallbacks since it builds its trainer in
        # the before_train callback
        self.lazy_built = self.config.lazy_built
        # Indicates whether the necessary components of a trainer
        # have been built for running
        self._world_size = 1
        self._rank_id = 0
        self._local_rank_id = 0
        self.config.kwargs = kwargs
        self.checkpoint_file_name = 'checkpoint.pth'
        self.model_pickle_file_name = 'model.pkl'
        worker_path = self.get_local_worker_path()
        self.model_path = FileOps.join_path(worker_path, self.model_pickle_file_name)
        self.checkpoint_file = FileOps.join_path(worker_path, self.checkpoint_file_name)
        self.weights_file = FileOps.join_path(worker_path, "model_{}.pth".format(self.worker_id))
        self.loss_input = kwargs.get('loss_input', None)
        if not lazy_build:
            self.init_trainer()

    def _set_default_funcs(self):
        if zeus.is_torch_backend():
            self.make_batch = self._default_make_batch
            self.train_step = self._default_train_step
            self.valid_step = self._default_valid_step
        elif zeus.is_tf_backend():
            self.model_fn = self._default_model_fn
            self.train_input_fn = self._default_train_input_fn
            self.valid_input_fn = self._default_valid_input_fn

    def _set_condition(self):
        self._init_tf_session()
        self._init_distributed_setting()
        self._init_cuda_setting()
        self._init_tf_estimator()
        self._init_ms_context()

    def train_process(self):
        """Whole train process of the TrainWorker specified in config.

        After training, the model and validation results are saved to
        local_worker_path and s3_path.
""" init_log(level=General.logger.level, log_file="log_worker_{}.txt".format(self.worker_id), log_path=self.local_log_path) self._set_default_funcs() self._set_condition() self._init_callbacks() self.callbacks.init_trainer() if not self.lazy_built: self.build() self._train_loop() def build(self): """Build the trainer by assembling the necessary components.""" self._init_hps(self.hps) logging.debug("Trainer Config: {}".format(self.config)) self.do_validation = self.config.with_valid self.use_syncbn = self.config.syncbn if self.use_syncbn and zeus.is_torch_backend(): self.model = apex.parallel.convert_syncbn_model(self.model) self.train_loader = self._init_dataloader(mode='train') self.valid_loader = self._init_dataloader(mode='val') self.batch_num_train = self.train_loader.get_dataset_size() if zeus.is_ms_backend() else len(self.train_loader) self.batch_num_valid = self.valid_loader.get_dataset_size() if zeus.is_ms_backend() else len(self.valid_loader) if zeus.is_torch_backend(): self.optimizer = Optimizer()(model=self.model, distributed=self.distributed) if hasattr(self.model, 'add_loss'): loss_cls = Loss()() self.model.add_loss(loss_cls) self.loss = self.model.overall_loss() else: self.loss = Loss()() self.lr_scheduler = LrScheduler()(self.optimizer) elif zeus.is_ms_backend(): self.optimizer = Optimizer()(model=self.model) if hasattr(self.model, 'add_loss'): loss_cls = Loss()() self.model.add_loss(loss_cls) self.loss = self.model.overall_loss() else: self.loss = Loss()() self.metric_name = self.config.metric().type # Some trainer has different train batch size from valid batch self.train_metrics = self._init_metrics() if zeus.is_torch_backend() else None self.valid_metrics = self._init_metrics() self._init_horovod_setting() if self.use_amp and zeus.is_torch_backend(): self.model, self.optimizer = amp.initialize( self.model, self.optimizer, opt_level='O1') def init_trainer(self): """Init Train Op.""" init_log(level=General.logger.level, log_file="log_worker_{}.txt".format(self.worker_id), log_path=self.local_log_path) self._set_default_funcs() self._set_condition() self._init_callbacks() self.callbacks.init_trainer() self.init_train_op() def init_train_op(self): """Init Train Op.""" if zeus.is_tf_backend(): with self.graph.as_default(): self._init_train_op() def train(self, inputs, labels): """Train model.""" if zeus.is_tf_backend(): feed_dict = {} with self.graph.as_default(): for i in range(len(inputs)): feed_dict.update({self.inputs[i]: inputs[i]}) for i in range(len(labels)): feed_dict.update({self.labels[i]: labels[i]}) _, loss = self.sess.run([self.train_op, self.loss], feed_dict) return loss def predict(self, input): """Inference model.""" if zeus.is_tf_backend(): with self.graph.as_default(): feed_dict = {self.input: input} out = self.sess.run(self.logits, feed_dict) return out def save(self, file_name): """Save model.""" if zeus.is_tf_backend(): with self.graph.as_default(): self.actor_var.save_weights(file_name + ".npz") return file_name + ".npz" def load(self, model_name, by_name): """Load model.""" if zeus.is_tf_backend(): with self.graph.as_default(): self.actor_var.set_weights_with_npz(model_name) def set_weights(self, weights): """Set weight with memory tensor.""" if zeus.is_tf_backend(): with self.graph.as_default(): self.actor_var.set_weights(weights) def get_weights(self): """Get the weights.""" if zeus.is_tf_backend(): with self.graph.as_default(): return self.actor_var.get_weights() def _create_tensor(self, tensor_list): ret_list = [] for tensor in tensor_list: 
            tensor_type = tensor['type']
            tensor_shape = tensor['shape']
            tensor_name = tensor['name']
            if type(tensor_shape) is list:
                tf_tensor = tf.placeholder(tensor_type, name=tensor_name,
                                           shape=(None,) + tuple(tensor_shape))
            else:
                tf_tensor = tf.placeholder(tensor_type, name=tensor_name,
                                           shape=(None, tensor_shape))
            ret_list.append(tf_tensor)
        return ret_list

    def _init_train_op(self):
        if self.loss_input is not None:
            self.inputs = self._create_tensor(self.loss_input['inputs'])
            self.labels = self._create_tensor(self.loss_input['labels'])
            self.input = self.inputs[0]
            logits = self.model(self.input)
            self.logits = logits
            self.actor_var = TFVariables(logits, self.sess)
            loss = Loss()()
            self.loss = loss(logits, self.labels)
            self.optimizer = Optimizer()(distributed=self.distributed)
            grads_and_var = self.optimizer.compute_gradients(self.loss)
            grads, var = zip(*grads_and_var)
            grads_and_var = list(zip(grads, var))
            self.train_op = self.optimizer.apply_gradients(grads_and_var)
            self.sess.run(tf.initialize_all_variables())

    def _init_cuda_setting(self):
        """Init CUDA setting."""
        if not zeus.is_torch_backend():
            return
        if not self.config.cuda:
            self.config.device = -1
            return
        self.config.device = self.config.cuda if self.config.cuda is not True else 0
        self.use_cuda = True
        if self.distributed:
            torch.cuda.set_device(self._local_rank_id)
        torch.cuda.manual_seed(self.config.seed)

    def _init_distributed_setting(self):
        if not self.distributed:
            return
        if zeus.is_npu_device():
            self.npu_init = npu_ops.initialize_system()
            self.npu_shutdown = npu_ops.shutdown_system()
            self.sess.run(self.npu_init)
        self._world_size = hvd.size() if zeus.is_gpu_device() else get_rank_size()
        self._rank_id = hvd.rank() if zeus.is_gpu_device() else get_rank_id()
        self._local_rank_id = hvd.local_rank() if zeus.is_gpu_device() else get_local_rank_id()

    def _init_horovod_setting(self):
        """Init horovod setting."""
        self.is_chief = True
        if self.distributed and zeus.is_torch_backend():
            hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)
            hvd.broadcast_optimizer_state(self.optimizer, root_rank=0)
            if hvd.rank() != 0:
                self.is_chief = False
            else:
                self.is_chief = True

    def _init_hps(self, hps=None):
        """Load hps from file."""
        if hps is not None:
            self.hps = hps
        elif self.config.hps_file is not None:
            desc_file = self.config.hps_file.replace("{local_base_path}", self.local_base_path)
            self.hps = Config(desc_file)
        elif self.config.hps_folder is not None:
            folder = self.config.hps_folder.replace("{local_base_path}", self.local_base_path)
            pattern = FileOps.join_path(folder, "desc_*.json")
            desc_file = glob.glob(pattern)[0]
            self.hps = Config(desc_file)
        if self.hps and self.hps.get('trainer'):
            self.config.from_json(self.hps.get('trainer'))
            self.epochs = self.config.epochs

    def _init_metrics(self, metrics=None):
        """Init metrics."""
        if metrics is not None:
            return metrics
        else:
            return Metrics()

    def _init_dataloader(self, mode, loader=None):
        """Init dataloader."""
        if loader is not None:
            return loader
        if mode == "train" and self.hps is not None and self.hps.get("dataset") is not None:
            dataset_cls = ClassFactory.get_cls(ClassType.DATASET)
            dataset = dataset_cls(mode=mode, hps=self.hps.get("dataset"))
        else:
            dataset_cls = ClassFactory.get_cls(ClassType.DATASET)
            dataset = dataset_cls(mode=mode)
        if self.distributed and mode == "train":
            dataset.set_distributed(self._world_size, self._rank_id)
        # adapt the dataset to the specific backend
        dataloader = Adapter(dataset).loader
        return dataloader

    def _train_loop(self):
        """Do the training with data, callbacks and step functions etc."""
        # Allow users to build the trainer in the before_train() callback, but
        # they should set lazy_built to True in the configuration file.
        self.callbacks.before_train()
        if self.skip_train:
            return
        repeat_time = 1 if zeus.is_ms_backend() else self.epochs
        for epoch in range(repeat_time):
            epoch_logs = {'train_num_batches': self.batch_num_train}
            if self.do_validation:
                epoch_logs.update({'valid_num_batches': self.batch_num_valid})
            self.callbacks.before_epoch(epoch, epoch_logs)
            self._train_epoch()
            if self.do_validation and self._should_run_validation(epoch):
                self._valid_epoch()
            self.callbacks.after_epoch(epoch)
        self.callbacks.after_train()
        if self.distributed:
            self._shutdown_distributed()

    def _train_epoch(self):
        if zeus.is_torch_backend():
            self.model.train()
            for batch_index, batch in enumerate(self.train_loader):
                batch = self.make_batch(batch)
                batch_logs = {'train_batch': batch}
                self.callbacks.before_train_step(batch_index, batch_logs)
                train_batch_output = self.train_step(batch)
                batch_logs.update(train_batch_output)
                if self.config.is_detection_trainer:
                    batch_logs.update({'is_detection_trainer': True})
                self.callbacks.after_train_step(batch_index, batch_logs)
        elif zeus.is_tf_backend():
            self.estimator.train(input_fn=self.train_input_fn,
                                 steps=len(self.train_loader),
                                 hooks=self._init_logging_hook())
        elif zeus.is_ms_backend():
            self.ms_model = MsModel(network=self.model,
                                    loss_fn=self.loss,
                                    optimizer=self.optimizer,
                                    metrics={self.metric_name: self.valid_metrics()})
            config_ck = CheckpointConfig(save_checkpoint_steps=self.config.save_steps)
            # save the network model and parameters for subsequent fine-tuning
            save_path = self.get_local_worker_path(self.step_name, self.worker_id)
            ckpoint_cb = ModelCheckpoint(config=config_ck, directory=save_path)
            loss_cb = LossMonitor(per_print_times=self.config.report_freq)
            eval_cb = EvalCallBack(self.ms_model, self.valid_loader)
            self.ms_model.train(epoch=self.epochs,
                                train_dataset=self.train_loader,
                                callbacks=[ckpoint_cb, loss_cb, eval_cb],
                                dataset_sink_mode=self.dataset_sink_mode)

    def _valid_epoch(self):
        self.callbacks.before_valid()
        valid_logs = None
        if zeus.is_torch_backend():
            self.model.eval()
            with torch.no_grad():
                for batch_index, batch in enumerate(self.valid_loader):
                    batch = self.make_batch(batch)
                    batch_logs = {'valid_batch': batch}
                    self.callbacks.before_valid_step(batch_index, batch_logs)
                    valid_batch_output = self.valid_step(batch)
                    self.callbacks.after_valid_step(batch_index, valid_batch_output)
        elif zeus.is_tf_backend():
            eval_metrics = self.estimator.evaluate(input_fn=self.valid_input_fn,
                                                   steps=len(self.valid_loader))
            self.valid_metrics.update(eval_metrics)
            valid_logs = dict()
            valid_logs['cur_valid_perfs'] = self.valid_metrics.results
        elif zeus.is_ms_backend():
            eval_metrics = self.ms_model.eval(valid_dataset=self.valid_loader,
                                              dataset_sink_mode=self.dataset_sink_mode)
            self.valid_metrics.update(eval_metrics)
            valid_logs = dict()
            valid_logs['cur_valid_perfs'] = self.valid_metrics.results
        self.callbacks.after_valid(valid_logs)

    def _default_make_batch(self, batch):
        """Unpack batch to get input and target."""
        input, target = batch
        if self.use_cuda and not self.config.is_detection_trainer:
            input, target = input.cuda(), target.cuda()
        return (input, target)

    def _default_train_step(self, batch):
        input, target = batch
        self.optimizer.zero_grad()
        output = self.model(input)
        loss = self.loss(output, target)
        if self.use_amp:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
                self.optimizer.synchronize()
            with self.optimizer.skip_synchronize():
                self.optimizer.step()
        else:
            loss.backward()
            if self.config.grad_clip:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.config.grad_clip)
            self.optimizer.step()
        return {'loss': loss.item(),
                'train_batch_output': output,
                'lr': self.lr_scheduler.get_lr()}

    def _default_valid_step(self, batch):
        input, target = batch
        if self.config.is_detection_trainer:
            output = self.model(input, forward_train=False)
        else:
            output = self.model(input)
        return {'valid_batch_output': output}

    def _init_minimize_op(self, loss, global_step, var_list=None):
        """Init the loss minimize operation, including the loss scale method."""
        loss_scale = self.config.loss_scale if self.use_amp else 1.
        if loss_scale != 1:
            scaled_grad_vars = self.optimizer.compute_gradients(loss * loss_scale, var_list=var_list)
            unscaled_grad_vars = []
            for grad, var in scaled_grad_vars:
                unscaled_grad_vars.append((grad, var) if grad is None else (grad / loss_scale, var))
            minimize_op = self.optimizer.apply_gradients(unscaled_grad_vars, global_step)
        else:
            grad_vars = self.optimizer.compute_gradients(loss, var_list=var_list)
            minimize_op = self.optimizer.apply_gradients(grad_vars, global_step)
        return minimize_op

    def _default_train_input_fn(self):
        return self.train_loader.input_fn()

    def _default_valid_input_fn(self):
        return self.valid_loader.input_fn()

    def _default_model_fn(self, features, labels, mode):
        """Define the model_fn used by the TensorFlow Estimator.

        :param features: input features
        :type features: tensorflow tensors
        :param labels: label data
        :type labels: tensorflow tensors
        :param mode: mode of estimator
        :type mode: tf.estimator.ModeKeys
        :return: tensorflow EstimatorSpec
        :rtype: tf.estimator.EstimatorSpec
        """
        logging.info('model function action')
        self.model.training = mode == tf.estimator.ModeKeys.TRAIN
        logits = self.model(features)
        logits = tf.cast(logits, tf.float32)
        if hasattr(self.model, 'add_loss'):
            loss_cls = Loss()()
            self.model.add_loss(loss_cls)
            self.loss = self.model.overall_loss()
        else:
            self.loss = Loss()()
        loss = self.loss(logits, labels)
        train_op = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            global_step = tf.compat.v1.train.get_or_create_global_step()
            epoch = tf.cast(global_step, tf.float32) / tf.cast(len(self.train_loader), tf.float32)
            self.optimizer = Optimizer()(distributed=self.distributed)
            self.lr_scheduler = LrScheduler()(optimizer=self.optimizer)
            self.lr_scheduler.step(epoch)
            update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS)
            loss_scale = self.config.loss_scale if self.use_amp else 1
            minimize_op = self.optimizer.step(loss, loss_scale, global_step)
            train_op = tf.group(minimize_op, update_ops)
        eval_metric_ops = None
        if mode == tf.estimator.ModeKeys.EVAL:
            eval_metric_ops = self.valid_metrics(logits, labels)
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op,
                                          eval_metric_ops=eval_metric_ops)

    def _should_run_validation(self, epoch):
        # A zero valid_interval means the trainer's _valid_loop is not run;
        # the user may provide _valid_loop in other callbacks.
        if self.valid_interval == 0:
            return False
        return epoch % self.valid_interval == 0 or (epoch + 1) == self.epochs

    def _init_callbacks(self):
        disables = []
        customs = self.config.callbacks or []
        if customs and not isinstance(customs, list):
            customs = [customs]
        if not self.config.model_statistics:
            disables.append('ModelStatistics')
        self.callbacks = CallbackList(customs, disables)
        self.callbacks.set_trainer(self)

    def _metric_average(self, val, name):
        """Do metric average.

        :param val: input value
        :param name: metric name
        :return:
        """
        tensor = torch.tensor(val)
        avg_tensor = hvd.allreduce(tensor, name=name)
        return avg_tensor.item()

    @property
    def _first_rank(self):
        """Check if this is the first rank."""
        if self.distributed and hvd.rank() != 0:
            return False
        return True

    def _backup(self):
        """Backup the result worker folder."""
        if self.need_backup is True and self.backup_base_path is not None:
            backup_worker_path = FileOps.join_path(self.backup_base_path, self.get_worker_subpath())
            FileOps.copy_folder(self.get_local_worker_path(self.step_name, self.worker_id),
                                backup_worker_path)

    def _save_visual_data(self, is_train=True, pfms=None, loss=None, lr=None):
        # TODO: will move to the metric base class later.
        for _name, value in pfms.items():
            if is_train:
                _name = "{}_{}".format("t", _name)
            else:
                _name = "{}_{}".format("v", _name)
            if isinstance(value, list):
                for i, _item in enumerate(value):
                    _name = "{}_{}".format(_name, i)
                    self.visual_data[_name] = _item.data.item()
            elif isinstance(value, dict):
                # iterate key/value pairs (the original iterated .keys(), which cannot be unpacked)
                for k, v in value.items():
                    _name = "{}_{}".format(_name, k)
                    self.visual_data[_name] = v
            elif value is not None:
                self.visual_data[_name] = value.data.item()
        if loss is not None:
            self.visual_data["loss"] = loss
        if lr is not None:
            self.visual_data["lr"] = lr

    def _init_tf_estimator(self):
        """Init the TensorFlow estimator."""
        if not zeus.is_tf_backend():
            return
        sess_config = self._init_session_config()
        if zeus.is_gpu_device():
            self._init_gpu_estimator(sess_config)
        elif zeus.is_npu_device():
            self._init_npu_estimator(sess_config)

    def _init_tf_session(self):
        if not zeus.is_tf_backend():
            return
        sess_config = self._init_session_config()
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf.compat.v1.Session(config=sess_config)

    def _init_session_config(self):
        sess_config = self._init_gpu_session_config() if zeus.is_gpu_device() else \
            self._init_npu_session_config()
        return sess_config

    def _init_logging_hook(self):
        logging_hook = []
        if zeus.is_gpu_device() and self.distributed:
            logging_hook += [hvd.BroadcastGlobalVariablesHook(0)]
        return logging_hook

    def _init_gpu_estimator(self, sess_config):
        """Init the TensorFlow estimator."""
        distribution = None
        if not self.distributed and General._parallel and General.devices_per_trainer > 1:
            distribution = tf.contrib.distribute.MirroredStrategy()
        config = tf.estimator.RunConfig(model_dir=self.get_local_worker_path(),
                                        save_checkpoints_steps=self.config.save_steps,
                                        log_step_count_steps=self.config.report_freq,
                                        session_config=None if distribution else sess_config,
                                        train_distribute=distribution)
        self.estimator = tf.estimator.Estimator(model_fn=self.model_fn, config=config)

    def _init_npu_estimator(self, sess_config):
        model_dir = self.get_local_worker_path()
        config = NPURunConfig(model_dir=model_dir,
                              save_checkpoints_steps=self.config.save_steps,
                              log_step_count_steps=self.config.report_freq,
                              session_config=sess_config,
                              enable_data_pre_proc=True,
                              iterations_per_loop=1)
        self.estimator = NPUEstimator(model_fn=self.model_fn, config=config)

    def _init_gpu_session_config(self):
        sess_config = tf.compat.v1.ConfigProto()
        sess_config.gpu_options.allow_growth = True
        if self.distributed:
            sess_config.gpu_options.visible_device_list = str(hvd.local_rank())
        return sess_config

    def _init_npu_session_config(self):
        sess_config = tf.ConfigProto()
        sess_config.graph_options.rewrite_options.remapping = RewriterConfig.OFF
        custom_op = sess_config.graph_options.rewrite_options.custom_optimizers.add()
        custom_op.name = "NpuOptimizer"
        if self.use_amp:
custom_op.parameter_map["precision_mode"].s = tf.compat.as_bytes("allow_mix_precision") custom_op.parameter_map["use_off_line"].b = True # custom_op.parameter_map['hcom_parallel'].b = True # custom_op.parameter_map["enable_data_pre_proc"].b = True # custom_op.parameter_map["mix_compile_mode"].b = True # mixed calculation # custom_op.parameter_map["min_group_size"].b = 1 return sess_config def _init_ms_context(self): if not zeus.is_ms_backend(): return if zeus.is_npu_device(): context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") else: context.set_context(mode=context.GRAPH_MODE, device_target="CPU") self.dataset_sink_mode = True if zeus.is_npu_device() else False def _shutdown_distributed(self): if zeus.is_npu_device() and self.distributed: self.sess.run(self.npu_shutdown) self.sess.close()
def valid(self, valid_loader):
    """Validate one step of the model.

    :param valid_loader: valid data loader
    """
    if zeus.is_torch_backend():
        import torch
        from zeus.metrics.pytorch import Metrics
        metrics = Metrics(self.config.metric)
        self.model.eval()
        data_num = 0
        latency_sum = 0.0
        with torch.no_grad():
            for step, batch in enumerate(valid_loader):
                if isinstance(batch, list) or isinstance(batch, tuple):
                    data = batch[0]
                    target = batch[1]
                else:
                    raise ValueError("The dataset format must be tuple or list, "
                                     "but got {}.".format(type(batch)))
                if self.config.cuda:
                    data, target = data.cuda(), target.cuda()
                    self.model = self.model.cuda()
                time_start = time.time()
                logits = self.model(data)
                latency_sum += time.time() - time_start
                metrics(logits, target)
                n = data.size(0)
                data_num += n
                if step % self.config.report_freq == 0:
                    logging.info("step [{}/{}], valid metric [{}]".format(
                        step + 1, len(valid_loader), str(metrics.results)))
        latency = latency_sum / data_num
    elif zeus.is_tf_backend():
        from zeus.metrics.tensorflow.metrics import Metrics
        metrics = Metrics(self.config.metric)
        estimator = self._init_tf_estimator()
        time_start = time.time()
        eval_metrics = estimator.evaluate(input_fn=valid_loader.input_fn, steps=len(valid_loader))
        latency = (time.time() - time_start) / (len(valid_loader) * valid_loader.args.batch_size)
        metrics.update(eval_metrics)
    elif zeus.is_ms_backend():
        from zeus.metrics.mindspore.metrics import Metrics
        from mindspore.train import Model as MsModel
        from .utils import FakeLoss
        metrics = Metrics(self.config.metric)
        metric_name = self.config.metric().type
        dataset_sink_mode = True if zeus.is_npu_device() else False
        # loss_fn is not actually needed for eval, but Model requires a
        # non-None loss_fn at initialization, so a placeholder loss is passed.
        ms_model = MsModel(network=self.model,
                           loss_fn=FakeLoss(),
                           metrics={metric_name: metrics()})
        time_start = time.time()
        eval_metrics = ms_model.eval(valid_dataset=valid_loader,
                                     callbacks=None,
                                     dataset_sink_mode=dataset_sink_mode)
        for batch in valid_loader.create_dict_iterator():
            batch_size = batch["image"].shape[0]
            break
        latency = (time.time() - time_start) / (valid_loader.get_dataset_size() * batch_size)
        metrics.update(eval_metrics)
    pfms = metrics.results
    if self.config.evaluate_latency:
        pfms["latency"] = latency
    logging.info("evaluate performance: {}".format(pfms))
    return pfms

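# FakeLoss above is imported from a local utils module that is not shown here.
# A minimal sketch of what such a placeholder loss could look like (an
# assumption, not the actual zeus implementation):
import mindspore.nn as nn
import mindspore.ops as ops

class FakeLoss(nn.Cell):
    """Placeholder loss: Model requires a non-None loss_fn even for pure eval."""
    def __init__(self):
        super(FakeLoss, self).__init__()
        self.reduce_sum = ops.ReduceSum()

    def construct(self, logits, label):
        # Return a constant-zero scalar; the value is never used during eval.
        return self.reduce_sum(logits) * 0
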
# Train the model
model = Model(network, loss_fn=net_loss, optimizer=net_opt, metrics={"acc"})
loss_cb = LossMonitor(per_print_times=int(cfg.train_size / cfg.batch_size))
config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
                             keep_checkpoint_max=cfg.keep_checkpoint_max)
ckpoint_cb = ModelCheckpoint(prefix=cfg.output_prefix, directory=cfg.output_directory,
                             config=config_ck)
print("============== Starting Training ==============")
model.train(cfg.epoch_size, ds_train, callbacks=[ckpoint_cb, loss_cb], dataset_sink_mode=True)

# Evaluate the model on the test set and print the overall accuracy
metric = model.eval(ds_test)
print(metric)

# Predict
test_ = ds_test.create_dict_iterator().get_next()
test = Tensor(test_['x'], mindspore.float32)
predictions = model.predict(test)
softmax = nn.Softmax()
predictions = softmax(predictions)
predictions = predictions.asnumpy()
for i in range(15):
    p_np = predictions[i, :]
    p_list = p_np.tolist()
    print('Prediction for sample ' + str(i) + ':', p_list.index(max(p_list)),
          ' ground truth:', test_['y'][i])

                    device_target=args.device_target)
ds_eval = create_dataset(os.path.join(args.data_path, "test"), cfg.batch_size, 1)

# define fusion network
network = LeNet5Fusion(cfg.num_classes)
# convert fusion network to quantization aware network
network = quant.convert_quant_network(network,
                                      quant_delay=0,
                                      bn_fold=False,
                                      freeze_bn=10000,
                                      per_channel=[True, False])
# define loss
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
# define network optimization
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
# callback and monitor
model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})
# load quantization aware network checkpoint
param_dict = load_checkpoint(args.ckpt_path)
not_load_param = load_param_into_net(network, param_dict)
if not_load_param:
    raise ValueError("Failed to load parameters into the network!")

print("============== Starting Testing ==============")
acc = model.eval(ds_eval, dataset_sink_mode=True)
print("============== {} ==============".format(acc))

class TrainerMs(TrainerBase):
    """Trainer mindspore class."""

    def build(self):
        """Build the trainer by assembling the necessary components."""
        super().build()
        self.optimizer = Optimizer()(model=self.model)
        if hasattr(self.model, 'add_loss'):
            loss_cls = Loss()()
            self.model.add_loss(loss_cls)
            self.loss = self.model.overall_loss()
        else:
            self.loss = Loss()()
        self.metric_name = self.config.metric.type
        # Some trainers use a train batch size that differs from the valid batch size.
        self.train_metrics = None
        self.valid_metrics = self._init_metrics()

    def _set_condition(self):
        self._init_distributed_setting()
        self._init_ms_context()

    def _train_epoch(self):
        self.ms_model = MsModel(network=self.model,
                                loss_fn=self.loss,
                                optimizer=self.optimizer,
                                metrics={self.metric_name: self.valid_metrics()})
        config_ck = CheckpointConfig(save_checkpoint_steps=self.config.save_steps)
        # save the network model and parameters for subsequent fine-tuning
        save_path = self.get_local_worker_path(self.step_name, self.worker_id)
        ckpoint_cb = ModelCheckpoint(config=config_ck, directory=save_path)
        loss_cb = LossMonitor(per_print_times=self.config.report_freq)
        eval_cb = EvalCallBack(self.ms_model, self.valid_loader, self.dataset_sink_mode)
        self.ms_model.train(epoch=self.epochs,
                            train_dataset=self.train_loader,
                            callbacks=[ckpoint_cb, loss_cb, eval_cb],
                            dataset_sink_mode=self.dataset_sink_mode)

    def _valid_epoch(self):
        self.callbacks.before_valid()
        valid_logs = None
        eval_metrics = self.ms_model.eval(valid_dataset=self.valid_loader,
                                          dataset_sink_mode=self.dataset_sink_mode)
        self.valid_metrics.update(eval_metrics)
        valid_logs = dict()
        valid_logs['cur_valid_perfs'] = self.valid_metrics.results
        self.callbacks.after_valid(valid_logs)

    def _init_distributed_setting(self):
        if not self.distributed:
            return

    def _init_ms_context(self):
        if zeus.is_npu_device():
            context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
        else:
            context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
        self.dataset_sink_mode = True if zeus.is_npu_device() else False

                    path where the trained ckpt file')
parser.add_argument('--dataset_sink_mode', type=bool, default=False,
                    help='dataset_sink_mode is False or True')
args = parser.parse_args()

context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target)

network = LeNet5(cfg.num_classes)
net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean")
repeat_size = cfg.epoch_size
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
                             keep_checkpoint_max=cfg.keep_checkpoint_max)
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", config=config_ck)
model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

print("============== Starting Testing ==============")
param_dict = load_checkpoint(args.ckpt_path)
load_param_into_net(network, param_dict)
ds_eval = create_dataset(os.path.join(args.data_path, "test"), cfg.batch_size, 1)
acc = model.eval(ds_eval, dataset_sink_mode=args.dataset_sink_mode)
print("============== {} ==============".format(acc))

    # apply DatasetOps
    buffer_size = 10000
    mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size)  # 10000 as in LeNet train script
    mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True)
    mnist_ds = mnist_ds.repeat(repeat_size)

    return mnist_ds


if __name__ == "__main__":
    network = LeNet5(10)
    network.set_param_ps()
    net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction="mean")
    net_opt = nn.Momentum(network.trainable_params(), 0.01, 0.9)
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    ds_train = create_dataset(os.path.join(dataset_path, "train"), 32, 1)
    model.train(1, ds_train, callbacks=[LossMonitor()], dataset_sink_mode=False)

    ds_eval = create_dataset(os.path.join(dataset_path, "test"), 32, 1)
    acc = model.eval(ds_eval, dataset_sink_mode=False)
    print("Accuracy:", acc['Accuracy'])
    assert acc['Accuracy'] > 0.93