def test_batchnorm_batch_parallel():
    """Train the batch-norm network for two epochs in semi-auto-parallel mode.

    The loss operator receives a strategy that splits the first axis of both
    of its inputs across ``dev_num`` devices. The test passes when
    ``model.train`` completes without raising.
    """
    num_classes = 1001
    batch_size = 32
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2

    # Constant all-ones inputs: only compilation/execution is exercised here.
    predict = Tensor(np.ones([batch_size, 3, 224, 224]), dtype=ms.float32)
    label = Tensor(np.ones([batch_size]), dtype=ms.int32)
    dataset = DatasetLenet(predict, label, 2)

    net = batchnorm_net(num_classes)
    loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    loss.softmax_cross_entropy.set_strategy(((dev_num, 1), (dev_num, 1)))
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   learning_rate, momentum)

    # The parallel context is configured only after the network is built,
    # matching the original statement order.
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=dev_num)
    context.set_context(mode=context.GRAPH_MODE)

    model = Model(net, loss, opt)
    model.train(epoch_size, dataset, dataset_sink_mode=False)
def test_gpu_profiler(self):
    """Profile one LeNet5 training epoch on GPU and check the profiling files.

    Raises:
        ValueError: If the training dataset reports a size of zero.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    profiler = Profiler(output_path='data')

    # The first entry found under ./data is taken as the profiler directory.
    data_root = os.path.join(os.getcwd(), 'data')
    profiler_name = os.listdir(data_root)[0]
    self.profiler_path = os.path.join(os.getcwd(), f'data/{profiler_name}/')

    train_set = create_dataset(os.path.join(self.mnist_path, "train"))
    if train_set.get_dataset_size() == 0:
        raise ValueError(
            "Please check dataset size > 0 and batch_size <= dataset size")

    network = LeNet5()
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    optimizer = Momentum(network.trainable_params(), learning_rate=0.1, momentum=0.9)
    model = Model(network, loss_fn=criterion, optimizer=optimizer,
                  metrics={'acc': Accuracy()})
    model.train(1, train_set, dataset_sink_mode=True)

    profiler.analyse()
    self._check_gpu_profiling_file()
def test_dp_monitor_gpu():
    """Train LeNet5 on GPU for one epoch with an RDP privacy monitor callback."""
    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    batch_size = 16
    batches = 128
    epochs = 1

    monitor_kwargs = dict(num_samples=60000,
                          batch_size=batch_size,
                          initial_noise_multiplier=0.4,
                          noise_decay_rate=6e-5)
    rdp = PrivacyMonitorFactory.create(policy='rdp', **monitor_kwargs)
    suggest_epoch = rdp.max_epoch_suggest()
    LOGGER.info(TAG, 'The recommended maximum training epochs is: %s',
                suggest_epoch)

    network = LeNet5()
    criterion = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True,
                                                 reduction="mean")
    optimizer = nn.Momentum(network.trainable_params(), 0.01, 0.9)
    model = Model(network, criterion, optimizer)

    LOGGER.info(TAG, "============== Starting Training ==============")
    train_set = ds.GeneratorDataset(dataset_generator(batch_size, batches),
                                    ["data", "label"])
    train_set.set_dataset_size(batch_size * batches)
    model.train(epochs, train_set, callbacks=[rdp], dataset_sink_mode=False)
def mnist_train(epoch_size, batch_size, lr, momentum):
    """Train LeNet5 on MNIST, reload a saved checkpoint and log test accuracy.

    Args:
        epoch_size: Number of epochs passed to ``model.train``.
        batch_size: Batch size used for both the train and test datasets.
        lr: Learning rate for the Momentum optimizer.
        momentum: Momentum factor for the Momentum optimizer.
    """
    data_root = "./MNIST_unzip/"
    train_set = generate_mnist_dataset(os.path.join(data_root, "train"),
                                       batch_size=batch_size, repeat_size=1)

    network = LeNet5()
    criterion = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True,
                                                 reduction="mean")
    optimizer = nn.Momentum(network.trainable_params(), lr, momentum)

    checkpoint_cb = ModelCheckpoint(
        prefix="checkpoint_lenet",
        directory="./trained_ckpt_file/",
        config=CheckpointConfig(save_checkpoint_steps=1875,
                                keep_checkpoint_max=10))
    model = Model(network, criterion, optimizer, metrics={"Accuracy": Accuracy()})

    LOGGER.info(TAG, "============== Starting Training ==============")
    model.train(epoch_size, train_set,
                callbacks=[checkpoint_cb, LossMonitor()],
                dataset_sink_mode=False)

    LOGGER.info(TAG, "============== Starting Testing ==============")
    # NOTE(review): the checkpoint name is fixed; it presumably only exists
    # when training ran 10 epochs of 1875 steps - confirm against callers.
    weights = load_checkpoint("trained_ckpt_file/checkpoint_lenet-10_1875.ckpt")
    load_param_into_net(network, weights)

    test_set = generate_mnist_dataset(os.path.join(data_root, "test"),
                                      batch_size=batch_size)
    acc = model.eval(test_set, dataset_sink_mode=False)
    LOGGER.info(TAG, "============== Accuracy: %s ==============", acc)
def _run_network(self, dataset_sink_mode=True):
    """Train and evaluate LeNet5 with a SummaryCollector, then check its output.

    Args:
        dataset_sink_mode: Forwarded to both ``model.train`` and ``model.eval``.
    """
    network = LeNet5()
    criterion = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True,
                                                 reduction="mean")
    optimizer = Momentum(network.trainable_params(), learning_rate=0.1, momentum=0.9)
    model = Model(network, loss_fn=criterion, optimizer=optimizer,
                  metrics={'acc': Accuracy()})

    # Each run writes into a fresh temporary directory under the shared base.
    summary_dir = tempfile.mkdtemp(dir=self.base_summary_dir)
    collector = SummaryCollector(summary_dir=summary_dir, collect_freq=1)

    train_set = create_dataset(os.path.join(self.mnist_path, "train"))
    model.train(1, train_set, callbacks=[collector],
                dataset_sink_mode=dataset_sink_mode)

    eval_set = create_dataset(os.path.join(self.mnist_path, "test"))
    model.eval(eval_set, dataset_sink_mode=dataset_sink_mode,
               callbacks=[collector])

    self._check_summary_result(summary_dir)
def _get_epistemic_uncertainty_model(self):
    """
    Build, and if needed train or load, the epistemic uncertainty model.

    Nothing happens when ``self.epi_uncer_model`` already exists. Otherwise
    the model is created; when its ``drop_count`` is 0 and a training dataset
    is available it is either trained (optionally saving checkpoints) or its
    parameters are loaded from ``self.epi_uncer_model_path``.
    """
    # NOTE(review): nesting was reconstructed from the scope of `model`;
    # confirm against the original layout.
    if self.epi_uncer_model is not None:
        return

    self.epi_uncer_model = EpistemicUncertaintyModel(self.epi_model)
    if self.epi_uncer_model.drop_count != 0 or self.epi_train_dataset is None:
        return

    is_classification = self.task_type == 'classification'
    if is_classification:
        net_loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    else:
        net_loss = MSELoss()
    net_opt = Adam(self.epi_uncer_model.trainable_params())
    if is_classification:
        metrics = {"Accuracy": Accuracy()}
    else:
        metrics = {"MSE": MSE()}
    model = Model(self.epi_uncer_model, net_loss, net_opt, metrics=metrics)

    if self.save_model:
        config_ck = CheckpointConfig(keep_checkpoint_max=self.epochs)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_epi_uncer_model',
                                     directory=self.epi_uncer_model_path,
                                     config=config_ck)
        callbacks = [ckpoint_cb, LossMonitor()]
    elif self.epi_uncer_model_path is None:
        callbacks = [LossMonitor()]
    else:
        # A checkpoint path was supplied and saving is off: load, don't train.
        uncer_param_dict = load_checkpoint(self.epi_uncer_model_path)
        load_param_into_net(self.epi_uncer_model, uncer_param_dict)
        return

    model.train(self.epochs, self.epi_train_dataset, dataset_sink_mode=False,
                callbacks=callbacks)
def test_summary_ops(self):
    """Check that summary operators record the input image and a non-zero scalar."""
    train_set = create_mnist_dataset('train', num_samples=1, batch_size=1)
    first_batch = next(train_set.create_dict_iterator())
    expected_data = first_batch['image'].asnumpy()

    network = LeNet5()
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    optimizer = Momentum(network.trainable_params(), learning_rate=0.1, momentum=0.9)
    model = Model(network, loss_fn=criterion, optimizer=optimizer,
                  metrics={'loss': Loss()})
    model.train(1, train_set, dataset_sink_mode=False)

    # Fetch every recorded value first, then assert, as the original does.
    summary_data = _get_summary_tensor_data()
    recorded = {
        key: summary_data[key].asnumpy()
        for key in ('x[:Image]', 'x[:Tensor]', 'x_fc3[:Scalar]')
    }

    assert np.allclose(expected_data, recorded['x[:Image]'])
    assert np.allclose(expected_data, recorded['x[:Tensor]'])
    assert not np.allclose(0, recorded['x_fc3[:Scalar]'])
def train_lenet_quant():
    """Train a quantization-aware LeNet5 initialised from a non-quantized checkpoint.

    Reads its settings from the module-level ``quant_cfg``, ``data_path`` and
    ``device_target``. A checkpoint callback and a loss monitor are attached
    to training.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    cfg = quant_cfg
    ckpt_path = './ckpt_lenet_noquant-10_1875.ckpt'
    ds_train = create_dataset(os.path.join(data_path, "train"), cfg.batch_size, 1)
    step_size = ds_train.get_dataset_size()

    # define fusion network
    network = LeNet5Fusion(cfg.num_classes)

    # load quantization aware network checkpoint
    param_dict = load_checkpoint(ckpt_path)
    load_nonquant_param_into_quant_net(network, param_dict)

    # convert fusion network to quantization aware network
    network = quant.convert_quant_network(network,
                                          quant_delay=900,
                                          bn_fold=False,
                                          per_channel=[True, False],
                                          symmetric=[False, False])

    # define network loss
    net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    # define network optimization
    net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)

    # call back and monitor
    config_ckpt = CheckpointConfig(save_checkpoint_steps=cfg.epoch_size * step_size,
                                   keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpt_callback = ModelCheckpoint(prefix="ckpt_lenet_quant", config=config_ckpt)

    # define model
    model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

    print("============== Starting Training ==============")
    # Attribute access, like every other cfg read in this function, so the
    # config object does not also have to support subscripting.
    model.train(cfg.epoch_size, ds_train,
                callbacks=[ckpt_callback, LossMonitor()],
                dataset_sink_mode=True)
    print("============== End Training ==============")
def test_callbacks_non_sink_batch_size2():
    """Check the event order of dataset and model callbacks with batch size 2."""
    logger.info("test_callbacks_non_sink_batch_size2")

    events = []
    dataset_cb = MyWaitedCallback(events, 2)
    model_cb = MyMSCallback(events)

    values = [1, 2, 3, 4]
    data = ds.NumpySlicesDataset((values, values), column_names=["c1", "c2"],
                                 shuffle=False)
    data = data.map(operations=(lambda x: x), callbacks=dataset_cb)
    data = data.batch(2)

    model = Model(Net())
    model.train(2, data, dataset_sink_mode=False,
                callbacks=[model_cb, dataset_cb])

    expected_synced_events = [
        'ms_step_end_1_1',
        'ds_step_begin_1_3',
        'ms_step_end_1_2',
        'ms_epoch_end_1_2',
        'ds_epoch_begin_2_4',
        'ds_step_begin_2_5',
        'ms_step_end_2_3',
        'ds_step_begin_2_7',
        'ms_step_end_2_4',
        'ms_epoch_end_2_4',
    ]
    assert events == expected_synced_events
def main(data_path, device_target='Ascend', summary_dir='./summary_dir', learning_rate=0.01):
    """Train LeNet5 for one epoch while recording summary data.

    Args:
        data_path: Directory that contains the ``train`` dataset folder.
        device_target: Device target passed to ``context.set_context``.
        summary_dir: Directory handed to the SummaryCollector.
        learning_rate: Learning rate for the Momentum optimizer.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target=device_target)
    momentum = 0.9
    epoch_size = 1
    batch_size = 32

    network = LeNet5()
    network.set_train()
    criterion = CrossEntropyLoss()
    optimizer = nn.Momentum(network.trainable_params(), learning_rate, momentum)
    model = Model(network, criterion, optimizer)

    # SummaryCollector records summary data during model.train / model.eval.
    collector = SummaryCollector(summary_dir=summary_dir, collect_freq=10)
    train_set = create_dataset(os.path.join(data_path, "train"),
                               batch_size=batch_size)

    print("============== Starting Training ==============")
    model.train(epoch_size, train_set, callbacks=[collector],
                dataset_sink_mode=False)
    print("============== Train End =====================")
def test_auto_parallel_arithmetic_model():
    """Train a MatMul -> OneHot -> MatMul network in auto-parallel mode on 8 devices."""

    class NetOneHot(nn.Cell):
        """Two matmuls with a sharded one-hot of the second input in between."""

        def __init__(self):
            super().__init__()
            self.matmul = P.MatMul()
            self.one_hot = P.OneHot().shard(((1, 8), (), ()))
            self.on_value = Tensor(1.0, ms.float32)
            self.off_value = Tensor(0.0, ms.float32)
            self.matmul2 = P.MatMul()
            self.w = Parameter(Tensor(np.zeros([32, 64]).astype(np.float32)),
                               "weight", requires_grad=True)

        def construct(self, x, b):
            projected = self.matmul(x, self.w)
            encoded = self.one_hot(b, 64, self.on_value, self.off_value)
            return self.matmul2(projected, encoded)

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(device_num=8, global_rank=0,
                                      parallel_mode=ParallelMode.AUTO_PARALLEL)

    net = NetOneHot()
    inputs = Tensor(np.ones([8, 32]), dtype=ms.float32)
    indices = Tensor(np.ones([8]), dtype=ms.int32)
    dataset = Dataset(inputs, indices, 2)

    optimizer = Momentum(net.trainable_params(), 0.1, 0.9)
    model = Model(net, optimizer=optimizer)
    model.train(2, dataset, dataset_sink_mode=False)
def train(data_dir, lr=0.01, momentum=0.9, num_epochs=2, ckpt_name="lenet"):
    """Train LeNet5, save checkpoints under ``ckpt`` and print evaluation metrics.

    Args:
        data_dir: Dataset directory passed to ``create_dataset``.
        lr: Learning rate for the Momentum optimizer.
        momentum: Momentum factor for the Momentum optimizer.
        num_epochs: Number of training epochs.
        ckpt_name: Prefix used for saved checkpoint files.
    """
    # Dataset sink mode is enabled only when the device target is Ascend;
    # in that case the training data is repeated once per epoch.
    dataset_sink = context.get_context('device_target') == 'Ascend'
    if dataset_sink:
        repeat = num_epochs
    else:
        repeat = 1

    train_set = create_dataset(data_dir, repeat=repeat)
    eval_set = create_dataset(data_dir, training=False)
    steps_per_epoch = train_set.get_dataset_size()

    net = LeNet5()
    criterion = nn.loss.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True,
                                                      reduction='mean')
    optimizer = nn.Momentum(net.trainable_params(), lr, momentum)

    ckpt_cb = ModelCheckpoint(
        prefix=ckpt_name,
        directory='ckpt',
        config=CheckpointConfig(save_checkpoint_steps=steps_per_epoch,
                                keep_checkpoint_max=5))
    loss_cb = LossMonitor(steps_per_epoch)

    model = Model(net, criterion, optimizer, metrics={'acc', 'loss'})
    model.train(num_epochs, train_set, callbacks=[ckpt_cb, loss_cb],
                dataset_sink_mode=dataset_sink)

    metrics = model.eval(eval_set, dataset_sink_mode=dataset_sink)
    print('Metrics:', metrics)
def test_trains():
    """Train a fully connected network in semi-auto-parallel mode on 32 devices.

    The auto-parallel context is reset before the run and again afterwards,
    including when training raises, so a failure here does not leak parallel
    settings into later tests.
    """
    init()
    lr = 0.1
    momentum = 0.9
    max_epoch = 20
    device_number = 32
    batch_size_per_device = 128
    input_channels = 256
    out_channels = 512

    context.reset_auto_parallel_context()
    try:
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL,
            device_num=device_number)
        predict = Tensor(np.ones([batch_size_per_device, input_channels]),
                         dtype=ms.float32)
        dataset = Dataset(predict, 4)

        network = fc_with_initialize(input_channels, out_channels)
        network.set_train()

        criterion = get_loss(batch_size_per_device * device_number)

        train_network = BuildTrainNetwork(network, criterion)
        train_network.set_train()
        opt = Momentum(train_network.trainable_params(), lr, momentum)
        train_net = TrainOneStepCell(train_network, opt).set_train()

        model = Model(train_net)
        model.train(max_epoch, dataset, dataset_sink_mode=False)
    finally:
        # Always restore the default parallel context, even on failure.
        context.reset_auto_parallel_context()
def loss_scale_manager_common(strategy1):
    """Train with a dynamic loss-scale manager and expect a ``TypeError``.

    Args:
        strategy1: Parallel strategy forwarded to ``all_to_all_net``.
    """
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2

    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL,
                                      device_num=8)

    predict = Tensor(np.ones([32, 128]), dtype=ms.float32)
    label = Tensor(np.ones([32]), dtype=ms.int32)
    dataset = Dataset(predict, label, 2)

    net = all_to_all_net(strategy1)
    criterion = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    criterion.softmax_cross_entropy.set_strategy(((8, 1), (8, 1)))
    optimizer = Momentum(net.trainable_params(), learning_rate, momentum)

    scale_manager = DynamicLossScaleManager(32, 2, 2000)
    model = Model(net, criterion, optimizer, loss_scale_manager=scale_manager)

    # if no GE exists, outputs = self._train_network(*next_element) outputs inputs tensor.
    raised_type_error = False
    try:
        model.train(epoch_size, dataset, dataset_sink_mode=False)
    except TypeError:
        raised_type_error = True
    assert raised_type_error
def mix_parallel_matmul_trains(self):
    """Train MatmulNet with explicit per-operator strategies in auto-parallel mode.

    Returns:
        numpy.ndarray: Loss values collected by the ModelCallback.
    """
    parallel_callback = ModelCallback()

    # Strategy shapes that are shared by several operators.
    split_columns = ((1, device_num), )
    split_rows_pair = ((device_num, 1), (device_num, 1))

    matmul_stra = ((device_num, 1), (1, 1))
    # The order of this list is significant and matches the original.
    loss_stra_list = [
        split_columns,                           # exp
        split_columns,                           # reduce_sum
        ((1, device_num), (), ()),               # onehot
        ((1, device_num), (1, 1)),               # div
        split_columns,                           # log
        split_columns,                           # sum_cross_entropy
        ((1, device_num), (1, device_num)),      # mul
        ((), (device_num, )),                    # mul2
        ((device_num, ), ),                      # reduce_mean
        split_columns,                           # reduce_max
        split_rows_pair,                         # sub
    ]

    context.set_auto_parallel_context(parallel_mode="auto_parallel")
    net = MatmulNet(matmul_stra=matmul_stra, loss_stra_list=loss_stra_list)
    optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
    model = Model(net, optimizer=optimizer)

    epoch_size = 6
    dataset = Dataset(self.input_part, self.label_part)
    model.train(epoch_size, dataset, callbacks=parallel_callback,
                dataset_sink_mode=False)
    return np.array(parallel_callback.loss_list)
def net_trains(criterion, rank):
    """Train a fully connected network with the given loss in semi-auto-parallel mode.

    The auto-parallel context is reset before the run and again afterwards,
    including when training raises, so parallel settings do not leak into
    later callers.

    Args:
        criterion: Loss cell wrapped together with the network.
        rank: Global rank passed to the auto-parallel context.
    """
    init()
    lr = 0.1
    momentum = 0.9
    max_epoch = 20
    input_channels = 256
    out_channels = 512

    context.set_context(mode=context.GRAPH_MODE, save_graphs=False)
    context.reset_auto_parallel_context()
    try:
        context.set_auto_parallel_context(
            parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL,
            device_num=device_number,
            global_rank=rank)
        predict = Tensor(np.ones([batch_size_per_device, input_channels]),
                         dtype=ms.float32)
        dataset = Dataset(predict, 4)

        network = fc_with_initialize(input_channels, out_channels)
        network.set_train()

        train_network = BuildTrainNetwork(network, criterion)
        train_network.set_train()
        opt = Momentum(train_network.trainable_params(), lr, momentum)
        train_net = TrainOneStepCell(train_network, opt).set_train()

        model = Model(train_net)
        model.train(max_epoch, dataset, dataset_sink_mode=False)
    finally:
        # Always restore the default parallel context, even on failure.
        context.reset_auto_parallel_context()
def test_row_tensor_model_train():
    """Train a small add-and-cast network for two epochs without a loss function."""

    class Net(nn.Cell):
        """Adds a trainable weight to the input and casts the sum to float32."""

        def __init__(self, in_features, out_features):
            super().__init__()
            initial = np.ones([out_features, in_features]).astype(np.float32)
            self.weight = Parameter(Tensor(initial), name="weight")
            self.add = P.TensorAdd()
            self.cast = P.Cast()
            self.flag = True

        def construct(self, inputs, label):
            x = self.add(inputs, self.weight)
            if self.flag:
                x = self.cast(x, mstype.float32)
            return x

    dataset = MindDataSet((np.float32, np.float32), ((16, 16), (16, 16)))

    net = Net(16, 16)
    net.set_train()

    optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
    model = Model(net, optimizer=optimizer)
    model.train(2, dataset, dataset_sink_mode=False)
def test_resnet_model_parallel():
    """Train the model-parallel ResNet net for two epochs in semi-auto-parallel mode."""
    num_classes = 1024
    batch_size = 32
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2

    # The parallel context is configured in two separate calls, as originally.
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(device_num=dev_num, global_rank=0)
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL, device_num=dev_num)
    context.set_context(mode=context.GRAPH_MODE)

    images = Tensor(np.ones([batch_size, 64, 112, 112]), dtype=ms.float32)
    targets = Tensor(np.ones([batch_size]), dtype=ms.int32)
    dataset = DatasetLenet(images, targets, 2)

    net = resnet_model_parallel_net(num_classes)
    criterion = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
    criterion.softmax_cross_entropy.shard(((dev_num, 1), (dev_num, 1)))
    trainable = filter(lambda x: x.requires_grad, net.get_parameters())
    optimizer = Momentum(trainable, learning_rate, momentum)

    model = Model(net, criterion, optimizer)
    model.train(epoch_size, dataset, dataset_sink_mode=False)
def _get_aleatoric_uncertainty_model(self):
    """
    Build, and then train or load, the aleatoric uncertainty model.

    Nothing happens when ``self.ale_uncer_model`` already exists. Otherwise
    the model is created and either trained (optionally saving checkpoints)
    or its parameters are loaded from ``self.ale_uncer_model_path``.
    """
    # NOTE(review): nesting was reconstructed from the scope of `model`;
    # confirm against the original layout.
    if self.ale_uncer_model is not None:
        return

    self.ale_uncer_model = AleatoricUncertaintyModel(
        self.ale_model, self.num_classes, self.task_type)
    net_loss = AleatoricLoss(self.task_type)
    net_opt = Adam(self.ale_uncer_model.trainable_params())

    if self.task_type == 'classification':
        metrics = {"Accuracy": Accuracy()}
    else:
        metrics = {"MSE": MSE()}
    model = Model(self.ale_uncer_model, net_loss, net_opt, metrics=metrics)

    if self.save_model:
        config_ck = CheckpointConfig(keep_checkpoint_max=self.epochs)
        ckpoint_cb = ModelCheckpoint(prefix='checkpoint_ale_uncer_model',
                                     directory=self.ale_uncer_model_path,
                                     config=config_ck)
        callbacks = [ckpoint_cb, LossMonitor()]
    elif self.ale_uncer_model_path is None:
        callbacks = [LossMonitor()]
    else:
        # A checkpoint path was supplied and saving is off: load, don't train.
        uncer_param_dict = load_checkpoint(self.ale_uncer_model_path)
        load_param_into_net(self.ale_uncer_model, uncer_param_dict)
        return

    model.train(self.epochs, self.ale_train_dataset, callbacks=callbacks)
def _run_network(self, dataset_sink_mode=False, num_samples=2, **kwargs):
    """Train and evaluate LeNet5 with a SummaryCollector.

    Args:
        dataset_sink_mode: Forwarded to both ``model.train`` and ``model.eval``.
        num_samples: Number of samples requested for the training dataset.
        **kwargs: Extra keyword arguments forwarded to ``SummaryCollector``.

    Returns:
        The temporary summary directory created for this run.
    """
    network = LeNet5()
    criterion = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    optimizer = Momentum(network.trainable_params(), learning_rate=0.1, momentum=0.9)
    model = Model(network, loss_fn=criterion, optimizer=optimizer,
                  metrics={'loss': Loss()})

    summary_dir = tempfile.mkdtemp(dir=self.base_summary_dir)
    collector = SummaryCollector(summary_dir=summary_dir, collect_freq=2, **kwargs)

    train_set = create_dataset(os.path.join(self.mnist_path, "train"),
                               num_samples=num_samples)
    model.train(1, train_set, callbacks=[collector],
                dataset_sink_mode=dataset_sink_mode)

    eval_set = create_dataset(os.path.join(self.mnist_path, "test"))
    model.eval(eval_set, dataset_sink_mode=dataset_sink_mode,
               callbacks=[collector])
    return summary_dir
def test_callbacks_non_sink_mismatch_size():
    """Expect a callback timeout when the batch size does not match the wait step.

    The global dataset callback timeout is lowered to 1 second for the test
    and restored afterwards, including when the test fails, so the shortened
    timeout cannot leak into other tests.
    """
    logger.info("test_callbacks_non_sink_mismatch_size")
    default_timeout = ds.config.get_callback_timeout()
    ds.config.set_callback_timeout(1)
    try:
        events = []
        my_cb1 = MyWaitedCallback(events, 2)
        my_cb2 = MyMSCallback(events)
        arr = [1, 2, 3, 4]
        data = ds.NumpySlicesDataset((arr, arr), column_names=["c1", "c2"],
                                     shuffle=False)
        data = data.map(operations=(lambda x: x), callbacks=my_cb1)
        data = data.batch(3)
        net = Net()
        model = Model(net)
        with pytest.raises(Exception) as err:
            model.train(2, data, dataset_sink_mode=False,
                        callbacks=[my_cb2, my_cb1])
        assert "RuntimeError: ds_step_begin timed out after 1 second(s)" in str(
            err.value)
    finally:
        # Restore the global timeout regardless of the test outcome.
        ds.config.set_callback_timeout(default_timeout)
def train_common(net):
    """Train ``net`` with all-reduce fusion enabled and return the fusion info.

    Args:
        net: Network to train in semi-auto-parallel mode on 4 devices.

    Returns:
        The value returned by ``_executor._get_allreduce_fusion`` for the
        model's train network.
    """
    batch_size = 32
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2
    device_num = 4

    context.reset_auto_parallel_context()
    auto_parallel_context().set_enable_all_reduce_fusion(
        enable_all_reduce_fusion=True)
    context.set_auto_parallel_context(
        parallel_mode=ParallelMode.SEMI_AUTO_PARALLEL,
        device_num=device_num,
        parameter_broadcast=False)
    context.set_context(mode=context.GRAPH_MODE)

    features = Tensor(np.ones([batch_size, 128]), dtype=ms.float32)
    targets = Tensor(np.ones([batch_size]), dtype=ms.int32)
    dataset = Dataset(features, targets, 2)

    criterion = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    optimizer = Momentum(net.trainable_params(), learning_rate, momentum)
    model = Model(net, criterion, optimizer)
    model.train(epoch_size, dataset, dataset_sink_mode=False)

    fusion_info = _executor._get_allreduce_fusion(model._train_network)
    print(fusion_info)
    return fusion_info
def train():
    """Train the EAST (VGG backbone) text detector on the ICDAR training set.

    Downloads the dataset, builds the training pipeline from the images and
    ground truth under ``/cache``, wraps the network with its loss and
    training cells, and runs 600 epochs with time and loss monitors.
    """
    # Optional context settings previously toggled here: save_graphs,
    # save_graphs_path, enable_reduce_precision, device_id.
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")

    epoch = 600
    my_dataset.download_dataset()
    train_img_path = os.path.abspath('/cache/train_img')
    train_gt_path = os.path.abspath('/cache/train_gt')

    # Earlier experiments used my_dataset / datasetV3 pipelines (mindrecord
    # and demo variants); only the datasetV2 pipeline is active.
    dataset = datasetV2.create_icdar_train_dataset(train_img_path,
                                                   train_gt_path,
                                                   batch_size=14,
                                                   repeat_num=1,
                                                   is_training=True,
                                                   num_parallel_workers=24)
    dataset_size = dataset.get_dataset_size()
    print("Create dataset done!, dataset_size: ", dataset_size)

    net = EAST_VGG.EAST()

    milestone = [100, 300]
    learning_rates = [1e-3, 1e-4]
    lr = piecewise_constant_lr(milestone, learning_rates)
    trainable = filter(lambda x: x.requires_grad, net.get_parameters())
    opt = nn.Adam(trainable, learning_rate=lr)

    # Wrap the backbone with its loss cell, then with the training wrapper.
    net = my_loss.EASTWithLossCell(net)
    net = my_loss.TrainingWrapper(net, opt)
    net.set_train(True)

    # Checkpoint saving is currently disabled; only monitors are attached.
    callback = [TimeMonitor(data_size=dataset_size), LossMonitor()]
    model = Model(net)
    dataset_sink_mode = False
    print("start trainig")
    model.train(epoch, dataset, callbacks=callback,
                dataset_sink_mode=dataset_sink_mode)
def compile_net(net):
    """Compile and train ``net`` for two epochs, then reset the parallel context.

    The auto-parallel context is reset even when training raises, so a
    failing (or intentionally failing) network does not leave parallel
    settings behind for later tests.

    Args:
        net: Network to train with a Momentum optimizer and no loss function.
    """
    context.set_context(save_graphs=False)
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2
    try:
        dataset = Dataset(_x, _b)
        opt = Momentum(net.trainable_params(), learning_rate, momentum)
        model = Model(net, optimizer=opt)
        model.train(epoch_size, dataset, dataset_sink_mode=False)
    finally:
        # Always restore the default parallel context, even on failure.
        context.reset_auto_parallel_context()
def single_matmul_trains(self):
    """Train MatmulNet on the full inputs and return the recorded losses.

    Returns:
        numpy.ndarray: Loss values collected by the ModelCallback.
    """
    loss_recorder = ModelCallback()

    net = MatmulNet()
    model = Model(
        net,
        optimizer=Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9),
    )

    epoch_size = 6
    full_dataset = Dataset(self.input_full, self.label_full)
    model.train(epoch_size, full_dataset, callbacks=loss_recorder,
                dataset_sink_mode=False)
    return np.array(loss_recorder.loss_list)
def data_parallel_matmul_trains(self):
    """Train MatmulNet on the partial inputs in semi-auto-parallel mode.

    Returns:
        numpy.ndarray: Loss values collected by the ModelCallback.
    """
    loss_recorder = ModelCallback()
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")

    net = MatmulNet()
    model = Model(
        net,
        optimizer=Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9),
    )

    epoch_size = 6
    part_dataset = Dataset(self.input_part, self.label_part)
    model.train(epoch_size, part_dataset, callbacks=loss_recorder,
                dataset_sink_mode=False)
    return np.array(loss_recorder.loss_list)
def compile_net(net):
    """Compile and train ``net`` with amp level O2, then reset the parallel context.

    The auto-parallel context is reset even when training raises, so a
    failing (or intentionally failing) network does not leave parallel
    settings behind for later tests.

    Args:
        net: Network to train with softmax cross-entropy loss and Momentum.
    """
    context.set_context(save_graphs=True)
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2
    try:
        dataset = Dataset(_x, _b)
        loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
        opt = Momentum(net.trainable_params(), learning_rate, momentum)
        model = Model(net, loss, optimizer=opt, amp_level="O2")
        model.train(epoch_size, dataset, dataset_sink_mode=False)
    finally:
        # Always restore the default parallel context, even on failure.
        context.reset_auto_parallel_context()
def test_compile_f16_model_train():
    """Train NetFP16 with an MSE loss for two epochs in non-sink mode."""
    dataset = MindDataSet((np.float32, np.float32), ((16, 16), (16, 16)))

    net = NetFP16(16, 16)
    net.set_train()

    criterion = MSELoss()
    optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
    model = Model(net, loss_fn=criterion, optimizer=optimizer, metrics=None)
    model.train(2, dataset, dataset_sink_mode=False)
def main():
    """Train the ELBO network on MNIST and save original, reconstructed and sampled images."""
    # We currently support pynative mode with device GPU
    context.set_context(mode=context.PYNATIVE_MODE, device_target='GPU')

    epoch_size = 1
    batch_size = 32
    mnist_path = "/data/chengzi/zhusuan-mindspore/data/MNIST"
    repeat_size = 1

    # Model dimensions.
    z_dim = 40
    x_dim = 32 * 32

    # Build the network from its generator and variational parts.
    generator = Generator(x_dim, z_dim, batch_size)
    variational = Variational(x_dim, z_dim, batch_size)
    network = zs.variational.ELBO(generator, variational)

    # Loss and optimizer.
    lr = 0.001
    net_loss = ReduceMeanLoss()
    print(network.trainable_params()[0])
    net_opt = nn.Adam(network.trainable_params(), lr)

    model = Model(network, net_loss, net_opt)
    train_set = create_dataset(os.path.join(mnist_path, "train"), batch_size,
                               repeat_size)
    model.train(epoch_size, train_set, callbacks=[LossMonitor()],
                dataset_sink_mode=False)
    print(network.trainable_params()[0])

    # Take only the first batch and flatten each image into a row.
    for item in train_set.create_tuple_iterator():
        batch_x = item[0].reshape(32, 32 * 32)
        break

    # Reconstruct the first batch through the variational and generator nets.
    z, _ = network.variational(Tensor(batch_x), None, None)
    sample, _, _, _ = network.generator(None, z, None)
    sample = sample.asnumpy()
    save_img(batch_x, 'result/origin_x.png')
    save_img(sample, 'result/reconstruct_x.png')

    # Draw four batches from the generator and stack them along axis 0.
    drawn = []
    for _ in range(4):
        sample, _, _, _ = network.generator(None, None, None)
        drawn.append(sample.asnumpy())
    samples = np.concatenate(drawn, axis=0)
    save_img(samples, 'result/sample_x.png', num=4 * batch_size)
def test_compile_f16_model_train_fixed():
    """Train NetFP16 with an MSE loss and a fixed loss-scale manager for two epochs."""
    dataset = MindDataSet((np.float32, np.float32), ((16, 16), (16, 16)))

    net = NetFP16(16, 16)
    net.set_train()

    scale_manager = FixedLossScaleManager()
    criterion = MSELoss()
    optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
    model = Model(net, loss_fn=criterion, optimizer=optimizer, metrics=None,
                  loss_scale_manager=scale_manager)
    # dataset_sink_mode is left at the Model.train default here.
    model.train(2, dataset)