# Paths for the MNIST splits and the checkpoint output directory.
train_data_path = "./datasets/MNIST_Data/train"
eval_data_path = "./datasets/MNIST_Data/test"
model_path = "./models/ckpt/custom_debugging_info/"

net_loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
repeat_size = 1
network = LeNet5()

# Evaluate with a full metric suite, not just accuracy.
metrics = {
    'accuracy': nn.Accuracy(),
    'loss': nn.Loss(),
    'precision': nn.Precision(),
    'recall': nn.Recall(),
    'f1_score': nn.F1()
}
net_opt = nn.Momentum(network.trainable_params(), lr, momentum)

# Checkpoint every 1875 steps (one MNIST epoch at batch 32), keep at most 10 files.
config_ck = CheckpointConfig(save_checkpoint_steps=1875, keep_checkpoint_max=10)
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                             directory=model_path,
                             config=config_ck)
model = Model(network, net_loss, net_opt, metrics=metrics)

print("============== Starting Training ==============")
ds_train = create_dataset(train_data_path, repeat_size=repeat_size)
# StopAtTime aborts training after a wall-clock budget — presumably minutes;
# confirm against StopAtTime's definition.
stop_cb = StopAtTime(run_time=0.6)
model.train(epoch_size, ds_train,
            callbacks=[ckpoint_cb, LossMonitor(375), stop_cb],
            dataset_sink_mode=False)

print("============== Starting Testing ==============")
ds_eval = create_dataset(eval_data_path, repeat_size=repeat_size)
acc = model.eval(ds_eval, dataset_sink_mode=False)
print("============== Accuracy:{} ==============".format(acc))
""" import mindspore.nn as nn from mindspore import context, Model from mindspore.train.callback import LossMonitor from mindspore.nn.metrics import Accuracy from src.lenet import LeNet5 from src.datasets import create_dataset if __name__ == "__main__": context.set_context(mode=context.GRAPH_MODE, device_target="GPU") ds_train = create_dataset("./datasets/MNIST_Data/train", 32) ds_eval = create_dataset("./datasets/MNIST_Data/test", 32) # Initialize network network = LeNet5(10) # Define Loss and Optimizer net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean") net_opt = nn.Momentum(network.trainable_params(), learning_rate=0.01, momentum=0.9) # amp_leval=O2 in GPU, amp_leval=O3 in Ascend, O0 is without mixed precision model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()}, amp_level="O2") # Run training model.train(epoch=1, callbacks=[LossMonitor()], train_dataset=ds_train) # Run training acc = model.eval(ds_eval, dataset_sink_mode=False) print("====Accuracy====:", acc)
warmup_epochs=cfg.warmup_epochs, total_epochs=cfg.num_epochs, steps_per_epoch=ds_train.get_dataset_size(), lr_adjust_epoch=cfg.lr_adjust_epoch)) else: lr = cfg.learning_rate opt = nn.Momentum(network.trainable_params(), lr, cfg.momentum) loss_cb = LossMonitor() model = Model(network, loss, opt, {'acc': Accuracy()}) print("============== Starting Training ==============") config_ck = CheckpointConfig( save_checkpoint_steps=cfg.save_checkpoint_steps, keep_checkpoint_max=cfg.keep_checkpoint_max) ckpoint_cb = ModelCheckpoint(prefix="lstm", directory=args.ckpt_path, config=config_ck) time_cb = TimeMonitor(data_size=ds_train.get_dataset_size()) if args.device_target == "CPU": model.train(cfg.num_epochs, ds_train, callbacks=[time_cb, ckpoint_cb, loss_cb], dataset_sink_mode=False) else: model.train(cfg.num_epochs, ds_train, callbacks=[time_cb, ckpoint_cb, loss_cb]) print("============== Training Success ==============")
def test_train():
    '''finetune function'''
    target = args_opt.device_target
    if target == "Ascend":
        devid = int(os.getenv('DEVICE_ID'))
        context.set_context(mode=context.GRAPH_MODE,
                            device_target="Ascend", device_id=devid)

    # Build the poetry corpus, tokenizer and the vocabulary rows to retain.
    poetry, tokenizer, keep_words = create_tokenizer()
    print(len(keep_words))

    dataset = create_poetry_dataset(bert_net_cfg.batch_size, poetry, tokenizer)

    num_tokens = 3191
    poetrymodel = BertPoetryModel(bert_net_cfg, True, num_tokens, dropout_prob=0.1)
    netwithloss = BertPoetry(poetrymodel, bert_net_cfg, True, dropout_prob=0.1)
    callback = LossCallBack(poetrymodel)

    # optimizer: warmup followed by polynomial decay over all training steps
    steps_per_epoch = dataset.get_dataset_size()
    print("============ steps_per_epoch is {}".format(steps_per_epoch))
    lr_schedule = BertLearningRate(
        learning_rate=cfg.AdamWeightDecay.learning_rate,
        end_learning_rate=cfg.AdamWeightDecay.end_learning_rate,
        warmup_steps=1000,
        decay_steps=cfg.epoch_num * steps_per_epoch,
        power=cfg.AdamWeightDecay.power)
    optimizer = AdamWeightDecay(netwithloss.trainable_params(), lr_schedule)

    # load checkpoint into network
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch,
                                   keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix=cfg.ckpt_prefix,
                                 directory=cfg.ckpt_dir,
                                 config=ckpt_config)
    param_dict = load_checkpoint(cfg.pre_training_ckpt)

    # Copy every parameter unchanged except the embedding table, from which
    # only the rows listed in `keep_words` are kept.
    new_dict = {}
    for key in param_dict:
        if "bert_embedding_lookup" not in key:
            new_dict[key] = param_dict[key]
        else:
            value = param_dict[key]
            np_value = value.data.asnumpy()
            np_value = np_value[keep_words]
            tensor_value = Tensor(np_value, mstype.float32)
            parameter_value = Parameter(tensor_value, name=key)
            new_dict[key] = parameter_value
    load_param_into_net(netwithloss, new_dict)

    # Dynamic loss scaling for mixed-precision training stability.
    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32,
                                             scale_factor=2,
                                             scale_window=1000)
    netwithgrads = BertPoetryCell(netwithloss, optimizer=optimizer,
                                  scale_update_cell=update_cell)

    model = Model(netwithgrads)
    model.train(cfg.epoch_num, dataset,
                callbacks=[callback, ckpoint_cb],
                dataset_sink_mode=True)
args = parser.parse_args()

context.set_context(mode=context.GRAPH_MODE, save_graphs=False,
                    device_target='Ascend')

ds_train = create_dataset(args.dataset_path, cfg.batch_size)

# Wrap the Seq2Seq network with its loss cell; the Model then only needs
# the optimizer (no separate loss_fn).
network = Seq2Seq(cfg)
network = WithLossCell(network, cfg)
optimizer = nn.Adam(network.trainable_params(),
                    learning_rate=cfg.learning_rate, beta1=0.9, beta2=0.98)
model = Model(network, optimizer=optimizer)

loss_cb = LossMonitor()
config_ck = CheckpointConfig(save_checkpoint_steps=cfg.save_checkpoint_steps,
                             keep_checkpoint_max=cfg.keep_checkpoint_max)
ckpoint_cb = ModelCheckpoint(prefix="gru", directory=args.ckpt_save_path,
                             config=config_ck)
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
callbacks = [time_cb, ckpoint_cb, loss_cb]
model.train(cfg.num_epochs, ds_train, callbacks=callbacks,
            dataset_sink_mode=False)
def train_and_eval(config):
    """ test_train_eval """
    set_seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs

    # Map the configured dataset format onto the loader's enum.
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    parameter_server = bool(config.parameter_server)
    if cache_enable:
        config.full_batch = True
    print("epochs is {}".format(epochs))

    # full_batch feeds the whole global batch to every device; otherwise each
    # rank reads its own shard.
    if config.full_batch:
        context.set_auto_parallel_context(full_batch=True)
        ds.config.set_seed(1)
        ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                                  batch_size=batch_size * get_group_size(),
                                  data_type=dataset_type)
        ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                                 batch_size=batch_size * get_group_size(),
                                 data_type=dataset_type)
    else:
        ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                                  batch_size=batch_size, rank_id=get_rank(),
                                  rank_size=get_group_size(),
                                  data_type=dataset_type)
        ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                                 batch_size=batch_size, rank_id=get_rank(),
                                 rank_size=get_group_size(),
                                 data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    if cache_enable:
        # Per-rank strategy checkpoint path.
        config.stra_ckpt = os.path.join(
            config.stra_ckpt + "-{}".format(get_rank()), "strategy.ckpt")
        context.set_auto_parallel_context(
            strategy_ckpt_save_file=config.stra_ckpt)

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config=config)

    # Only worker roles save real checkpoints; others get a minimal config.
    if _is_role_worker():
        if cache_enable:
            ckptconfig = CheckpointConfig(
                save_checkpoint_steps=ds_train.get_dataset_size() * epochs,
                keep_checkpoint_max=1, integrated_save=False)
        else:
            ckptconfig = CheckpointConfig(
                save_checkpoint_steps=ds_train.get_dataset_size(),
                keep_checkpoint_max=5)
    else:
        ckptconfig = CheckpointConfig(save_checkpoint_steps=1,
                                      keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(
        prefix='widedeep_train',
        directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/',
        config=ckptconfig)

    callback_list = [TimeMonitor(ds_train.get_dataset_size()),
                     eval_callback, callback]
    # Only rank 0 writes checkpoints.
    if get_rank() == 0:
        callback_list.append(ckpoint_cb)
    model.train(epochs, ds_train, callbacks=callback_list,
                dataset_sink_mode=bool(parameter_server and cache_enable))
# Flatten the charge targets into (N, 129) float32 rows.
charge = charge.reshape((-1, 129)).astype(np.float32)

# define the model
net = Mdnn()

# Learning-rate schedule parameters.
lr = 0.0001
decay_rate = 0.8
epoch_size = 1000
batch_size = 500
total_step = epoch_size * batch_size
step_per_epoch = 100
decay_epoch = epoch_size
# NOTE(review): total_step is epoch_size * batch_size here rather than
# epoch_size * step_per_epoch — verify this is intentional.
lr_rate = nn.exponential_decay_lr(lr, decay_rate, total_step,
                                  step_per_epoch, decay_epoch)

net_loss = nn.loss.MSELoss(reduction='mean')
net_opt = nn.Adam(net.trainable_params(), learning_rate=lr_rate)
model = Model(net, net_loss, net_opt)

ds_train = create_dataset(radial_angular, charge, batchsize=batch_size)
model_params = net.trainable_params()
net.set_train()
init_weight(net)

# config files
path = './params/'
config_ck = CheckpointConfig(save_checkpoint_steps=100, keep_checkpoint_max=10)
ckpoint_cb = ModelCheckpoint(prefix="mdnn_best", directory=path,
                             config=config_ck)
steps_loss = {"step": [], "loss_value": []}
step_loss_acc_info = StepLossAccInfo(model, ds_train, steps_loss)

# train the model
model.train(epoch_size, ds_train, callbacks=[ckpoint_cb, LossMonitor(100)])
dataset_size = train_dataset.get_dataset_size()
time_cb = TimeMonitor(data_size=dataset_size)
callback = [time_cb, LossCallBack()]
if config.enable_save_ckpt:
    config_ck = CheckpointConfig(
        save_checkpoint_steps=config.save_checkpoint_steps,
        keep_checkpoint_max=config.save_checkpoint_num)
    ckpoint_cb = ModelCheckpoint(prefix='checkpoint_deeplabv3',
                                 config=config_ck)
    callback.append(ckpoint_cb)

net = deeplabv3_resnet50(
    config.seg_num_classes,
    [config.batch_size, 3, args_opt.crop_size, args_opt.crop_size],
    infer_scale_sizes=config.eval_scales,
    atrous_rates=config.atrous_rates,
    decoder_output_stride=config.decoder_output_stride,
    output_stride=config.output_stride,
    fine_tune_batch_norm=config.fine_tune_batch_norm,
    image_pyramid=config.image_pyramid)
net.set_train()
model_fine_tune(args_opt, net, 'layer')
loss = OhemLoss(config.seg_num_classes, config.ignore_label)

# Only parameters whose names contain none of beta/gamma/depth/bias are
# handed to the optimizer; everything else stays frozen.
opt = Momentum(filter(lambda x: 'beta' not in x.name
                      and 'gamma' not in x.name
                      and 'depth' not in x.name
                      and 'bias' not in x.name,
                      net.trainable_params()),
               learning_rate=config.learning_rate,
               momentum=config.momentum,
               weight_decay=config.weight_decay)
model = Model(net, loss, opt)
model.train(config.epoch_size, train_dataset, callback)
def train_eval(config):
    """ test evaluate """
    data_path = config.data_path + config.dataset_type
    ckpt_path = config.ckpt_path
    epochs = config.epochs
    batch_size = config.batch_size
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5

    ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                              batch_size=batch_size, data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=batch_size, data_type=dataset_type)
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()

    # Training phase: checkpoint once per epoch, keep only the latest.
    train_model = Model(train_net)
    train_callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size(),
        keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path,
                                 config=ckptconfig)
    train_model.train(epochs, ds_train,
                      callbacks=[TimeMonitor(ds_train.get_dataset_size()),
                                 train_callback, ckpoint_cb])

    # data download
    print('Download data from modelarts server to obs.')
    mox.file.copy_parallel(src_url=config.ckpt_path, dst_url=config.train_url)

    # Evaluation phase: reload the saved weights into the eval network.
    param_dict = load_checkpoint(find_ckpt(ckpt_path))
    load_param_into_net(eval_net, param_dict)

    auc_metric = AUCMetric()
    eval_model = Model(train_net, eval_network=eval_net,
                       metrics={"auc": auc_metric})
    eval_callback = EvalCallBack(eval_model, ds_eval, auc_metric, config)
    eval_model.eval(ds_eval, callbacks=eval_callback)
self.fc3 = nn.Dense(84, 10, weight_init='TruncatedNormal', bias_init='TruncatedNormal') def construct(self, x): x = self.conv1(x) x = self.relu(x) x = self.maxpool(x) x = self.conv2(x) x = self.relu(x) x = self.maxpool(x) x = self.reshape(x, (32, 400)) x = self.fc1(x) x = self.relu(x) x = self.fc2(x) x = self.relu(x) x = self.fc3(x) return x if __name__ == '__main__': import numpy as np context.set_context(mode=context.GRAPH_MODE, save_graphs=True) dataset = create_dataset('/fzl/mnist/train') net = LeNet() data = Tensor(np.ones((32, 1, 32, 32)), mindspore.float32) y = net(data) net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True, reduction='mean') lr = 0.01 momentum = 0.9 opt = nn.Momentum(net.trainable_params(), lr, momentum) mod = Model(net, loss_fn=net_loss, optimizer=opt) mod.train(10, dataset, callbacks=[LossMonitor(),], dataset_sink_mode=False)
def train_net(args_opt, cross_valid_ind=1, epochs=400, batch_size=16,
              lr=0.0001, cfg=None):
    """Train a UNet-family model, optionally distributed and with online eval."""
    rank = 0
    group_size = 1
    data_dir = args_opt.data_url
    run_distribute = args_opt.run_distribute
    if run_distribute:
        init()
        group_size = get_group_size()
        rank = get_rank()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=group_size,
                                          gradients_mean=False)

    # Select the network variant from the config.
    need_slice = False
    if cfg['model'] == 'unet_medical':
        net = UNetMedical(n_channels=cfg['num_channels'],
                          n_classes=cfg['num_classes'])
    elif cfg['model'] == 'unet_nested':
        net = NestedUNet(in_channel=cfg['num_channels'],
                         n_class=cfg['num_classes'],
                         use_deconv=cfg['use_deconv'],
                         use_bn=cfg['use_bn'],
                         use_ds=cfg['use_ds'])
        need_slice = cfg['use_ds']
    elif cfg['model'] == 'unet_simple':
        net = UNet(in_channel=cfg['num_channels'], n_class=cfg['num_classes'])
    else:
        raise ValueError("Unsupported model: {}".format(cfg['model']))

    if cfg['resume']:
        param_dict = load_checkpoint(cfg['resume_ckpt'])
        if cfg['transfer_training']:
            filter_checkpoint_parameter_by_list(param_dict,
                                                cfg['filter_weight'])
        load_param_into_net(net, param_dict)

    # Deep supervision needs the multi-output loss.
    if 'use_ds' in cfg and cfg['use_ds']:
        criterion = MultiCrossEntropyWithLogits()
    else:
        criterion = CrossEntropyWithLogits()

    if 'dataset' in cfg and cfg['dataset'] == "Cell_nuclei":
        repeat = cfg['repeat']
        dataset_sink_mode = True
        per_print_times = 0
        train_dataset = create_cell_nuclei_dataset(
            data_dir, cfg['img_size'], repeat, batch_size,
            is_train=True, augment=True, split=0.8,
            rank=rank, group_size=group_size)
        valid_dataset = create_cell_nuclei_dataset(
            data_dir, cfg['img_size'], 1, 1, is_train=False,
            eval_resize=cfg["eval_resize"], split=0.8,
            python_multiprocessing=False)
    else:
        repeat = cfg['repeat']
        dataset_sink_mode = False
        per_print_times = 1
        train_dataset, valid_dataset = create_dataset(
            data_dir, repeat, batch_size, True, cross_valid_ind,
            run_distribute, cfg["crop"], cfg['img_size'])
    train_data_size = train_dataset.get_dataset_size()
    print("dataset length is:", train_data_size)

    ckpt_config = CheckpointConfig(
        save_checkpoint_steps=train_data_size,
        keep_checkpoint_max=cfg['keep_checkpoint_max'])
    ckpoint_cb = ModelCheckpoint(prefix='ckpt_{}_adam'.format(cfg['model']),
                                 directory='./ckpt_{}/'.format(device_id),
                                 config=ckpt_config)

    optimizer = nn.Adam(params=net.trainable_params(), learning_rate=lr,
                        weight_decay=cfg['weight_decay'],
                        loss_scale=cfg['loss_scale'])
    loss_scale_manager = mindspore.train.loss_scale_manager.FixedLossScaleManager(
        cfg['FixedLossScaleManager'], False)
    model = Model(net, loss_fn=criterion,
                  loss_scale_manager=loss_scale_manager,
                  optimizer=optimizer, amp_level="O3")

    print("============== Starting Training ==============")
    callbacks = [StepLossTimeMonitor(batch_size=batch_size,
                                     per_print_times=per_print_times),
                 ckpoint_cb]
    if args_opt.run_eval:
        eval_model = Model(UnetEval(net, need_slice=need_slice),
                           loss_fn=TempLoss(),
                           metrics={"dice_coeff": dice_coeff(cfg_unet, False)})
        eval_param_dict = {"model": eval_model,
                           "dataset": valid_dataset,
                           "metrics_name": args_opt.eval_metrics}
        # `besk_ckpt_name` is the callee's actual (misspelled) keyword.
        eval_cb = EvalCallBack(apply_eval, eval_param_dict,
                               interval=args_opt.eval_interval,
                               eval_start_epoch=args_opt.eval_start_epoch,
                               save_best_ckpt=True,
                               ckpt_directory='./ckpt_{}/'.format(device_id),
                               besk_ckpt_name="best.ckpt",
                               metrics_name=args_opt.eval_metrics)
        callbacks.append(eval_cb)
    # The dataset already repeats `repeat` times, so divide the epoch count.
    model.train(int(epochs / repeat), train_dataset, callbacks=callbacks,
                dataset_sink_mode=dataset_sink_mode)
    print("============== End Training ==============")
def train_and_eval(config):
    """ test_train_eval """
    np.random.seed(1000)
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs

    # Map the configured dataset format onto the loader's enum.
    if config.dataset_type == "tfrecord":
        dataset_type = DataType.TFRECORD
    elif config.dataset_type == "mindrecord":
        dataset_type = DataType.MINDRECORD
    else:
        dataset_type = DataType.H5
    parameter_server = bool(config.parameter_server)
    print("epochs is {}".format(epochs))

    ds_train = create_dataset(data_path, train_mode=True, epochs=1,
                              batch_size=batch_size, rank_id=get_rank(),
                              rank_size=get_group_size(),
                              data_type=dataset_type)
    ds_eval = create_dataset(data_path, train_mode=False, epochs=1,
                             batch_size=batch_size, rank_id=get_rank(),
                             rank_size=get_group_size(),
                             data_type=dataset_type)
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size(),
        keep_checkpoint_max=5)

    # On GPU each rank gets its own checkpoint prefix to avoid filename clashes.
    if config.device_target == "Ascend":
        ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                     directory=config.ckpt_path,
                                     config=ckptconfig)
    elif config.device_target == "GPU":
        ckpoint_cb = ModelCheckpoint(prefix='widedeep_train_' + str(get_rank()),
                                     directory=config.ckpt_path,
                                     config=ckptconfig)
    else:
        # Bug fix: previously an unsupported target left `ckpoint_cb` unbound
        # and crashed with NameError at model.train; fail fast instead.
        raise ValueError(
            "Unsupported device_target: {}".format(config.device_target))

    model.train(epochs, ds_train,
                callbacks=[TimeMonitor(ds_train.get_dataset_size()),
                           eval_callback, callback, ckpoint_cb],
                dataset_sink_mode=(not parameter_server))
input_data = input_data.batch(batch_size) input_data = input_data.repeat(repeat_size) return input_data class LinearNet(nn.Cell): def __init__(self): super(LinearNet, self).__init__() self.fc = nn.Dense(1, 1, Normal(0.02), Normal(0.02)) def construct(self, x): x = self.fc(x) return x if __name__ == "__main__": data_number = 1600 batch_number = 16 repeat_number = 1 lr = 0.005 momentum = 0.9 net = LinearNet() net_loss = nn.loss.MSELoss() opt = nn.Momentum(net.trainable_params(), lr, momentum) model = Model(net, net_loss, opt) ds_train = create_dataset(data_number, batch_size=batch_number, repeat_size=repeat_number) model.train(1, ds_train, callbacks=LossMonitor(), dataset_sink_mode=False) for param in net.trainable_params(): print(param, param.asnumpy())
dataset_size = train_dataset.get_dataset_size()
time_cb = TimeMonitor(data_size=dataset_size)
callback = [time_cb, LossCallBack()]
# Note: the flag is a string here ("true"/"false"), not a bool.
if args_opt.enable_save_ckpt == "true":
    config_ck = CheckpointConfig(
        save_checkpoint_steps=args_opt.save_checkpoint_steps,
        keep_checkpoint_max=args_opt.save_checkpoint_num)
    ckpoint_cb = ModelCheckpoint(prefix='checkpoint_deeplabv3',
                                 config=config_ck)
    callback.append(ckpoint_cb)

net = deeplabv3_resnet50(
    config.seg_num_classes,
    [args_opt.batch_size, 3, args_opt.crop_size, args_opt.crop_size],
    infer_scale_sizes=config.eval_scales,
    atrous_rates=config.atrous_rates,
    decoder_output_stride=config.decoder_output_stride,
    output_stride=config.output_stride,
    fine_tune_batch_norm=config.fine_tune_batch_norm,
    image_pyramid=config.image_pyramid)
net.set_train()
model_fine_tune(args_opt, net, 'layer')
loss = OhemLoss(config.seg_num_classes, config.ignore_label)

# Only parameters whose names contain none of beta/gamma/depth/bias are
# handed to the optimizer; everything else stays frozen.
opt = Momentum(filter(lambda x: 'beta' not in x.name
                      and 'gamma' not in x.name
                      and 'depth' not in x.name
                      and 'bias' not in x.name,
                      net.trainable_params()),
               learning_rate=config.learning_rate,
               momentum=config.momentum,
               weight_decay=config.weight_decay)
model = Model(net, loss, opt)
model.train(args_opt.epoch_size, train_dataset, callback)
def test_train(): """train entry method""" if args.is_distributed: if args.device_target == "Ascend": init() context.set_context(device_id=args.device_id) elif args.device_target == "GPU": init() args.rank = get_rank() args.group_size = get_group_size() device_num = args.group_size context.reset_auto_parallel_context() context.set_auto_parallel_context( device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL, parameter_broadcast=True, gradients_mean=True) else: context.set_context(device_id=args.device_id) context.set_context(mode=context.GRAPH_MODE, device_target=args.device_target) if not os.path.exists(args.output_path): os.makedirs(args.output_path) layers = cfg.layers num_factors = cfg.num_factors epochs = args.train_epochs ds_train, num_train_users, num_train_items = create_dataset( test_train=True, data_dir=args.data_path, dataset=args.dataset, train_epochs=1, batch_size=args.batch_size, num_neg=args.num_neg) print("ds_train.size: {}".format(ds_train.get_dataset_size())) ncf_net = NCFModel(num_users=num_train_users, num_items=num_train_items, num_factors=num_factors, model_layers=layers, mf_regularization=0, mlp_reg_layers=[0.0, 0.0, 0.0, 0.0], mf_dim=16) loss_net = NetWithLossClass(ncf_net) train_net = TrainStepWrap(loss_net, ds_train.get_dataset_size() * (epochs + 1)) train_net.set_train() model = Model(train_net) callback = LossMonitor(per_print_times=ds_train.get_dataset_size()) ckpt_config = CheckpointConfig( save_checkpoint_steps=(4970845 + args.batch_size - 1) // (args.batch_size), keep_checkpoint_max=100) ckpoint_cb = ModelCheckpoint(prefix='NCF', directory=args.checkpoint_path, config=ckpt_config) model.train(epochs, ds_train, callbacks=[ TimeMonitor(ds_train.get_dataset_size()), callback, ckpoint_cb ], dataset_sink_mode=True)
def inception_v4_train():
    """
    Train Inceptionv4 in data parallelism
    """
    print('epoch_size: {} batch_size: {} class_num {}'.format(
        config.epoch_size, config.batch_size, config.num_classes))

    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    context.set_context(device_id=args.device_id)
    context.set_context(enable_graph_kernel=False)
    rank = 0
    if device_num > 1:
        init(backend_name='hccl')
        rank = get_rank()
        context.set_auto_parallel_context(
            device_num=device_num,
            parallel_mode=ParallelMode.DATA_PARALLEL,
            gradients_mean=True,
            all_reduce_fusion_config=[200, 400])

    # create dataset
    train_dataset = create_dataset(dataset_path=args.dataset_path,
                                   do_train=True, repeat_num=1,
                                   batch_size=config.batch_size)
    train_step_size = train_dataset.get_dataset_size()

    # create model
    net = Inceptionv4(classes=config.num_classes)
    # loss
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    # learning rate: cosine schedule over all training steps
    lr = Tensor(generate_cosine_lr(steps_per_epoch=train_step_size,
                                   total_epochs=config.epoch_size))

    # Parameters named beta/gamma/bias are excluded from weight decay.
    decayed_params = []
    no_decayed_params = []
    for param in net.trainable_params():
        if 'beta' not in param.name and 'gamma' not in param.name \
                and 'bias' not in param.name:
            decayed_params.append(param)
        else:
            no_decayed_params.append(param)

    # Re-initialize the decayed weights with Xavier-uniform values.
    for param in net.trainable_params():
        if 'beta' not in param.name and 'gamma' not in param.name \
                and 'bias' not in param.name:
            param.set_data(initializer(XavierUniform(), param.data.shape,
                                       param.data.dtype))

    group_params = [{'params': decayed_params,
                     'weight_decay': config.weight_decay},
                    {'params': no_decayed_params},
                    {'order_params': net.trainable_params()}]

    opt = RMSProp(group_params, lr, decay=config.decay,
                  epsilon=config.epsilon,
                  weight_decay=config.weight_decay,
                  momentum=config.momentum,
                  loss_scale=config.loss_scale)

    if args.device_id == 0:
        print(lr)
        print(train_step_size)
    if args.resume:
        ckpt = load_checkpoint(args.resume)
        load_param_into_net(net, ckpt)

    loss_scale_manager = FixedLossScaleManager(config.loss_scale,
                                               drop_overflow_update=False)

    model = Model(net, loss_fn=loss, optimizer=opt,
                  metrics={'acc', 'top_1_accuracy', 'top_5_accuracy'},
                  loss_scale_manager=loss_scale_manager,
                  amp_level=config.amp_level)

    # define callbacks
    performance_cb = TimeMonitor(data_size=train_step_size)
    loss_cb = LossMonitor(per_print_times=train_step_size)
    ckp_save_step = config.save_checkpoint_epochs * train_step_size
    config_ck = CheckpointConfig(
        save_checkpoint_steps=ckp_save_step,
        keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix=f"inceptionV4-train-rank{rank}",
                                 directory='ckpts_rank_' + str(rank),
                                 config=config_ck)
    callbacks = [performance_cb, loss_cb]
    # When saving on master only, device 0 alone writes checkpoints.
    if device_num > 1 and config.is_save_on_master:
        if args.device_id == 0:
            callbacks.append(ckpoint_cb)
    else:
        callbacks.append(ckpoint_cb)

    # train model
    model.train(config.epoch_size, train_dataset, callbacks=callbacks,
                dataset_sink_mode=True)
def train_and_eval(config):
    """ test_train_eval """
    data_path = config.data_path
    batch_size = config.batch_size
    epochs = config.epochs
    print("epochs is {}".format(epochs))

    # full_batch feeds the whole global batch to every device; otherwise each
    # rank reads its own shard. The eval dataset is built with one extra
    # epoch of data.
    if config.full_batch:
        context.set_auto_parallel_context(full_batch=True)
        de.config.set_seed(1)
        ds_train = create_dataset(data_path, train_mode=True, epochs=epochs,
                                  batch_size=batch_size * get_group_size())
        ds_eval = create_dataset(data_path, train_mode=False,
                                 epochs=epochs + 1,
                                 batch_size=batch_size * get_group_size())
    else:
        ds_train = create_dataset(data_path, train_mode=True, epochs=epochs,
                                  batch_size=batch_size, rank_id=get_rank(),
                                  rank_size=get_group_size())
        ds_eval = create_dataset(data_path, train_mode=False,
                                 epochs=epochs + 1,
                                 batch_size=batch_size, rank_id=get_rank(),
                                 rank_size=get_group_size())
    print("ds_train.size: {}".format(ds_train.get_dataset_size()))
    print("ds_eval.size: {}".format(ds_eval.get_dataset_size()))

    net_builder = ModelBuilder()
    train_net, eval_net = net_builder.get_net(config)
    train_net.set_train()
    auc_metric = AUCMetric()

    model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric})

    eval_callback = EvalCallBack(model, ds_eval, auc_metric, config)
    callback = LossCallBack(config=config)
    ckptconfig = CheckpointConfig(
        save_checkpoint_steps=ds_train.get_dataset_size(),
        keep_checkpoint_max=5)
    ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
                                 directory=config.ckpt_path,
                                 config=ckptconfig)
    context.set_auto_parallel_context(
        strategy_ckpt_save_file="./strategy_train.ckpt")
    model.train(epochs, ds_train,
                callbacks=[TimeMonitor(ds_train.get_dataset_size()),
                           eval_callback, callback, ckpoint_cb])
train_dataset = create_dataset("./datasets/MNIST_Data/train")
eval_dataset = create_dataset("./datasets/MNIST_Data/test")
print("========== The Training Model is Defined. ==========")

# train the model and export the encrypted CheckPoint file through Callback
config_ck = CheckpointConfig(save_checkpoint_steps=1875,
                             keep_checkpoint_max=10,
                             enc_key=b'0123456789ABCDEF',
                             enc_mode='AES-GCM')
ckpoint_cb = ModelCheckpoint(prefix='lenet_enc', directory=None,
                             config=config_ck)
model.train(10, train_dataset, dataset_sink_mode=False,
            callbacks=[ckpoint_cb, LossMonitor(1875)])
acc = model.eval(eval_dataset, dataset_sink_mode=False)
print("Accuracy: {}".format(acc["Accuracy"]))

# export the encrypted CheckPoint file through save_checkpoint
save_checkpoint(network, 'lenet_enc.ckpt',
                enc_key=b'0123456789ABCDEF', enc_mode='AES-GCM')

# load encrypted CheckPoint file and eval
param_dict = load_checkpoint('lenet_enc-10_1875.ckpt',
                             dec_key=b'0123456789ABCDEF',
                             dec_mode='AES-GCM')
load_param_into_net(network, param_dict)
"e2e_dump_settings": { "enable": True, "trans_flag": False } } with open("./data_dump.json", "w", encoding="GBK") as f: json.dump(data_dump, f) os.environ['MINDSPORE_DUMP_CONFIG'] = abspath + "/data_dump.json" def set_log_info(): os.environ['GLOG_v'] = '1' os.environ['GLOG_logtostderr'] = '1' os.environ['logger_maxBytes'] = '5242880' os.environ['GLOG_log_dir'] = 'D:/' if os.name == "nt" else '/var/log/mindspore' os.environ['logger_backupCount'] = '10' print(logger.get_log_config()) if __name__ == "__main__": set_dump_info() set_log_info() context.set_context(mode=context.GRAPH_MODE) train_dataset = create_train_dataset() eval_dataset = create_eval_dataset() net = Net() net_opt = Momentum(net.trainable_params(), 0.01, 0.9) net_loss = SoftmaxCrossEntropyWithLogits(reduction='mean') model = Model(network=net, loss_fn=net_loss, optimizer=net_opt, metrics={'Accuracy': nn.Accuracy()}) model.train(epoch=100, train_dataset=train_dataset, callbacks=[LossMonitor(), StopAtTime(3), SaveCallback(model, eval_dataset)])
net_loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
repeat_size = 1
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

if args.mode == 'train':  # train
    ds_train = create_dataset(os.path.join(args.data_path, args.mode),
                              batch_size=cfg.batch_size,
                              repeat_size=repeat_size)
    print("============== Starting Training ==============")
    config_ck = CheckpointConfig(
        save_checkpoint_steps=cfg.save_checkpoint_steps,
        keep_checkpoint_max=cfg.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                                 config=config_ck,
                                 directory=args.ckpt_path)
    model.train(cfg['epoch_size'], ds_train,
                callbacks=[ckpoint_cb, LossMonitor()],
                dataset_sink_mode=args.dataset_sink_mode)
elif args.mode == 'test':  # test
    print("============== Starting Testing ==============")
    # Restore the trained weights before evaluating.
    param_dict = load_checkpoint(args.ckpt_path)
    load_param_into_net(network, param_dict)
    ds_eval = create_dataset(os.path.join(args.data_path, "test"), 32, 1)
    acc = model.eval(ds_eval, dataset_sink_mode=args.dataset_sink_mode)
    print("============== Accuracy:{} ==============".format(acc))
else:
    raise RuntimeError(
        'mode should be train or test, rather than {}'.format(args.mode))
init()
epoch_size = args_opt.epoch_size
net = resnet50(args_opt.batch_size, args_opt.num_classes)
ls = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
# Optimize only the parameters that require gradients.
opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
               0.01, 0.9)

model = Model(net, loss_fn=ls, optimizer=opt, metrics={'acc'})

# as for train, users could use model.train
if args_opt.do_train:
    dataset = create_dataset()
    batch_num = dataset.get_dataset_size()
    config_ck = CheckpointConfig(save_checkpoint_steps=batch_num,
                                 keep_checkpoint_max=35)
    ckpoint_cb = ModelCheckpoint(prefix="train_resnet_cifar10",
                                 directory="./", config=config_ck)
    loss_cb = LossMonitor()
    model.train(epoch_size, dataset, callbacks=[ckpoint_cb, loss_cb])

# as for evaluation, users could use model.eval
if args_opt.do_eval:
    if args_opt.checkpoint_path:
        param_dict = load_checkpoint(args_opt.checkpoint_path)
        load_param_into_net(net, param_dict)
    eval_dataset = create_dataset(training=False)
    res = model.eval(eval_dataset)
    print("result: ", res)
context.set_context(mode=context.PYNATIVE_MODE,
                    device_target="GPU",
                    enable_mem_reuse=False)
# Disabled options kept for reference:
# save_graphs=True, save_graphs_path="./graph/")
# save_ms_model=True)

network = LeNet5(cfg.num_classes)
net_loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True,
                                            reduction="mean")
net_opt = nn.Momentum(network.trainable_params(), cfg.lr, cfg.momentum)
# config_ck = CheckpointConfig(save_checkpoint_steps=1,
#                              keep_checkpoint_max=cfg.keep_checkpoint_max)
# ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet", directory="./ckpt", config=config_ck)
model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

# summary_writer = SummaryRecord(log_dir='./summary13', network=network)
# summary_callback = SummaryStep(summary_writer, flush_step=1)
# train_callback = TrainLineage(summary_writer)
saver_callback = DataSaverCallback()

ds_train = create_dataset(os.path.join(args.data_path, "train"),
                          cfg.batch_size, cfg.epoch_size)
print("============== Starting Training ==============")
model.train(cfg['epoch_size'], ds_train,
            # callbacks=[LossMonitor()],
            callbacks=[saver_callback],
            dataset_sink_mode=args.dataset_sink_mode)
# summary_writer.close()
# NOTE(review): the loop below is the tail of a fine-tuning helper whose
# `def` line lies before this chunk (it is invoked later as
# `model_fine_tune(args_opt, net, 'layer')`). It freezes every parameter
# whose name contains `fix_weight_layer` — confirm the full signature
# against the original file.
for para in train_net.trainable_params():
    if fix_weight_layer in para.name:
        para.requires_grad = False


if __name__ == "__main__":
    # Smoke test: fine-tune DeepLabV3-ResNet50 for 3 epochs and assert the
    # averaged loss and the recorded time stay under fixed budgets.
    start_time = time.time()
    epoch_size = 3
    args_opt.base_size = config.crop_size
    args_opt.crop_size = config.crop_size
    train_dataset = create_dataset(args_opt, args_opt.data_url, 1, config.batch_size,
                                   usage="train", shuffle=False)
    dataset_size = train_dataset.get_dataset_size()
    # LossCallBack exposes `.loss` and `.time`, which are read below.
    callback = LossCallBack(dataset_size)
    net = deeplabv3_resnet50(config.seg_num_classes,
                             [config.batch_size, 3, args_opt.crop_size, args_opt.crop_size],
                             infer_scale_sizes=config.eval_scales,
                             atrous_rates=config.atrous_rates,
                             decoder_output_stride=config.decoder_output_stride,
                             output_stride=config.output_stride,
                             fine_tune_batch_norm=config.fine_tune_batch_norm,
                             image_pyramid=config.image_pyramid)
    net.set_train()
    # Freeze all parameters whose names contain 'layer' (see helper above).
    model_fine_tune(args_opt, net, 'layer')
    loss = OhemLoss(config.seg_num_classes, config.ignore_label)
    # Parameters whose names contain beta/gamma/depth/bias are filtered out,
    # i.e. they are excluded from the optimizer entirely and never updated.
    opt = Momentum(filter(lambda x: 'beta' not in x.name and 'gamma' not in x.name
                          and 'depth' not in x.name and 'bias' not in x.name,
                          net.trainable_params()),
                   learning_rate=config.learning_rate,
                   momentum=config.momentum,
                   weight_decay=config.weight_decay)
    model = Model(net, loss, opt)
    model.train(epoch_size, train_dataset, callback)
    print(time.time() - start_time)
    print("expect loss: ", callback.loss / 3)
    print("expect time: ", callback.time)
    expect_loss = 0.5
    expect_time = 35
    # Regression thresholds: mean loss over the 3 epochs and wall time.
    assert callback.loss.asnumpy() / 3 <= expect_loss
    assert callback.time <= expect_time
""" import mindspore.nn as nn from mindspore.nn import Momentum, SoftmaxCrossEntropyWithLogits from mindspore import Model, context from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor from src.dataset import create_train_dataset, create_eval_dataset from src.net import Net if __name__ == "__main__": context.set_context(mode=context.GRAPH_MODE) ds_train = create_train_dataset() ds_eval = create_eval_dataset() net = Net() net_opt = Momentum(net.trainable_params(), 0.01, 0.9) net_loss = SoftmaxCrossEntropyWithLogits(reduction='mean') metrics = { 'Accuracy': nn.Accuracy(), 'Loss': nn.Loss(), 'Precision': nn.Precision(), 'Recall': nn.Recall(), 'F1_score': nn.F1() } config_ck = CheckpointConfig(save_checkpoint_steps=1000, keep_checkpoint_max=10) ckpoint = ModelCheckpoint(prefix="CKPT", config=config_ck) model = Model(network=net, loss_fn=net_loss, optimizer=net_opt, metrics=metrics) model.train(epoch=2, train_dataset=ds_train, callbacks=[ckpoint, LossMonitor()]) result = model.eval(ds_eval) print(result)
# Configure the run: CPU graph mode, MNIST splits, evaluation every 2 epochs.
context.set_context(mode=context.GRAPH_MODE, device_target="CPU")

train_data_path = "./datasets/MNIST_Data/train"
eval_data_path = "./datasets/MNIST_Data/test"
ckpt_save_dir = "./lenet_ckpt"
epoch_size = 10
eval_per_epoch = 2
repeat = 1

train_data = create_dataset(train_data_path, repeat_size=repeat)
eval_data = create_dataset(eval_data_path, repeat_size=repeat)

# Network, loss and optimizer.
network = LeNet5()
net_loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
net_opt = nn.Momentum(network.trainable_params(), learning_rate=0.01, momentum=0.9)

# Checkpoint once per evaluation interval (1875 presumably matches the
# number of steps per epoch — confirm against the dataset batch size).
config_ck = CheckpointConfig(save_checkpoint_steps=eval_per_epoch * 1875,
                             keep_checkpoint_max=15)
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                             directory=ckpt_save_dir,
                             config=config_ck)

model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

# Accuracy history filled in by the evaluation callback.
epoch_per_eval = {"epoch": [], "acc": []}
eval_cb = EvalCallBack(model, eval_data, eval_per_epoch, epoch_per_eval)

model.train(epoch_size, train_data,
            callbacks=[ckpoint_cb, LossMonitor(375), eval_cb],
            dataset_sink_mode=False)
# Remove stale checkpoint artifacts from earlier runs (Linux shell only).
os.system('rm -rf {0}*.ckpt {0}*.meta {0}*.pb'.format(model_path))

# Build the trainable model with accuracy as the evaluation metric.
model = Model(network, net_loss, net_opt, metrics={"Accuracy": Accuracy()})

# Save the network and parameters for subsequent fine-tuning:
# a checkpoint every 375 steps, at most 16 kept on disk.
config_ck = CheckpointConfig(save_checkpoint_steps=375, keep_checkpoint_max=16)
ckpoint_cb = ModelCheckpoint(prefix="checkpoint_lenet",
                             directory=model_path,
                             config=config_ck)

# Containers filled in by the callback below: per-step loss values and
# periodic accuracy measurements.
steps_loss = {"step": [], "loss_value": []}
steps_eval = {"step": [], "acc": []}
step_loss_acc_info = StepLossAccInfo(model, ds_eval, steps_loss, steps_eval)

model.train(epoch_size, ds_train,
            callbacks=[ckpoint_cb, LossMonitor(125), step_loss_acc_info],
            dataset_sink_mode=False)

# Plot the collected loss and accuracy curves.
loss_show(steps_loss)
eval_show(steps_eval)
def train_and_eval(config): """ test_train_eval """ data_path = config.data_path batch_size = config.batch_size epochs = config.epochs if config.dataset_type == "tfrecord": dataset_type = DataType.TFRECORD elif config.dataset_type == "mindrecord": dataset_type = DataType.MINDRECORD else: dataset_type = DataType.H5 host_device_mix = bool(config.host_device_mix) print("epochs is {}".format(epochs)) if config.full_batch: context.set_auto_parallel_context(full_batch=True) de.config.set_seed(1) if config.field_slice: compute_manual_shape(config, get_group_size()) ds_train = create_dataset(data_path, train_mode=True, epochs=1, batch_size=batch_size*get_group_size(), data_type=dataset_type, manual_shape=config.manual_shape, target_column=config.field_size) ds_eval = create_dataset(data_path, train_mode=False, epochs=1, batch_size=batch_size*get_group_size(), data_type=dataset_type, manual_shape=config.manual_shape, target_column=config.field_size) else: ds_train = create_dataset(data_path, train_mode=True, epochs=1, batch_size=batch_size*get_group_size(), data_type=dataset_type) ds_eval = create_dataset(data_path, train_mode=False, epochs=1, batch_size=batch_size*get_group_size(), data_type=dataset_type) else: ds_train = create_dataset(data_path, train_mode=True, epochs=1, batch_size=batch_size, rank_id=get_rank(), rank_size=get_group_size(), data_type=dataset_type) ds_eval = create_dataset(data_path, train_mode=False, epochs=1, batch_size=batch_size, rank_id=get_rank(), rank_size=get_group_size(), data_type=dataset_type) print("ds_train.size: {}".format(ds_train.get_dataset_size())) print("ds_eval.size: {}".format(ds_eval.get_dataset_size())) net_builder = ModelBuilder() train_net, eval_net = net_builder.get_net(config) train_net.set_train() auc_metric = AUCMetric() model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric}) eval_callback = EvalCallBack( model, ds_eval, auc_metric, config, host_device_mix=host_device_mix) callback = 
LossCallBack(config=config, per_print_times=20) ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size()*epochs, keep_checkpoint_max=5, integrated_save=False) ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/', config=ckptconfig) context.set_auto_parallel_context(strategy_ckpt_save_file=config.stra_ckpt) callback_list = [TimeMonitor( ds_train.get_dataset_size()), eval_callback, callback] if not host_device_mix: callback_list.append(ckpoint_cb) model.train(epochs, ds_train, callbacks=callback_list, dataset_sink_mode=(not host_device_mix))
def train_and_eval(config): """ test_train_eval """ set_seed(1000) data_path = config.data_path batch_size = config.batch_size epochs = config.epochs if config.dataset_type == "tfrecord": dataset_type = DataType.TFRECORD elif config.dataset_type == "mindrecord": dataset_type = DataType.MINDRECORD else: dataset_type = DataType.H5 parameter_server = bool(config.parameter_server) cache_enable = config.vocab_cache_size > 0 print("epochs is {}".format(epochs)) ds_train = create_dataset(data_path, train_mode=True, epochs=1, batch_size=batch_size, data_type=dataset_type) ds_eval = create_dataset(data_path, train_mode=False, epochs=1, batch_size=batch_size, data_type=dataset_type) print("ds_train.size: {}".format(ds_train.get_dataset_size())) print("ds_eval.size: {}".format(ds_eval.get_dataset_size())) net_builder = ModelBuilder() train_net, eval_net = net_builder.get_net(config) train_net.set_train() auc_metric = AUCMetric() model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric}) eval_callback = EvalCallBack(model, ds_eval, auc_metric, config) callback = LossCallBack(config=config) if _is_role_worker(): if cache_enable: ckptconfig = CheckpointConfig( save_checkpoint_steps=ds_train.get_dataset_size() * epochs, keep_checkpoint_max=1) else: ckptconfig = CheckpointConfig( save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5) else: ckptconfig = CheckpointConfig(save_checkpoint_steps=1, keep_checkpoint_max=1) ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=config.ckpt_path, config=ckptconfig) callback_list = [ TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb ] model.train(epochs, ds_train, callbacks=callback_list, dataset_sink_mode=(parameter_server and cache_enable))
def train_and_eval(config): """ test_train_eval """ set_seed(1000) data_path = config.data_path batch_size = config.batch_size sparse = config.sparse epochs = config.epochs if config.dataset_type == "tfrecord": dataset_type = DataType.TFRECORD elif config.dataset_type == "mindrecord": dataset_type = DataType.MINDRECORD else: dataset_type = DataType.H5 print("epochs is {}".format(epochs)) ds_train = create_dataset(data_path, train_mode=True, epochs=1, batch_size=batch_size, rank_id=get_rank(), rank_size=get_group_size(), data_type=dataset_type) ds_eval = create_dataset(data_path, train_mode=False, epochs=1, batch_size=batch_size, rank_id=get_rank(), rank_size=get_group_size(), data_type=dataset_type) print("ds_train.size: {}".format(ds_train.get_dataset_size())) print("ds_eval.size: {}".format(ds_eval.get_dataset_size())) net_builder = ModelBuilder() train_net, eval_net = net_builder.get_net(config) train_net.set_train() auc_metric = AUCMetric() model = Model(train_net, eval_network=eval_net, metrics={"auc": auc_metric}) eval_callback = EvalCallBack(model, ds_eval, auc_metric, config) callback = LossCallBack(config=config) ckptconfig = CheckpointConfig( save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5) ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=config.ckpt_path + '/ckpt_' + str(get_rank()) + '/', config=ckptconfig) out = model.eval(ds_eval) print("=====" * 5 + "model.eval() initialized: {}".format(out)) callback_list = [ TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback ] if get_rank() == 0: callback_list.append(ckpoint_cb) model.train(epochs, ds_train, callbacks=callback_list, sink_size=ds_train.get_dataset_size(), dataset_sink_mode=(not sparse))
# NOTE(review): the two attribute assignments below are the tail of
# Net.__init__ — the class header and the self.fc1 definition lie before
# this chunk; indentation restored to class-body level.
        self.fc2 = nn.Dense(hidden_size, 1)   # hidden -> single output unit
        self.sig = ops.Sigmoid()

    def construct(self, x):
        # Forward pass: fc1 -> sigmoid -> fc2 (single-hidden-layer MLP).
        x = self.fc1(x)
        x = self.sig(x)
        x = self.fc2(x)
        return x


m = Net(HIDDEN_SIZE)
optim = nn.Momentum(m.trainable_params(), 0.05, 0.9)
loss = nn.MSELoss()
loss_cb = LossMonitor()
# NOTE(review): an Accuracy metric paired with MSELoss on a scalar output
# looks mismatched — confirm this is intentional.
model = Model(m, loss, optim, {'acc': Accuracy()})
time_cb = TimeMonitor(data_size=ds_train.get_dataset_size())
model.train(ITERATIONS, ds_train,
            callbacks=[time_cb, loss_cb], dataset_sink_mode=False)

# Probe the trained net on all four 2-bit input combinations
# (presumably a truth-table / XOR-style demo — confirm against the data).
print("TF", model.predict(Tensor([[1, 0]], mindspore.float32)).asnumpy())
print("FF", model.predict(Tensor([[0, 0]], mindspore.float32)).asnumpy())
print("TT", model.predict(Tensor([[1, 1]], mindspore.float32)).asnumpy())
print("FT", model.predict(Tensor([[0, 1]], mindspore.float32)).asnumpy())