load_path = args_opt.pre_trained
if args_opt.task_type == "Pretraining":
    print("load backbone vgg16 ckpt {}".format(args_opt.pre_trained))
    param_dict = load_checkpoint(load_path)
    # keep only the VGG16 feature-extractor weights from the backbone checkpoint
    for item in list(param_dict.keys()):
        if not item.startswith('vgg16_feature_extractor'):
            param_dict.pop(item)
    load_param_into_net(net, param_dict)
else:
    if load_path != "":
        print("load pretrain ckpt {}".format(args_opt.pre_trained))
        param_dict = load_checkpoint(load_path)
        load_param_into_net(net, param_dict)

loss = LossNet()
lr = Tensor(dynamic_lr(training_cfg, dataset_size), mstype.float32)
opt = Momentum(params=net.trainable_params(), learning_rate=lr, momentum=config.momentum,
               weight_decay=config.weight_decay, loss_scale=config.loss_scale)
net_with_loss = WithLossCell(net, loss)
if args_opt.run_distribute:
    net = TrainOneStepCell(net_with_loss, net, opt, sens=config.loss_scale,
                           reduce_flag=True, mean=True, degree=device_num)
else:
    net = TrainOneStepCell(net_with_loss, net, opt, sens=config.loss_scale)

time_cb = TimeMonitor(data_size=dataset_size)
loss_cb = LossCallBack(rank_id=rank)
cb = [time_cb, loss_cb]
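# A hedged sketch of the step that usually follows: wrapping the assembled train cell in a
# Model and launching training with the callbacks built above. The dataset variable and the
# epoch count are assumptions, not part of the original script.
#
#     model = Model(net)
#     model.train(config.epoch_size, dataset, callbacks=cb, dataset_sink_mode=True)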
def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoint_path="", epoch_num=1):
    """
    Do training.

    Args:
        dataset: the training dataset.
        network: the network with loss.
        load_checkpoint_path: path of the pretrained model checkpoint to load.
        save_checkpoint_path: path where the finetuned model checkpoint will be saved.
        epoch_num: the number of epochs.
    """
    if load_checkpoint_path == "":
        raise ValueError("Pretrained model is missing; the finetune task must load a pretrained model!")
    steps_per_epoch = dataset.get_dataset_size()

    # optimizer
    if cfg.optimizer == 'AdamWeightDecay':
        lr_schedule = GPT2LearningRate(learning_rate=cfg.AdamWeightDecay.learning_rate,
                                       end_learning_rate=cfg.AdamWeightDecay.end_learning_rate,
                                       warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
                                       decay_steps=steps_per_epoch * epoch_num,
                                       power=cfg.AdamWeightDecay.power)
        params = network.trainable_params()
        # group parameters: apply weight decay only to those accepted by the decay filter
        decay_params = list(filter(cfg.AdamWeightDecay.decay_filter, params))
        other_params = list(filter(lambda x: not cfg.AdamWeightDecay.decay_filter(x), params))
        group_params = [{'params': decay_params, 'weight_decay': cfg.AdamWeightDecay.weight_decay},
                        {'params': other_params, 'weight_decay': 0.0}]
        optimizer = AdamWeightDecay(group_params, lr_schedule, eps=cfg.AdamWeightDecay.eps)
    elif cfg.optimizer == 'Lamb':
        lr_schedule = GPT2LearningRate(learning_rate=cfg.Lamb.learning_rate,
                                       end_learning_rate=cfg.Lamb.end_learning_rate,
                                       warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
                                       decay_steps=steps_per_epoch * epoch_num,
                                       power=cfg.Lamb.power)
        optimizer = Lamb(network.trainable_params(), lr_schedule)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(network.trainable_params(), cfg.Momentum.learning_rate, cfg.Momentum.momentum)
    else:
        raise Exception("Optimizer not supported. support: [AdamWeightDecay, Lamb, Momentum]")

    # checkpoint saving
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1)
    prefix_name = "gpt2_summarization_" + str(cfg.gpt2_network) + "_" + str(cfg.optimizer) + "_" \
                  + str(epoch_num) + "_bs" + str(gpt2_net_cfg.batch_size)
    ckpoint_cb = ModelCheckpoint(prefix=prefix_name,
                                 directory=None if save_checkpoint_path == "" else save_checkpoint_path,
                                 config=ckpt_config)

    # load the pretrained checkpoint into the network, remapping parameter names
    param_dict = load_checkpoint(load_checkpoint_path)
    final_param_dict = {}
    for name, value in param_dict.items():
        final_param_dict['gpt2.gpt2.' + name] = value
    # tie the LM head weight to the token embedding table
    final_param_dict['gpt2.lm_head.weight'] = param_dict['gpt2_embedding_lookup.embedding_table']
    load_param_into_net(network, final_param_dict)
    print("Load pretrained parameters successfully!\n")

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
    netwithgrads = GPT2FinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
    netwithgrads.set_train(True)
    loss_cb = LossMonitor(per_print_times=1)
    model = Model(netwithgrads)
    callbacks = [TimeMonitor(dataset.get_dataset_size()), loss_cb, ckpoint_cb]

    print("============== Starting Finetuning ==============")
    model.train(epoch_num, dataset, callbacks=callbacks, dataset_sink_mode=False)
    print("============== Finetuning Success ==============")
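# A minimal, self-contained sketch of the parameter-grouping pattern used in do_train above:
# weight decay is applied to most weights while bias/LayerNorm parameters are excluded.
# The toy network and the name-based filter are illustrative assumptions, not the project's
# actual cfg.AdamWeightDecay.decay_filter.
import mindspore.nn as nn

toy_net = nn.SequentialCell([nn.Dense(16, 32), nn.ReLU(), nn.Dense(32, 4)])
all_params = toy_net.trainable_params()
# exclude bias and LayerNorm gamma/beta parameters from weight decay
decay_params = [p for p in all_params
                if not any(k in p.name.lower() for k in ('bias', 'gamma', 'beta'))]
decay_names = {p.name for p in decay_params}
no_decay_params = [p for p in all_params if p.name not in decay_names]
grouped = [{'params': decay_params, 'weight_decay': 0.01},
           {'params': no_decay_params, 'weight_decay': 0.0}]
toy_optimizer = nn.AdamWeightDecay(grouped, learning_rate=1e-4, eps=1e-8)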
def _build_training_pipeline(config: TransformerConfig,
                             pre_training_dataset=None,
                             fine_tune_dataset=None,
                             test_dataset=None,
                             platform="Ascend"):
    """
    Build training pipeline.

    Args:
        config (TransformerConfig): Config of mass model.
        pre_training_dataset (Dataset): Pre-training dataset.
        fine_tune_dataset (Dataset): Fine-tune dataset.
        test_dataset (Dataset): Test dataset.
        platform (str): Target device, e.g. "Ascend".
    """
    net_with_loss = TransformerNetworkWithLoss(config, is_training=True)
    net_with_loss.init_parameters_data()

    if config.existed_ckpt:
        # load weights from an existing checkpoint (.npz or MindSpore .ckpt)
        if config.existed_ckpt.endswith(".npz"):
            weights = np.load(config.existed_ckpt)
        else:
            weights = load_checkpoint(config.existed_ckpt)
        for param in net_with_loss.trainable_params():
            weights_name = param.name
            if weights_name not in weights:
                raise ValueError(f"Param {weights_name} is not found in ckpt file.")
            if isinstance(weights[weights_name], Parameter):
                param.default_input = weights[weights_name].default_input
            elif isinstance(weights[weights_name], Tensor):
                param.default_input = Tensor(weights[weights_name].asnumpy(), config.dtype)
            elif isinstance(weights[weights_name], np.ndarray):
                param.default_input = Tensor(weights[weights_name], config.dtype)
            else:
                param.default_input = weights[weights_name]
    else:
        # no checkpoint: initialize gamma to ones, beta/bias to zeros, everything else randomly
        for param in net_with_loss.trainable_params():
            name = param.name
            value = param.default_input
            if isinstance(value, Tensor):
                if name.endswith(".gamma"):
                    param.default_input = one_weight(value.asnumpy().shape)
                elif name.endswith(".beta") or name.endswith(".bias"):
                    param.default_input = zero_weight(value.asnumpy().shape)
                else:
                    param.default_input = weight_variable(value.asnumpy().shape)

    dataset = pre_training_dataset if pre_training_dataset is not None else fine_tune_dataset
    if dataset is None:
        raise ValueError("Either a pre-training dataset or a fine-tuning dataset must be provided.")

    update_steps = dataset.get_repeat_count() * dataset.get_dataset_size()
    if config.lr_scheduler == "isr":
        lr = Tensor(square_root_schedule(lr=config.lr,
                                         update_num=update_steps,
                                         decay_start_step=config.decay_start_step,
                                         warmup_steps=config.warmup_steps,
                                         min_lr=config.min_lr),
                    dtype=mstype.float32)
    elif config.lr_scheduler == "poly":
        lr = Tensor(polynomial_decay_scheduler(lr=config.lr,
                                               min_lr=config.min_lr,
                                               decay_steps=config.decay_steps,
                                               total_update_num=update_steps,
                                               warmup_steps=config.warmup_steps,
                                               power=config.poly_lr_scheduler_power),
                    dtype=mstype.float32)
    else:
        lr = config.lr

    if config.optimizer.lower() == "adam":
        optimizer = Adam(net_with_loss.trainable_params(), lr, beta1=0.9, beta2=0.98)
    elif config.optimizer.lower() == "lamb":
        lr = BertLearningRate(decay_steps=12000, learning_rate=config.lr,
                              end_learning_rate=config.min_lr,
                              power=10.0, warmup_steps=config.warmup_steps)
        decay_params = list(filter(lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower(),
                                   net_with_loss.trainable_params()))
        other_params = list(filter(lambda x: 'layernorm' in x.name.lower() or 'bias' in x.name.lower(),
                                   net_with_loss.trainable_params()))
        group_params = [{'params': decay_params, 'weight_decay': 0.01},
                        {'params': other_params}]
        optimizer = Lamb(group_params, lr, eps=1e-6)
    elif config.optimizer.lower() == "momentum":
        optimizer = Momentum(net_with_loss.trainable_params(), lr, momentum=0.9)
    else:
        raise ValueError("optimizer only supports `adam`, `lamb` and `momentum` now.")

    # loss scale
    if platform == "Ascend":
        scale_manager = DynamicLossScaleManager(init_loss_scale=config.init_loss_scale,
                                                scale_factor=config.loss_scale_factor,
                                                scale_window=config.scale_window)
    else:
        scale_manager = FixedLossScaleManager(loss_scale=1.0, drop_overflow_update=True)
    net_with_grads = TransformerTrainOneStepWithLossScaleCell(network=net_with_loss,
                                                              optimizer=optimizer,
                                                              scale_update_cell=scale_manager.get_update_cell())
    net_with_grads.set_train(True)
    model = Model(net_with_grads)
    loss_monitor = LossCallBack(config)
    ckpt_config = CheckpointConfig(save_checkpoint_steps=config.save_ckpt_steps,
                                   keep_checkpoint_max=config.keep_ckpt_max)
    rank_size = os.getenv('RANK_SIZE')
    callbacks = [loss_monitor]
    # in distributed mode, save checkpoints only on rank 0 of each 8-device group
    if rank_size is not None and int(rank_size) > 1 and MultiAscend.get_rank() % 8 == 0:
        ckpt_callback = ModelCheckpoint(prefix=config.ckpt_prefix,
                                        directory=os.path.join(config.ckpt_path,
                                                               'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
                                        config=ckpt_config)
        callbacks.append(ckpt_callback)
    if rank_size is None or int(rank_size) == 1:
        ckpt_callback = ModelCheckpoint(prefix=config.ckpt_prefix,
                                        directory=os.path.join(config.ckpt_path,
                                                               'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
                                        config=ckpt_config)
        callbacks.append(ckpt_callback)

    print(" | ALL SET, PREPARE TO TRAIN.")
    _train(model=model, config=config,
           pre_training_dataset=pre_training_dataset,
           fine_tune_dataset=fine_tune_dataset,
           test_dataset=test_dataset,
           callbacks=callbacks)
def do_train(dataset=None, network=None, load_checkpoint_path="", save_checkpoint_path="", epoch_num=1):
    """
    Do training.

    Args:
        dataset: the training dataset.
        network: the network with loss.
        load_checkpoint_path: path of the pretrained model checkpoint to load.
        save_checkpoint_path: path where the finetuned model checkpoint will be saved.
        epoch_num: the number of epochs.
    """
    if load_checkpoint_path == "":
        raise ValueError("Pretrained model is missing; the finetune task must load a pretrained model!")
    steps_per_epoch = dataset.get_dataset_size()  # samples / batch_size

    # select optimizer
    if cfg.optimizer == 'AdamWeightDecay':
        lr_schedule = GPT2LearningRate(learning_rate=cfg.AdamWeightDecay.learning_rate,
                                       end_learning_rate=cfg.AdamWeightDecay.end_learning_rate,
                                       warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
                                       decay_steps=steps_per_epoch * epoch_num,
                                       power=cfg.AdamWeightDecay.power)
        params = network.trainable_params()  # a list of all trainable parameters of the network
        # use parameter groups and set different weight-decay values
        decay_params = list(filter(cfg.AdamWeightDecay.decay_filter, params))  # without layernorm and bias
        other_params = list(filter(lambda x: not cfg.AdamWeightDecay.decay_filter(x), params))  # layernorm and bias
        group_params = [{'params': decay_params, 'weight_decay': cfg.AdamWeightDecay.weight_decay},
                        {'params': other_params, 'weight_decay': 0.0}]
        optimizer = AdamWeightDecay(group_params, lr_schedule, eps=cfg.AdamWeightDecay.eps)
    elif cfg.optimizer == 'Lamb':
        lr_schedule = GPT2LearningRate(learning_rate=cfg.Lamb.learning_rate,
                                       end_learning_rate=cfg.Lamb.end_learning_rate,
                                       warmup_steps=int(steps_per_epoch * epoch_num * 0.1),
                                       decay_steps=steps_per_epoch * epoch_num,
                                       power=cfg.Lamb.power)
        optimizer = Lamb(network.trainable_params(), lr_schedule)
    elif cfg.optimizer == 'Momentum':
        optimizer = Momentum(network.trainable_params(), cfg.Momentum.learning_rate, cfg.Momentum.momentum)
    else:
        raise Exception("Optimizer not supported. support: [AdamWeightDecay, Lamb, Momentum]")

    # checkpoint saving
    ckpt_config = CheckpointConfig(save_checkpoint_steps=steps_per_epoch, keep_checkpoint_max=1)
    ckpoint_cb = ModelCheckpoint(prefix="gpt2_translation_en_fr_",
                                 directory=None if save_checkpoint_path == "" else save_checkpoint_path,
                                 config=ckpt_config)

    # load the pretrained checkpoint into the network, remapping parameter names
    param_dict = load_checkpoint(load_checkpoint_path)
    final_param_dict = {}
    for k, v in param_dict.items():
        final_param_dict['gpt2_loss.gpt2.gpt2.' + k] = v
    # set the weights of the final linear layer to the GPT-2 token embedding weights
    final_param_dict['gpt2_loss.gpt2.dense1.weight'] = param_dict['gpt2_embedding_lookup.embedding_table']
    load_param_into_net(network, final_param_dict)
    print("| loading the pretrained weights |\n")

    update_cell = DynamicLossScaleUpdateCell(loss_scale_value=2**32, scale_factor=2, scale_window=1000)
    netwithgrads = GPT2FinetuneCell(network, optimizer=optimizer, scale_update_cell=update_cell)
    netwithgrads.set_train(True)
    loss_cb = LossMonitor()
    model = Model(netwithgrads, amp_level='O2')
    callbacks = [TimeMonitor(dataset.get_dataset_size()), loss_cb, ckpoint_cb]

    print("============== Starting Training For Translation Task ==============")
    model.train(epoch_num, dataset, callbacks=callbacks)
    print("============== Translation Training Success ==============")
    return ds


if __name__ == '__main__':
    if args_opt.mode == 'train' and args_opt.run_distribute:
        context.set_auto_parallel_context(device_num=args_opt.device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        auto_parallel_context().set_all_reduce_fusion_split_indices([140])
        init()

    epoch_size = args_opt.epoch_size
    net = resnet50(args_opt.num_classes)
    ls = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), 0.01, 0.9)
    model = Model(net, loss_fn=ls, optimizer=opt, metrics={'acc'})

    if args_opt.mode == 'train':  # train
        print("============== Starting Training ==============")
        dataset = create_dataset()
        batch_num = dataset.get_dataset_size()
        config_ck = CheckpointConfig(save_checkpoint_steps=batch_num, keep_checkpoint_max=10)
        checkpoint_path = args_opt.checkpoint_path if args_opt.checkpoint_path is not None else "./"
        ckpoint_cb = ModelCheckpoint(prefix="train_resnet_cifar10",
                                     directory=checkpoint_path,
                                     config=config_ck)
        loss_cb = LossMonitor()
        model.train(epoch_size, dataset, callbacks=[ckpoint_cb, loss_cb])
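    # A hedged sketch of the evaluation branch that typically complements the training branch
    # above; the checkpoint path and the create_dataset arguments are placeholders, not taken
    # from the original script.
    #
    #     else:  # eval
    #         from mindspore import load_checkpoint, load_param_into_net
    #         print("============== Starting Evaluation ==============")
    #         param_dict = load_checkpoint("/path/to/train_resnet_cifar10.ckpt")
    #         load_param_into_net(net, param_dict)
    #         eval_dataset = create_dataset(training=False)
    #         res = model.eval(eval_dataset)
    #         print("result:", res)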
def compile_net(net):
    optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
    train_net = TrainOneStepCell(net, optimizer)
    train_net.set_auto_parallel()
    _executor.compile(train_net, _x, _b)
    context.reset_auto_parallel_context()
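# A minimal, self-contained sketch of how a compile-only helper like compile_net is typically
# exercised in MindSpore parallel unit tests. The demo network, input shapes and parallel
# settings below are illustrative assumptions, not taken from the original test file.
import numpy as np
import mindspore.nn as nn
from mindspore import Tensor, context

_x = Tensor(np.ones([64, 32]).astype(np.float32))
_b = Tensor(np.ones([64, 32]).astype(np.float32))


class _DemoNet(nn.Cell):
    """Toy network with one trainable layer so Momentum has parameters to optimize."""
    def __init__(self):
        super().__init__()
        self.fc = nn.Dense(32, 32)

    def construct(self, x, b):
        return self.fc(x) + b


context.set_auto_parallel_context(device_num=8, global_rank=0, parallel_mode="semi_auto_parallel")
compile_net(_DemoNet())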
The sample can be run on CPU/GPU/Ascend.
"""
import mindspore.nn as nn
from mindspore.nn import Momentum, SoftmaxCrossEntropyWithLogits
from mindspore import Model, context
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor
from src.dataset import create_train_dataset, create_eval_dataset
from src.net import Net

if __name__ == "__main__":
    context.set_context(mode=context.GRAPH_MODE)
    ds_train = create_train_dataset()
    ds_eval = create_eval_dataset()
    net = Net()
    net_opt = Momentum(net.trainable_params(), 0.01, 0.9)
    net_loss = SoftmaxCrossEntropyWithLogits(reduction='mean')
    metrics = {
        'Accuracy': nn.Accuracy(),
        'Loss': nn.Loss(),
        'Precision': nn.Precision(),
        'Recall': nn.Recall(),
        'F1_score': nn.F1()
    }
    config_ck = CheckpointConfig(save_checkpoint_steps=1000, keep_checkpoint_max=10)
    ckpoint = ModelCheckpoint(prefix="CKPT", config=config_ck)
    model = Model(network=net, loss_fn=net_loss, optimizer=net_opt, metrics=metrics)
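    # A hedged sketch of how this Model is typically driven once it is built; the epoch count,
    # callback list and sink mode below are assumptions, not taken from the original sample:
    #
    #     model.train(10, ds_train, callbacks=[ckpoint, LossMonitor()], dataset_sink_mode=False)
    #     results = model.eval(ds_eval)  # dict with Accuracy, Loss, Precision, Recall and F1_score
    #     print(results)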