def run_transformer_train():
    """
    Transformer training.
    """
    parser = argparse_init()
    args, _ = parser.parse_known_args()
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args.device_id)
    context.set_context(reserve_class_name_in_scope=False, enable_auto_mixed_precision=False)
    if args.distribute == "true":
        device_num = args.device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True,
                                          parameter_broadcast=True, device_num=device_num)
        D.init()
        rank_id = args.device_id % device_num
    else:
        device_num = 1
        rank_id = 0
    dataset = create_transformer_dataset(epoch_count=1, rank_size=device_num,
                                         rank_id=rank_id, do_shuffle=args.do_shuffle,
                                         enable_data_sink=args.enable_data_sink,
                                         dataset_path=args.data_path)

    netwithloss = TransformerNetworkWithLoss(transformer_net_cfg, True)

    if args.checkpoint_path:
        parameter_dict = load_checkpoint(args.checkpoint_path)
        load_param_into_net(netwithloss, parameter_dict)

    lr = Tensor(create_dynamic_lr(schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay",
                                  training_steps=dataset.get_dataset_size() * args.epoch_size,
                                  learning_rate=cfg.lr_schedule.learning_rate,
                                  warmup_steps=cfg.lr_schedule.warmup_steps,
                                  hidden_size=transformer_net_cfg.hidden_size,
                                  start_decay_step=cfg.lr_schedule.start_decay_step,
                                  min_lr=cfg.lr_schedule.min_lr), mstype.float32)
    optimizer = Adam(netwithloss.trainable_params(), lr)

    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack()]
    if args.enable_save_ckpt == "true":
        if device_num == 1 or (device_num > 1 and rank_id == 0):
            ckpt_config = CheckpointConfig(save_checkpoint_steps=args.save_checkpoint_steps,
                                           keep_checkpoint_max=args.save_checkpoint_num)
            ckpoint_cb = ModelCheckpoint(prefix='transformer', directory=args.save_checkpoint_path,
                                         config=ckpt_config)
            callbacks.append(ckpoint_cb)

    if args.enable_lossscale == "true":
        scale_manager = DynamicLossScaleManager(init_loss_scale=cfg.init_loss_scale_value,
                                                scale_factor=cfg.scale_factor,
                                                scale_window=cfg.scale_window)
        update_cell = scale_manager.get_update_cell()
        netwithgrads = TransformerTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                                scale_update_cell=update_cell)
    else:
        netwithgrads = TransformerTrainOneStepCell(netwithloss, optimizer=optimizer)

    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    model.train(args.epoch_size, dataset, callbacks=callbacks,
                dataset_sink_mode=(args.enable_data_sink == "true"))
def test_bert_tdt():
    """test bert tdt"""
    np.random.seed(0)
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False)
    context.set_context(enable_graph_kernel=True)
    ds = me_de_train_dataset()
    config = get_config(version='large', batch_size=16)
    netwithloss = BertNetworkWithLoss(config, True)
    optimizer = Lamb(netwithloss.trainable_params(),
                     decay_steps=ds.get_dataset_size() * ds.get_repeat_count(),
                     start_learning_rate=5e-5, end_learning_rate=1e-9,
                     power=10.0, warmup_steps=0, weight_decay=0.01)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(262144, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                     scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    params = netwithloss.trainable_params()
    for param in params:
        param.init_data()
        value = param.default_input
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info("***************** BERT param name is 1 {}".format(name))
                    param.default_input = weight_variable(value.asnumpy().shape)
                else:
                    logger.info("***************** BERT param name is 2 {}".format(name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.default_input = Tensor(np.transpose(weight_value, [1, 0]))
            else:
                logger.info("***************** BERT param name is 3 {}".format(name))
                param.default_input = weight_variable(value.asnumpy().shape)
    model.train(1, ds, callbacks=callback, dataset_sink_mode=False)

    # assertion occurs while the loss value, overflow state or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    expect_loss_value = [12.559319, 12.333815, 12.339806, 12.350235, 12.343947,
                         12.830965, 12.375336, 12.973715, 12.57929, 12.7766905]
    error = loss_value - expect_loss_value
    print("loss value: {}".format(loss_value))
    print("error value: {}".format(error))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [True, True, True, True, False, False, False, True, False, False]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [131072.0, 65536.0, 32768.0, 16384.0, 16384.0,
                         16384.0, 32768.0, 16384.0, 16384.0, 16384.0]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)
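# NOTE: The expect_overflow / expect_loss_scale assertions above follow from
# DynamicLossScaleManager's update rule: the scale is halved on an overflowing
# step and multiplied by scale_factor only after scale_window consecutive
# overflow-free steps. Below is a minimal pure-Python sketch of that rule, an
# illustration of the behavior the tests assume rather than the MindSpore
# source; simulate_dynamic_loss_scale is a hypothetical helper name.
def simulate_dynamic_loss_scale(overflows, init_scale, factor=2, window=3):
    """Replay the dynamic loss-scale update rule over per-step overflow flags;
    returns the scale observed after each step, which is what
    callback.lossscale_list records in the tests in this collection."""
    scale, clean_steps, observed = float(init_scale), 0, []
    for overflow in overflows:
        if overflow:
            scale = max(scale / factor, 1.0)  # halve on overflow
            clean_steps = 0
        else:
            clean_steps += 1
            if clean_steps == window:  # grow after `window` clean steps
                scale *= factor
                clean_steps = 0
        observed.append(scale)
    return observed

# Reproduces the sequence asserted above (init 262144, factor 2, window 3):
assert simulate_dynamic_loss_scale(
    [True, True, True, True, False, False, False, True, False, False],
    init_scale=262144) == [131072.0, 65536.0, 32768.0, 16384.0, 16384.0,
                           16384.0, 32768.0, 16384.0, 16384.0, 16384.0]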
def test_compile_fp16_lr_overflow_dynamic_graph():
    inputs = Tensor(np.ones([16, 16]).astype(np.float32))
    label = Tensor(np.zeros([16, 16]).astype(np.float32))
    lr = Tensor(np.ones([1], np.float32) * 0.1)
    net = NetFP16(16, 16)
    loss = MSELoss()
    optimizer = Momentum(net.trainable_params(), learning_rate=lr, momentum=0.9)
    net_with_loss = WithLossCell(net, loss)
    scale_manager = DynamicLossScaleManager()
    update_cell = scale_manager.get_update_cell()
    train_network = TrainOneStepWithLossScaleCell(net_with_loss, optimizer,
                                                  scale_update_cell=update_cell)
    train_network.set_train()
    output = train_network(inputs, label)
    print("the result is ", output)
def test_compile_grad_error():
    inputs = Tensor(np.ones([16, 16]).astype(np.float32))
    label = Tensor(np.zeros([16, 16]).astype(np.float32))
    lr = Tensor(np.ones([1], np.float32) * 0.1)
    net = NetFP16(16, 16)
    loss = MSELoss()
    optimizer = Momentum(net.trainable_params(), learning_rate=lr, momentum=0.9)
    net_with_loss = WithLossCell(net, loss)
    scale_manager = DynamicLossScaleManager()
    update_cell = scale_manager.get_update_cell()
    train_network = TrainOneStepWithLossScaleCell(net_with_loss, optimizer,
                                                  scale_update_cell=update_cell)
    train_network.set_train()
    with pytest.raises(TypeError) as e:
        train_network(inputs, label)
        print(e)
def loss_scale_manager_common(strategy1):
    learning_rate = 0.1
    momentum = 0.9
    epoch_size = 2
    context.reset_auto_parallel_context()
    context.set_auto_parallel_context(parallel_mode=ParallelMode.AUTO_PARALLEL, device_num=8)
    predict = Tensor(np.ones([32, 128]), dtype=ms.float32)
    label = Tensor(np.ones([32]), dtype=ms.int32)
    dataset = Dataset(predict, label, 2)
    net = all_to_all_net(strategy1)
    loss = SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
    loss.softmax_cross_entropy.set_strategy(((8, 1), (8, 1)))
    opt = Momentum(net.trainable_params(), learning_rate, momentum)
    scale_manager = DynamicLossScaleManager(32, 2, 2000)
    model = Model(net, loss, opt, loss_scale_manager=scale_manager)
    # Without GE, outputs = self._train_network(*next_element) returns a plain
    # tensor, so model.train is expected to raise a TypeError here.
    try:
        model.train(epoch_size, dataset, dataset_sink_mode=False)
    except TypeError:
        pass
    else:
        assert False
def __init__(self, network, optimizer=None):
    super(ModelBert, self).__init__()
    self.optimizer = optimizer
    manager = DynamicLossScaleManager()
    update_cell = LossScaleUpdateCell(manager)
    self.train_network = BertTrainOneStepWithLossScaleCell(network, self.optimizer,
                                                           scale_update_cell=update_cell)
    self.train_network.set_train()
def test_loss_scale_fp16_model_train_overflow():
    dataset_types = (np.float32, np.float32)
    dataset_shapes = ((16, 16), (16, 16))
    dataset = MindDataSet(dataset_types, dataset_shapes)

    net = NetFP16(16, 16)
    net.set_train()

    loss = MSELoss()
    optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
    scale_manager = DynamicLossScaleManager(init_loss_scale=16, scale_factor=2, scale_window=2)
    model = Model(net, loss_fn=loss, optimizer=optimizer, metrics=None,
                  loss_scale_manager=scale_manager)
    model.train(2, dataset, dataset_sink_mode=False)
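# NOTE: MindDataSet above is a test helper defined elsewhere. A minimal stand-in
# built on GeneratorDataset might look like the sketch below, under the
# assumption that the helper simply yields a fixed number of random batches;
# make_synthetic_dataset and its defaults are hypothetical.
import numpy as np
import mindspore.dataset as ds


def make_synthetic_dataset(dataset_types, dataset_shapes, num_batches=4):
    """Hypothetical stand-in for MindDataSet: yields num_batches random
    (data, label) pairs with the given per-batch dtypes and shapes."""
    def generator():
        for _ in range(num_batches):
            yield tuple(np.random.randn(*shape).astype(dtype)
                        for dtype, shape in zip(dataset_types, dataset_shapes))
    return ds.GeneratorDataset(generator, column_names=["data", "label"])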
def _build_training_pipeline(config: TransformerConfig,
                             pre_training_dataset=None,
                             fine_tune_dataset=None,
                             test_dataset=None):
    """
    Build training pipeline.

    Args:
        config (TransformerConfig): Config of mass model.
        pre_training_dataset (Dataset): Pre-training dataset.
        fine_tune_dataset (Dataset): Fine-tune dataset.
        test_dataset (Dataset): Test dataset.
    """
    net_with_loss = TransformerNetworkWithLoss(config, is_training=True)
    net_with_loss.init_parameters_data()

    if config.existed_ckpt:
        if config.existed_ckpt.endswith(".npz"):
            weights = np.load(config.existed_ckpt)
        else:
            weights = load_checkpoint(config.existed_ckpt)
        for param in net_with_loss.trainable_params():
            weights_name = param.name
            if weights_name not in weights:
                raise ValueError(f"Param {weights_name} is not found in ckpt file.")
            if isinstance(weights[weights_name], Parameter):
                param.default_input = weights[weights_name].default_input
            elif isinstance(weights[weights_name], Tensor):
                param.default_input = Tensor(weights[weights_name].asnumpy(), config.dtype)
            elif isinstance(weights[weights_name], np.ndarray):
                param.default_input = Tensor(weights[weights_name], config.dtype)
            else:
                param.default_input = weights[weights_name]
    else:
        for param in net_with_loss.trainable_params():
            name = param.name
            value = param.default_input
            if isinstance(value, Tensor):
                if name.endswith(".gamma"):
                    param.default_input = one_weight(value.asnumpy().shape)
                elif name.endswith(".beta") or name.endswith(".bias"):
                    param.default_input = zero_weight(value.asnumpy().shape)
                else:
                    param.default_input = weight_variable(value.asnumpy().shape)

    dataset = pre_training_dataset if pre_training_dataset is not None else fine_tune_dataset
    if dataset is None:
        raise ValueError("At least one of the pre-training dataset or fine-tuning dataset must be provided.")

    update_steps = dataset.get_repeat_count() * dataset.get_dataset_size()
    if config.lr_scheduler == "isr":
        lr = Tensor(square_root_schedule(lr=config.lr,
                                         update_num=update_steps,
                                         decay_start_step=config.decay_start_step,
                                         warmup_steps=config.warmup_steps,
                                         min_lr=config.min_lr), dtype=mstype.float32)
    elif config.lr_scheduler == "poly":
        lr = Tensor(polynomial_decay_scheduler(lr=config.lr,
                                               min_lr=config.min_lr,
                                               decay_steps=config.decay_steps,
                                               total_update_num=update_steps,
                                               warmup_steps=config.warmup_steps,
                                               power=config.poly_lr_scheduler_power), dtype=mstype.float32)
    else:
        lr = config.lr

    if config.optimizer.lower() == "adam":
        optimizer = Adam(net_with_loss.trainable_params(), lr, beta1=0.9, beta2=0.98)
    elif config.optimizer.lower() == "lamb":
        lr = BertLearningRate(decay_steps=12000, learning_rate=config.lr,
                              end_learning_rate=config.min_lr,
                              power=10.0, warmup_steps=config.warmup_steps)
        decay_params = list(filter(
            lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower(),
            net_with_loss.trainable_params()))
        other_params = list(filter(
            lambda x: 'layernorm' in x.name.lower() or 'bias' in x.name.lower(),
            net_with_loss.trainable_params()))
        group_params = [{'params': decay_params, 'weight_decay': 0.01},
                        {'params': other_params}]
        optimizer = Lamb(group_params, lr, eps=1e-6)
    elif config.optimizer.lower() == "momentum":
        optimizer = Momentum(net_with_loss.trainable_params(), lr, momentum=0.9)
    else:
        raise ValueError("optimizer only supports `adam`, `lamb` and `momentum` now.")

    # Dynamic loss scale.
    scale_manager = DynamicLossScaleManager(init_loss_scale=config.init_loss_scale,
                                            scale_factor=config.loss_scale_factor,
                                            scale_window=config.scale_window)
    net_with_grads = TransformerTrainOneStepWithLossScaleCell(
        network=net_with_loss, optimizer=optimizer,
        scale_update_cell=scale_manager.get_update_cell())
    net_with_grads.set_train(True)
    model = Model(net_with_grads)
    loss_monitor = LossCallBack(config)
    ckpt_config = CheckpointConfig(save_checkpoint_steps=config.save_ckpt_steps,
                                   keep_checkpoint_max=config.keep_ckpt_max)
    rank_size = os.getenv('RANK_SIZE')
    callbacks = [loss_monitor]
    if rank_size is not None and int(rank_size) > 1 and MultiAscend.get_rank() % 8 == 0:
        ckpt_callback = ModelCheckpoint(
            prefix=config.ckpt_prefix,
            directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
            config=ckpt_config)
        callbacks.append(ckpt_callback)
    if rank_size is None or int(rank_size) == 1:
        ckpt_callback = ModelCheckpoint(
            prefix=config.ckpt_prefix,
            directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
            config=ckpt_config)
        callbacks.append(ckpt_callback)

    print(" | ALL SET, PREPARE TO TRAIN.")
    _train(model=model, config=config,
           pre_training_dataset=pre_training_dataset,
           fine_tune_dataset=fine_tune_dataset,
           test_dataset=test_dataset,
           callbacks=callbacks)
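# NOTE: Two wiring styles for dynamic loss scaling recur throughout these
# snippets: hand the manager to Model, which builds the loss-scaled train
# network internally, or call get_update_cell() and wrap the network in a
# *TrainOneStepWithLossScaleCell explicitly, as the pipeline above does.
# A condensed, self-contained sketch of both styles follows (toy Dense
# network; keyword names follow the older MindSpore API used in these
# snippets, where the update cell is passed as scale_update_cell).
from mindspore import nn
from mindspore.train.model import Model
from mindspore.train.loss_scale_manager import DynamicLossScaleManager

toy_net = nn.Dense(16, 16)
toy_loss = nn.MSELoss()
toy_opt = nn.Momentum(toy_net.trainable_params(), learning_rate=0.1, momentum=0.9)
scale_manager = DynamicLossScaleManager(init_loss_scale=2**16, scale_factor=2, scale_window=2000)

# Style 1: Model builds the loss-scaled train network from loss_fn/optimizer.
model = Model(toy_net, loss_fn=toy_loss, optimizer=toy_opt,
              loss_scale_manager=scale_manager)

# Style 2: wrap explicitly; the wrapped cell returns (loss, overflow flag,
# loss scale), which is what the BERT/Transformer callbacks in this
# collection record.
net_with_loss = nn.WithLossCell(toy_net, toy_loss)
train_cell = nn.TrainOneStepWithLossScaleCell(
    net_with_loss, toy_opt, scale_update_cell=scale_manager.get_update_cell())
model = Model(train_cell)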
def test_bert_precision():
    """test bert precision"""
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False)
    ds, new_repeat_count, _ = me_de_train_dataset()
    version = os.getenv('VERSION', 'large')
    batch_size = 16
    config = get_config(version=version, batch_size=batch_size)
    netwithloss = BertNetworkWithLoss(config, True)
    lr = BertLearningRate(decay_steps=ds.get_dataset_size() * new_repeat_count,
                          learning_rate=5e-5, end_learning_rate=1e-9,
                          power=10.0, warmup_steps=0)
    decay_filter = lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()
    no_decay_filter = lambda x: 'layernorm' in x.name.lower() or 'bias' in x.name.lower()
    decay_params = list(filter(decay_filter, netwithloss.trainable_params()))
    other_params = list(filter(no_decay_filter, netwithloss.trainable_params()))
    group_params = [{'params': decay_params, 'weight_decay': 0.01},
                    {'params': other_params},
                    {'order_params': netwithloss.trainable_params()}]
    optimizer = Lamb(group_params, lr)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2**16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                     scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    params = netwithloss.trainable_params()
    for param in params:
        param.init_data()
        value = param.default_input
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info("***************** BERT param name is 1 {}".format(name))
                    param.default_input = weight_variable(value.asnumpy().shape)
                else:
                    logger.info("***************** BERT param name is 2 {}".format(name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.default_input = Tensor(np.transpose(weight_value, [1, 0]))
            else:
                logger.info("***************** BERT param name is 3 {}".format(name))
                param.default_input = weight_variable(value.asnumpy().shape)
    model.train(new_repeat_count, ds, callbacks=callback, dataset_sink_mode=False)

    # assertion occurs while the loss value, overflow state or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    assert np.allclose(loss_value[0], 12.206575, 0, 0.000001)
    expect_loss_value = [12.206575, 11.865044, 11.828129, 11.826707, 11.82108,
                         12.407423, 12.005459, 12.621225, 12.222903, 12.427446]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [False, False, False, True, False, False, False, True, False, False]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [65536.0, 65536.0, 131072.0, 65536.0, 65536.0,
                         65536.0, 131072.0, 65536.0, 65536.0, 65536.0]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)
def test_bert_tdt():
    """test bert tdt"""
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False)
    ds = me_de_train_dataset()
    version = os.getenv('VERSION', 'large')
    batch_size = int(os.getenv('BATCH_SIZE', '16'))
    config = get_config(version=version, batch_size=batch_size)
    netwithloss = BertNetworkWithLoss(config, True)
    optimizer = Lamb(netwithloss.trainable_params(),
                     decay_steps=ds.get_dataset_size() * ds.get_repeat_count(),
                     start_learning_rate=5e-5, end_learning_rate=1e-9,
                     power=10.0, warmup_steps=0, weight_decay=0.01)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2**16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                     scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    params = netwithloss.trainable_params()
    for param in params:
        param.init_data()
        value = param.default_input
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info("***************** BERT param name is 1 {}".format(name))
                    param.default_input = weight_variable(value.asnumpy().shape)
                else:
                    logger.info("***************** BERT param name is 2 {}".format(name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.default_input = Tensor(np.transpose(weight_value, [1, 0]))
            else:
                logger.info("***************** BERT param name is 3 {}".format(name))
                param.default_input = weight_variable(value.asnumpy().shape)
    model.train(ds.get_repeat_count(), ds, callbacks=callback, dataset_sink_mode=False)

    # assertion occurs while the loss value, overflow state or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    expect_loss_value = [12.207198, 11.980881, 11.984844, 11.879381, 11.832978,
                         12.411333, 12.009284, 12.621277, 12.223178, 12.427385]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [True, True, False, False, False, True, False, False, False, True]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [32768.0, 16384.0, 16384.0, 16384.0, 32768.0,
                         16384.0, 16384.0, 16384.0, 32768.0, 16384.0]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)
def train():
    """train"""
    set_seed(1)
    device_id = int(os.getenv('DEVICE_ID', '0'))
    rank_id = int(os.getenv('RANK_ID', '0'))
    device_num = int(os.getenv('RANK_SIZE', '1'))
    # context.set_context(mode=context.GRAPH_MODE, device_target="GPU", save_graphs=False, device_id=device_id)
    context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU", save_graphs=False,
                        device_id=device_id)

    if device_num > 1:
        init()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                          device_num=device_num, global_rank=device_id,
                                          gradients_mean=True)
    if args.modelArts_mode:
        import moxing as mox
        local_data_url = '/cache/data'
        mox.file.copy_parallel(src_url=args.data_url, dst_url=local_data_url)

    train_dataset = DIV2K(args, name=args.data_train, train=True, benchmark=False)
    train_dataset.set_scale(args.task_id)
    print(len(train_dataset))
    train_de_dataset = ds.GeneratorDataset(train_dataset, ["LR", "HR"],
                                           num_shards=device_num, shard_id=rank_id, shuffle=True)
    train_de_dataset = train_de_dataset.batch(args.batch_size, drop_remainder=True)

    eval_dataset = SRData(args, name=args.data_test, train=False, benchmark=True)
    print(len(eval_dataset))
    eval_ds = ds.GeneratorDataset(eval_dataset, ['LR', 'HR'], shuffle=False)
    eval_ds = eval_ds.batch(1, drop_remainder=True)

    # net_m = RCAN(args)
    net_m = EDSR(args)
    print("Init net weights successfully")

    if args.ckpt_path:
        param_dict = load_checkpoint(args.pth_path)
        load_param_into_net(net_m, param_dict)
        print("Load net weight successfully")

    step_size = train_de_dataset.get_dataset_size()
    lr = []
    for i in range(0, args.epochs):
        cur_lr = args.lr / (2 ** ((i + 1) // 200))
        lr.extend([cur_lr] * step_size)
    opt = nn.Adam(net_m.trainable_params(), learning_rate=lr, loss_scale=args.loss_scale)
    loss = nn.L1Loss()
    loss_scale_manager = DynamicLossScaleManager(init_loss_scale=args.init_loss_scale,
                                                 scale_factor=2, scale_window=1000)
    eval_net = net_m
    model = Model(net_m, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager)

    time_cb = TimeMonitor(data_size=step_size)
    loss_cb = LossMonitor()
    metrics = {
        "psnr": PSNR(rgb_range=args.rgb_range, shave=True),
    }
    eval_cb = EvalCallBack(eval_net, eval_ds, args.test_every, step_size / args.batch_size,
                           metrics=metrics, rank_id=rank_id)
    cb = [time_cb, loss_cb, eval_cb]
    config_ck = CheckpointConfig(save_checkpoint_steps=args.ckpt_save_interval * step_size,
                                 keep_checkpoint_max=args.ckpt_save_max)
    ckpt_cb = ModelCheckpoint(prefix=args.filename, directory=args.ckpt_save_path, config=config_ck)
    if device_id == 0:
        cb += [ckpt_cb]
    model.train(args.epochs, train_de_dataset, callbacks=cb, dataset_sink_mode=True)
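# NOTE: The learning-rate list built in train() above halves the base rate every
# 200 epochs: epoch i trains at args.lr / 2**((i + 1) // 200). The same schedule
# as a standalone function (edsr_lr is a hypothetical name):
def edsr_lr(base_lr, epochs, step_size):
    """Per-step LR list that halves the base rate every 200 epochs."""
    lr = []
    for i in range(epochs):
        lr.extend([base_lr / (2 ** ((i + 1) // 200))] * step_size)
    return lr

# e.g. with base_lr=1e-4: epochs 0-198 train at 1e-4, epochs 199-398 at 5e-5.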
opt = Momentum(params=get_param_groups(net),
               learning_rate=Tensor(lr),
               momentum=cfg.momentum,
               weight_decay=cfg.weight_decay,
               loss_scale=cfg.loss_scale)

if not cfg.use_label_smooth:
    cfg.label_smooth_factor = 0.0
loss = CrossEntropySmooth(sparse=True, reduction="mean",
                          smooth_factor=cfg.label_smooth_factor,
                          num_classes=cfg.num_classes)

if cfg.is_dynamic_loss_scale:
    loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536,
                                                 scale_factor=2,
                                                 scale_window=2000)
else:
    loss_scale_manager = FixedLossScaleManager(cfg.loss_scale, drop_overflow_update=False)

model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'},
              amp_level="O3", loss_scale_manager=loss_scale_manager)

config_ck = CheckpointConfig(save_checkpoint_steps=batch_num * 50,
                             keep_checkpoint_max=cfg.keep_checkpoint_max)
time_cb = TimeMonitor(data_size=batch_num)
def train_alexnet():
    print(config)
    print('device id:', get_device_id())
    print('device num:', get_device_num())
    print('rank id:', get_rank_id())
    print('job id:', get_job_id())

    device_target = config.device_target
    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
    context.set_context(save_graphs=False)
    device_num = get_device_num()
    if config.dataset_name == "cifar10":
        if device_num > 1:
            config.learning_rate = config.learning_rate * device_num
            config.epoch_size = config.epoch_size * 2
    elif config.dataset_name == "imagenet":
        pass
    else:
        raise ValueError("Unsupported dataset.")
    if device_num > 1:
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(device_num=device_num,
                                          parallel_mode=ParallelMode.DATA_PARALLEL,
                                          gradients_mean=True)
        if device_target == "Ascend":
            context.set_context(device_id=get_device_id())
            init()
        elif device_target == "GPU":
            init()
    else:
        context.set_context(device_id=get_device_id())

    if config.dataset_name == "cifar10":
        ds_train = create_dataset_cifar10(config.data_path, config.batch_size, target=config.device_target)
    elif config.dataset_name == "imagenet":
        ds_train = create_dataset_imagenet(config.data_path, config.batch_size)
    else:
        raise ValueError("Unsupported dataset.")
    if ds_train.get_dataset_size() == 0:
        raise ValueError("Please check dataset size > 0 and batch_size <= dataset size")

    network = AlexNet(config.num_classes, phase='train')

    loss_scale_manager = None
    metrics = None
    step_per_epoch = ds_train.get_dataset_size() if config.sink_size == -1 else config.sink_size
    if config.dataset_name == 'cifar10':
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
        lr = Tensor(get_lr_cifar10(0, config.learning_rate, config.epoch_size, step_per_epoch))
        opt = nn.Momentum(network.trainable_params(), lr, config.momentum)
        metrics = {"Accuracy": Accuracy()}
    elif config.dataset_name == 'imagenet':
        loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")
        lr = Tensor(get_lr_imagenet(config.learning_rate, config.epoch_size, step_per_epoch))
        opt = nn.Momentum(params=get_param_groups(network),
                          learning_rate=lr,
                          momentum=config.momentum,
                          weight_decay=config.weight_decay,
                          loss_scale=config.loss_scale)

        from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager
        if config.is_dynamic_loss_scale == 1:
            loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536,
                                                         scale_factor=2,
                                                         scale_window=2000)
        else:
            loss_scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
    else:
        raise ValueError("Unsupported dataset.")

    if device_target == "Ascend":
        model = Model(network, loss_fn=loss, optimizer=opt, metrics=metrics, amp_level="O2",
                      keep_batchnorm_fp32=False, loss_scale_manager=loss_scale_manager)
    elif device_target == "GPU":
        model = Model(network, loss_fn=loss, optimizer=opt, metrics=metrics,
                      loss_scale_manager=loss_scale_manager)
    else:
        raise ValueError("Unsupported platform.")

    if device_num > 1:
        ckpt_save_dir = os.path.join(config.checkpoint_path + "_" + str(get_rank()))
    else:
        ckpt_save_dir = config.checkpoint_path

    time_cb = TimeMonitor(data_size=step_per_epoch)
    config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_steps,
                                 keep_checkpoint_max=config.keep_checkpoint_max)
    ckpoint_cb = ModelCheckpoint(prefix="checkpoint_alexnet", directory=ckpt_save_dir, config=config_ck)
    print("============== Starting Training ==============")
    model.train(config.epoch_size, ds_train, callbacks=[time_cb, ckpoint_cb, LossMonitor()],
                dataset_sink_mode=config.dataset_sink_mode, sink_size=config.sink_size)
def test_bert_performance():
    """test bert performance"""
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False)
    data_set, new_repeat_count, sink_size = me_de_train_dataset(sink_mode=True)
    version = os.getenv('VERSION', 'large')
    config = get_config(version=version)
    netwithloss = BertNetworkWithLoss(config, True)
    lr = BertLearningRate(decay_steps=sink_size * new_repeat_count,
                          learning_rate=5e-5, end_learning_rate=1e-9,
                          power=10.0, warmup_steps=0)
    decay_filter = lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()
    no_decay_filter = lambda x: 'layernorm' in x.name.lower() or 'bias' in x.name.lower()
    decay_params = list(filter(decay_filter, netwithloss.trainable_params()))
    other_params = list(filter(no_decay_filter, netwithloss.trainable_params()))
    group_params = [{'params': decay_params, 'weight_decay': 0.01},
                    {'params': other_params},
                    {'order_params': netwithloss.trainable_params()}]
    optimizer = Lamb(group_params, lr)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2**16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                     scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    params = netwithloss.trainable_params()
    for param in params:
        value = param.data
        name = param.name
        if isinstance(value, Tensor) and not value.has_init:
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info("***************** BERT param name is 1 {}".format(name))
                    param.set_data(weight_variable(value.asnumpy().shape))
                else:
                    logger.info("***************** BERT param name is 2 {}".format(name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.set_data(Tensor(np.transpose(weight_value, [1, 0])))
            else:
                logger.info("***************** BERT param name is 3 {}".format(name))
                param.set_data(weight_variable(value.asnumpy().shape))
    time_monitor_callback = TimeMonitor(sink_size)
    model.train(new_repeat_count, data_set, callbacks=[time_monitor_callback, callback],
                dataset_sink_mode=True, sink_size=sink_size)

    # assertion occurs while the loss value, overflow state or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    expect_loss_value = [11.3660, 11.3265, 11.3264]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [True, True, True]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [65536.0, 65536.0, 65536.0]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)

    epoch_mseconds = np.array(time_monitor_callback.epoch_mseconds_list)[2]
    expect_epoch_mseconds = 1400
    print("epoch mseconds: {}".format(epoch_mseconds))
    assert epoch_mseconds <= expect_epoch_mseconds + 5

    per_step_mseconds = np.array(time_monitor_callback.per_step_mseconds_list)[2]
    expect_per_step_mseconds = 14
    print("per step mseconds: {}".format(per_step_mseconds))
    assert per_step_mseconds <= expect_per_step_mseconds + 1
def _build_training_pipeline(config: GNMTConfig,
                             pre_training_dataset=None,
                             fine_tune_dataset=None,
                             test_dataset=None):
    """
    Build training pipeline.

    Args:
        config (GNMTConfig): Config of GNMT model.
        pre_training_dataset (Dataset): Pre-training dataset.
        fine_tune_dataset (Dataset): Fine-tune dataset.
        test_dataset (Dataset): Test dataset.
    """
    net_with_loss = GNMTNetworkWithLoss(config, is_training=True, use_one_hot_embeddings=True)
    net_with_loss.init_parameters_data()
    _load_checkpoint_to_net(config, net_with_loss)

    dataset = pre_training_dataset if pre_training_dataset is not None else fine_tune_dataset
    if dataset is None:
        raise ValueError("At least one of the pre-training dataset or fine-tuning dataset must be provided.")

    update_steps = config.epochs * dataset.get_dataset_size()

    lr = _get_lr(config, update_steps)
    optimizer = _get_optimizer(config, net_with_loss, lr)

    # Dynamic loss scale.
    scale_manager = DynamicLossScaleManager(init_loss_scale=config.init_loss_scale,
                                            scale_factor=config.loss_scale_factor,
                                            scale_window=config.scale_window)
    net_with_grads = GNMTTrainOneStepWithLossScaleCell(network=net_with_loss, optimizer=optimizer,
                                                       scale_update_cell=scale_manager.get_update_cell())
    net_with_grads.set_train(True)
    model = Model(net_with_grads)
    loss_monitor = LossCallBack(config)
    dataset_size = dataset.get_dataset_size()
    time_cb = TimeMonitor(data_size=dataset_size)
    ckpt_config = CheckpointConfig(save_checkpoint_steps=config.save_ckpt_steps,
                                   keep_checkpoint_max=config.keep_ckpt_max)
    rank_size = os.getenv('RANK_SIZE')
    callbacks = [time_cb, loss_monitor]
    if rank_size is not None and int(rank_size) > 1 and MultiAscend.get_rank() % 8 == 0:
        ckpt_callback = ModelCheckpoint(
            prefix=config.ckpt_prefix,
            directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
            config=ckpt_config)
        callbacks.append(ckpt_callback)
        summary_callback = SummaryCollector(summary_dir="./summary", collect_freq=50)
        callbacks.append(summary_callback)
    if rank_size is None or int(rank_size) == 1:
        ckpt_callback = ModelCheckpoint(
            prefix=config.ckpt_prefix,
            directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
            config=ckpt_config)
        callbacks.append(ckpt_callback)
        summary_callback = SummaryCollector(summary_dir="./summary", collect_freq=50)
        callbacks.append(summary_callback)

    print(" | ALL SET, PREPARE TO TRAIN.")
    _train(model=model, config=config,
           pre_training_dataset=pre_training_dataset,
           fine_tune_dataset=fine_tune_dataset,
           test_dataset=test_dataset,
           callbacks=callbacks)
print("dataset file {} not exists, please check!".format( mindrecord_file)) raise ValueError(mindrecord_file) dataset = create_gru_dataset(epoch_count=config.num_epochs, batch_size=config.batch_size, dataset_path=mindrecord_file, rank_size=device_num, rank_id=rank) dataset_size = dataset.get_dataset_size() print("dataset size is {}".format(dataset_size)) network = Seq2Seq(config) network = GRUWithLossCell(network) lr = dynamic_lr(config, dataset_size) opt = Adam(network.trainable_params(), learning_rate=lr) scale_manager = DynamicLossScaleManager( init_loss_scale=config.init_loss_scale_value, scale_factor=config.scale_factor, scale_window=config.scale_window) update_cell = scale_manager.get_update_cell() netwithgrads = GRUTrainOneStepWithLossScaleCell(network, opt, update_cell) time_cb = TimeMonitor(data_size=dataset_size) loss_cb = LossCallBack(rank_id=rank) cb = [time_cb, loss_cb] #Save Checkpoint if config.save_checkpoint: ckpt_config = CheckpointConfig( save_checkpoint_steps=config.ckpt_epoch * dataset_size, keep_checkpoint_max=config.keep_checkpoint_max) save_ckpt_path = os.path.join(args.outputs_dir, 'ckpt_' + str(args.rank_id) + '/') ckpt_cb = ModelCheckpoint(config=ckpt_config,
def train(cloud_args=None):
    """training process"""
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                        device_target=args.device_target, save_graphs=False)
    if args.device_target == 'Ascend':
        devid = int(os.getenv('DEVICE_ID'))
        context.set_context(device_id=devid)

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

    if args.is_dynamic_loss_scale == 1:
        args.loss_scale = 1  # with dynamic loss scale, the loss scale cannot be set in the Momentum opt

    # select whether only the master rank saves ckpt or all ranks save, compatible with model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(args.ckpt_path,
                                    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)

    if args.net == "densenet100":
        from src.network.densenet import DenseNet100 as DenseNet
    else:
        from src.network.densenet import DenseNet121 as DenseNet
    if args.dataset == "cifar10":
        from src.datasets import classification_dataset_cifar10 as classification_dataset
    else:
        from src.datasets import classification_dataset_imagenet as classification_dataset

    # dataloader
    de_dataset = classification_dataset(args.data_dir, args.image_size, args.per_batch_size,
                                        args.max_epoch, args.rank, args.group_size)
    de_dataset.map_model = 4
    args.steps_per_epoch = de_dataset.get_dataset_size()
    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = DenseNet(args.num_classes)

    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    criterion = CrossEntropy(smooth_factor=args.label_smooth_factor, num_classes=args.num_classes)

    # load pretrain model
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(args.pretrained))

    # lr scheduler
    lr_scheduler = get_lr_scheduler(args)
    lr_schedule = lr_scheduler.get_lr()

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr_schedule),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    # mixed precision training
    criterion.add_flags_recursive(fp32=True)

    # package training process, adjust lr + forward + backward + optimizer
    train_net = BuildTrainNetwork(network, criterion)
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
    else:
        parallel_mode = ParallelMode.STAND_ALONE
    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536,
                                                     scale_factor=2,
                                                     scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)
    context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=args.group_size,
                                      gradients_mean=True)
    if args.device_target == 'Ascend':
        model = Model(train_net, optimizer=opt, metrics=None,
                      loss_scale_manager=loss_scale_manager, amp_level="O3")
    elif args.device_target == 'GPU':
        model = Model(train_net, optimizer=opt, metrics=None,
                      loss_scale_manager=loss_scale_manager, amp_level="O0")
    elif args.device_target == 'CPU':
        model = Model(train_net, optimizer=opt, metrics=None,
                      loss_scale_manager=loss_scale_manager, amp_level="O0")
    else:
        raise ValueError("Unsupported device target.")

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [progress_cb,]
    if args.rank_save_ckpt_flag:
        ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
                                       keep_checkpoint_max=ckpt_max_num)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, de_dataset, callbacks=callbacks)
    if not config.use_label_smooth:
        config.label_smooth_factor = 0.0
    loss = CrossEntropySmooth(sparse=True, reduction="mean",
                              smooth_factor=config.label_smooth_factor,
                              num_classes=config.class_num)
else:
    loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction="mean")

if (args_opt.net == "resnet101" or args_opt.net == "resnet50") and \
        not args_opt.parameter_server and target != "CPU":
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   lr, config.momentum, config.weight_decay, config.loss_scale)
    from mindspore.train.loss_scale_manager import DynamicLossScaleManager
    loss_scale = DynamicLossScaleManager()
    model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale,
                  metrics={'acc'}, amp_level=amp_level, keep_batchnorm_fp32=False)
else:
    ## fp32 training
    opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()),
                   lr, config.momentum, config.weight_decay)
    model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})

# define callbacks
    args.logger.info('load model {} success'.format(args.pretrained))
else:
    init_net(args, network_1)

train_net = BuildTrainNetwork(network_1, criterion_1, args)
args.logger.info('args:{}'.format(args))

# warmup_step_list must be called after args.steps_per_epoch has been set
args.lrs = warmup_step_list(args, gamma=0.1)
lrs_gen = list_to_gen(args.lrs)
opt = Momentum(params=train_net.trainable_params(), learning_rate=lrs_gen,
               momentum=momentum, weight_decay=weight_decay)
scale_manager = DynamicLossScaleManager(init_loss_scale=args.dynamic_init_loss_scale,
                                        scale_factor=2,
                                        scale_window=2000)
model = Model(train_net, optimizer=opt, metrics=None, loss_scale_manager=scale_manager)

save_checkpoint_steps = args.ckpt_steps
args.logger.info('save_checkpoint_steps:{}'.format(save_checkpoint_steps))
if args.max_ckpts == -1:
    keep_checkpoint_max = int(args.steps_per_epoch * args.max_epoch / save_checkpoint_steps) + 5  # keep 5 extra
else:
    keep_checkpoint_max = args.max_ckpts
args.logger.info('keep_checkpoint_max:{}'.format(keep_checkpoint_max))
ckpt_config = CheckpointConfig(save_checkpoint_steps=save_checkpoint_steps,
def test_bert_tdt():
    """test bert tdt"""
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False)
    context.set_context(enable_loop_sink=True)
    context.set_context(enable_mem_reuse=True)
    ds = me_de_train_dataset()
    version = os.getenv('VERSION', 'large')
    batch_size = int(os.getenv('BATCH_SIZE', '16'))
    config = get_config(version=version, batch_size=batch_size)
    netwithloss = BertNetworkWithLoss(config, True)
    optimizer = Momentum(netwithloss.trainable_params(), learning_rate=2e-5, momentum=0.9)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2**16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                     scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    params = netwithloss.trainable_params()
    for param in params:
        param.init_data()
        value = param.default_input
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info("***************** BERT param name is 1 {}".format(name))
                    param.default_input = weight_variable(value.asnumpy().shape)
                else:
                    logger.info("***************** BERT param name is 2 {}".format(name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.default_input = Tensor(np.transpose(weight_value, [1, 0]))
            else:
                logger.info("***************** BERT param name is 3 {}".format(name))
                param.default_input = weight_variable(value.asnumpy().shape)
    model.train(ds.get_repeat_count(), ds, callbacks=callback, dataset_sink_mode=False)

    # assertion occurs while the loss value, overflow state or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    expect_loss_value = [12.1918125, 11.966035, 11.972114, 11.982188, 11.974092,
                         12.610916, 12.17565, 12.840416, 12.40291, 12.621661]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0.00001, 0.00001)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [True, True, False, False, False, True, False, False, False, True]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [32768.0, 16384.0, 16384.0, 16384.0, 32768.0,
                         16384.0, 16384.0, 16384.0, 32768.0, 16384.0]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0.00001, 0.00001)
def train(cloud_args=None):
    """training process"""
    args = parse_args(cloud_args)

    # init distributed
    if args.is_distributed:
        init()
        args.rank = get_rank()
        args.group_size = get_group_size()

    if args.is_dynamic_loss_scale == 1:
        args.loss_scale = 1  # with dynamic loss scale, the loss scale cannot be set in the Momentum opt

    # select whether only the master rank saves ckpt or all ranks save, compatible with model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(args.ckpt_path,
                                    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)

    # dataloader
    de_dataset = classification_dataset(args.data_dir, args.image_size,
                                        args.per_batch_size, args.max_epoch,
                                        args.rank, args.group_size)
    de_dataset.map_model = 4  # !!!important
    args.steps_per_epoch = de_dataset.get_dataset_size()
    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = get_network(args.backbone, args.num_classes)
    if network is None:
        raise NotImplementedError('not implement {}'.format(args.backbone))
    network.add_flags_recursive(fp16=True)

    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    criterion = CrossEntropy(smooth_factor=args.label_smooth_factor, num_classes=args.num_classes)

    # load pretrain model
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(args.pretrained))

    # lr scheduler
    if args.lr_scheduler == 'exponential':
        lr = warmup_step_lr(args.lr,
                            args.lr_epochs,
                            args.steps_per_epoch,
                            args.warmup_epochs,
                            args.max_epoch,
                            gamma=args.lr_gamma,
                            )
    elif args.lr_scheduler == 'cosine_annealing':
        lr = warmup_cosine_annealing_lr(args.lr,
                                        args.steps_per_epoch,
                                        args.warmup_epochs,
                                        args.max_epoch,
                                        args.T_max,
                                        args.eta_min)
    else:
        raise NotImplementedError(args.lr_scheduler)

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    criterion.add_flags_recursive(fp32=True)

    # package training process, adjust lr + forward + backward + optimizer
    train_net = BuildTrainNetwork(network, criterion)
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
    else:
        parallel_mode = ParallelMode.STAND_ALONE
    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536,
                                                     scale_factor=2,
                                                     scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)

    # Model api changed since TR5_branch 2020/03/09
    context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=args.group_size,
                                      parameter_broadcast=True, mirror_mean=True)
    model = Model(train_net, optimizer=opt, metrics=None, loss_scale_manager=loss_scale_manager)

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [progress_cb,]
    if args.rank_save_ckpt_flag:
        ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
                                       keep_checkpoint_max=ckpt_max_num)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, de_dataset, callbacks=callbacks, dataset_sink_mode=True)
def train(cloud_args=None):
    """training process"""
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                        device_target=args.platform, save_graphs=False)
    if os.getenv('DEVICE_ID', "not_set").isdigit():
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    # init distributed
    if args.is_distributed:
        if args.platform == "Ascend":
            init()
        else:
            init("nccl")
        args.rank = get_rank()
        args.group_size = get_group_size()
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=args.group_size,
                                          parameter_broadcast=True, mirror_mean=True)
    else:
        args.rank = 0
        args.group_size = 1

    if args.is_dynamic_loss_scale == 1:
        args.loss_scale = 1  # with dynamic loss scale, the loss scale cannot be set in the Momentum opt

    # select whether only the master rank saves ckpt or all ranks save, compatible with model parallel
    args.rank_save_ckpt_flag = 0
    if args.is_save_on_master:
        if args.rank == 0:
            args.rank_save_ckpt_flag = 1
    else:
        args.rank_save_ckpt_flag = 1

    # logger
    args.outputs_dir = os.path.join(args.ckpt_path,
                                    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    args.logger = get_logger(args.outputs_dir, args.rank)

    # dataloader
    de_dataset = classification_dataset(args.data_dir, args.image_size, args.per_batch_size,
                                        1, args.rank, args.group_size, num_parallel_workers=8)
    de_dataset.map_model = 4  # !!!important
    args.steps_per_epoch = de_dataset.get_dataset_size()
    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = get_network(args.backbone, args.num_classes, platform=args.platform)
    if network is None:
        raise NotImplementedError('not implement {}'.format(args.backbone))

    # load pretrain model
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(args.pretrained))

    # lr scheduler
    if args.lr_scheduler == 'exponential':
        lr = warmup_step_lr(args.lr,
                            args.lr_epochs,
                            args.steps_per_epoch,
                            args.warmup_epochs,
                            args.max_epoch,
                            gamma=args.lr_gamma,
                            )
    elif args.lr_scheduler == 'cosine_annealing':
        lr = warmup_cosine_annealing_lr(args.lr,
                                        args.steps_per_epoch,
                                        args.warmup_epochs,
                                        args.max_epoch,
                                        args.T_max,
                                        args.eta_min)
    else:
        raise NotImplementedError(args.lr_scheduler)

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=args.momentum,
                   weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    loss = CrossEntropy(smooth_factor=args.label_smooth_factor, num_classes=args.num_classes)

    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536,
                                                     scale_factor=2,
                                                     scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)

    if args.platform == "Ascend":
        model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager,
                      metrics={'acc'}, amp_level="O3")
    else:
        auto_mixed_precision(network)
        model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager,
                      metrics={'acc'})

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [progress_cb,]
    if args.rank_save_ckpt_flag:
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
                                       keep_checkpoint_max=args.ckpt_save_max)
        ckpt_cb = ModelCheckpoint(config=ckpt_config,
                                  directory=args.outputs_dir,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, de_dataset, callbacks=callbacks, dataset_sink_mode=True)
def test_transformer():
    """
    Transformer training.
    """
    np.random.seed(1)
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    context.set_context(reserve_class_name_in_scope=False, enable_auto_mixed_precision=False)
    version = os.getenv('VERSION', 'large')
    batch_size = 96
    epoch_size = 3
    config = get_config(version=version, batch_size=batch_size)
    dataset = load_test_data(batch_size=transformer_net_cfg.batch_size, data_file=DATA_DIR)

    netwithloss = TransformerNetworkWithLoss(config, True)

    lr = Tensor(create_dynamic_lr(schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay",
                                  training_steps=dataset.get_dataset_size() * epoch_size,
                                  learning_rate=cfg.lr_schedule.learning_rate,
                                  warmup_steps=cfg.lr_schedule.warmup_steps,
                                  hidden_size=config.hidden_size), mstype.float32)
    optimizer = Adam(netwithloss.trainable_params(), lr)

    callback = ModelCallback()

    scale_manager = DynamicLossScaleManager(init_loss_scale=4194304,
                                            scale_factor=cfg.scale_factor,
                                            scale_window=3)
    update_cell = scale_manager.get_update_cell()
    netwithgrads = TransformerTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                            scale_update_cell=update_cell)
    netwithgrads.set_train(True)
    time_monitor_callback = TimeMonitor(dataset.get_dataset_size())
    model = Model(netwithgrads)
    model.train(epoch_size, dataset, callbacks=[time_monitor_callback, callback],
                dataset_sink_mode=False)

    # assertion occurs while the loss value, overflow state or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    assert np.allclose(loss_value[0], 11.241606, 0, 0.000005)
    expect_loss_value = [11.241606, 11.243232, 11.217459, 11.204157, 11.213804,
                         11.215373, 11.190564, 11.150393, 11.191823, 11.160045]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value[0:10], expect_loss_value, 0, 0.0005)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [False, False, False, True, False, False, False, True, False, False]
    print("overflow: {}".format(overflow))
    assert (overflow[0:10] == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [4194304.0, 4194304.0, 8388608.0, 4194304.0, 4194304.0,
                         4194304.0, 8388608.0, 4194304.0, 4194304.0, 4194304.0]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale[0:10], expect_loss_scale, 0, 0)

    epoch_mseconds = np.array(time_monitor_callback.epoch_mseconds_list)[2]
    expect_epoch_mseconds = 2400
    print("epoch mseconds: {}".format(epoch_mseconds))
    assert epoch_mseconds <= expect_epoch_mseconds + 20

    per_step_mseconds = np.array(time_monitor_callback.per_step_mseconds_list)[2]
    expect_per_step_mseconds = 240
    print("per step mseconds: {}".format(per_step_mseconds))
    assert per_step_mseconds <= expect_per_step_mseconds + 2
def test_bert_performance():
    """test bert performance"""
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False)
    ds, new_repeat_count = me_de_train_dataset(sink_mode=True)
    version = os.getenv('VERSION', 'large')
    batch_size = 16
    config = get_config(version=version, batch_size=batch_size)
    netwithloss = BertNetworkWithLoss(config, True)
    optimizer = Lamb(netwithloss.trainable_params(),
                     decay_steps=ds.get_dataset_size() * new_repeat_count,
                     start_learning_rate=5e-5, end_learning_rate=1e-9,
                     power=10.0, warmup_steps=0, weight_decay=0.01)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2**16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                     scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    params = netwithloss.trainable_params()
    for param in params:
        param.init_data()
        value = param.default_input
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info("***************** BERT param name is 1 {}".format(name))
                    param.default_input = weight_variable(value.asnumpy().shape)
                else:
                    logger.info("***************** BERT param name is 2 {}".format(name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.default_input = Tensor(np.transpose(weight_value, [1, 0]))
            else:
                logger.info("***************** BERT param name is 3 {}".format(name))
                param.default_input = weight_variable(value.asnumpy().shape)
    time_monitor_callback = TimeMonitor(ds.get_dataset_size())
    model.train(new_repeat_count, ds, callbacks=[time_monitor_callback, callback],
                dataset_sink_mode=True)

    # assertion occurs while the loss value, overflow state or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    expect_loss_value = [10.235566, 10.207392, 10.206976]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [True, True, True]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [262144.0, 262144.0, 262144.0]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)

    epoch_mseconds = np.array(time_monitor_callback.epoch_mseconds_list)[2]
    expect_epoch_mseconds = 1600
    print("epoch mseconds: {}".format(epoch_mseconds))
    assert epoch_mseconds <= expect_epoch_mseconds + 5

    per_step_mseconds = np.array(time_monitor_callback.per_step_mseconds_list)[2]
    expect_per_step_mseconds = 16
    print("per step mseconds: {}".format(per_step_mseconds))
    assert per_step_mseconds <= expect_per_step_mseconds + 1
def test_bert_precision(enable_graph_kernel=False):
    """test bert precision"""
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False)
    if enable_graph_kernel:
        context.set_context(enable_graph_kernel=True)
    data_set, new_repeat_count, _ = me_de_train_dataset()
    version = os.getenv('VERSION', 'large')
    config = get_config(version=version)
    netwithloss = BertNetworkWithLoss(config, True)
    lr = BertLearningRate(decay_steps=data_set.get_dataset_size() * new_repeat_count,
                          learning_rate=5e-5, end_learning_rate=1e-9,
                          power=10.0, warmup_steps=0)
    decay_filter = lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()
    no_decay_filter = lambda x: 'layernorm' in x.name.lower() or 'bias' in x.name.lower()
    decay_params = list(filter(decay_filter, netwithloss.trainable_params()))
    other_params = list(filter(no_decay_filter, netwithloss.trainable_params()))
    group_params = [{'params': decay_params, 'weight_decay': 0.01},
                    {'params': other_params},
                    {'order_params': netwithloss.trainable_params()}]
    optimizer = Lamb(group_params, lr)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2 ** 16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                     scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    params = netwithloss.trainable_params()
    for param in params:
        value = param.data
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info("***************** BERT param name is 1 {}".format(name))
                    param.set_data(weight_variable(value.asnumpy().shape))
                else:
                    logger.info("***************** BERT param name is 2 {}".format(name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.set_data(Tensor(np.transpose(weight_value, [1, 0])))
            else:
                logger.info("***************** BERT param name is 3 {}".format(name))
                param.set_data(weight_variable(value.asnumpy().shape))
    model.train(new_repeat_count, data_set, callbacks=callback, dataset_sink_mode=False)

    # assertion occurs while the loss value, overflow state or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    if enable_graph_kernel:
        expect_loss_value = [12.206627, 11.840489, 11.798470, 11.796345, 11.790964,
                             12.366766, 11.971539, 12.576565, 12.185522, 12.386192]
    else:
        assert np.allclose(loss_value[0], 12.2066, 0, 0.0005)
        expect_loss_value = [12.206587, 11.966410, 11.965916, 11.975922, 11.970262,
                             12.608881, 12.174048, 12.840656, 12.407923, 12.631133]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [False, False, False, True, False, False, False, True, False, False]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [65536.0, 65536.0, 131072.0, 65536.0, 65536.0,
                         65536.0, 131072.0, 65536.0, 65536.0, 65536.0]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)
def train(args):
    '''train'''
    print('=============yolov3 start training==================')

    # init distributed
    if args.world_size != 1:
        init()
        args.local_rank = get_rank()
        args.world_size = get_group_size()

    args.batch_size = config.batch_size
    args.warmup_lr = config.warmup_lr
    args.lr_rates = config.lr_rates
    args.lr_steps = config.lr_steps
    args.gamma = config.gamma
    args.weight_decay = config.weight_decay
    args.momentum = config.momentum
    args.max_epoch = config.max_epoch
    args.log_interval = config.log_interval
    args.ckpt_path = config.ckpt_path
    args.ckpt_interval = config.ckpt_interval

    args.outputs_dir = os.path.join(args.ckpt_path,
                                    datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S'))
    print('args.outputs_dir', args.outputs_dir)
    args.logger = get_logger(args.outputs_dir, args.local_rank)

    if args.world_size != 8:
        args.lr_steps = [i * 8 // args.world_size for i in args.lr_steps]
    if args.world_size == 1:
        args.weight_decay = 0.

    if args.world_size != 1:
        parallel_mode = ParallelMode.DATA_PARALLEL
    else:
        parallel_mode = ParallelMode.STAND_ALONE
    context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                      device_num=args.world_size, gradients_mean=True)

    mindrecord_path = args.mindrecord_path
    num_classes = config.num_classes
    anchors = config.anchors
    anchors_mask = config.anchors_mask
    num_anchors_list = [len(x) for x in anchors_mask]

    momentum = args.momentum
    args.logger.info('train opt momentum:{}'.format(momentum))
    weight_decay = args.weight_decay * float(args.batch_size)
    args.logger.info('real weight_decay:{}'.format(weight_decay))
    lr_scale = args.world_size / 8
    args.logger.info('lr_scale:{}'.format(lr_scale))

    # dataloader
    args.logger.info('start create dataloader')
    epoch = args.max_epoch
    ds = de.MindDataset(mindrecord_path + "0", columns_list=["image", "annotation"],
                        num_shards=args.world_size, shard_id=args.local_rank)
    target_columns = ["image", "annotation",
                      'coord_mask_0', 'conf_pos_mask_0', 'conf_neg_mask_0', 'cls_mask_0',
                      't_coord_0', 't_conf_0', 't_cls_0', 'gt_list_0',
                      'coord_mask_1', 'conf_pos_mask_1', 'conf_neg_mask_1', 'cls_mask_1',
                      't_coord_1', 't_conf_1', 't_cls_1', 'gt_list_1',
                      'coord_mask_2', 'conf_pos_mask_2', 'conf_neg_mask_2', 'cls_mask_2',
                      't_coord_2', 't_conf_2', 't_cls_2', 'gt_list_2']
    ds = ds.map(input_columns=["image", "annotation"],
                output_columns=target_columns,
                column_order=target_columns,
                operations=compose_map_func,
                num_parallel_workers=16,
                python_multiprocessing=True)
    ds = ds.batch(args.batch_size, drop_remainder=True, num_parallel_workers=8)
    args.steps_per_epoch = ds.get_dataset_size()
    lr = warmup_step_new(args, lr_scale=lr_scale)
    ds = ds.repeat(epoch)

    args.logger.info('args.steps_per_epoch:{}'.format(args.steps_per_epoch))
    args.logger.info('args.world_size:{}'.format(args.world_size))
    args.logger.info('args.local_rank:{}'.format(args.local_rank))
    args.logger.info('end create dataloader')
    args.logger.save_args(args)
    args.logger.important_info('start create network')
    create_network_start = time.time()

    # backbone and loss
    network = backbone_HwYolov3(num_classes, num_anchors_list, args)
    criterion0 = YoloLoss(num_classes, anchors, anchors_mask[0], 64, 0, head_idx=0.0)
    criterion1 = YoloLoss(num_classes, anchors, anchors_mask[1], 32, 0, head_idx=1.0)
    criterion2 = YoloLoss(num_classes, anchors, anchors_mask[2], 16, 0, head_idx=2.0)

    # load pretrained model
    if os.path.isfile(args.pretrained):
        param_dict = load_checkpoint(args.pretrained)
        param_dict_new = {}
        for key, values in param_dict.items():
            if key.startswith('moments.'):
                continue
            elif key.startswith('network.'):
                param_dict_new[key[8:]] = values
            else:
                param_dict_new[key] = values
        load_param_into_net(network, param_dict_new)
        args.logger.info('load model {} success'.format(args.pretrained))

    train_net = BuildTrainNetworkV2(network, criterion0, criterion1, criterion2, args)

    # optimizer
    opt = Momentum(params=train_net.trainable_params(), learning_rate=Tensor(lr),
                   momentum=momentum, weight_decay=weight_decay)

    # package training process
    train_net = TrainOneStepWithLossScaleCell(train_net, opt)
    train_net.set_broadcast_flag()

    # checkpoint
    ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
    train_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
                                    keep_checkpoint_max=ckpt_max_num)
    ckpt_cb = ModelCheckpoint(config=train_config, directory=args.outputs_dir,
                              prefix='{}'.format(args.local_rank))
    cb_params = _InternalCallbackParam()
    cb_params.train_network = train_net
    cb_params.epoch_num = ckpt_max_num
    cb_params.cur_epoch_num = 1
    run_context = RunContext(cb_params)
    ckpt_cb.begin(run_context)

    train_net.set_train()
    t_end = time.time()
    t_epoch = time.time()
    old_progress = -1
    i = 0
    scale_manager = DynamicLossScaleManager(init_loss_scale=2**10, scale_factor=2,
                                            scale_window=2000)

    for data in ds.create_tuple_iterator(output_numpy=True):
        batch_images = data[0]
        batch_labels = data[1]
        # data[2:] holds the 24 per-head mask/target/gt columns produced by the map above
        input_list = [Tensor(batch_images, mstype.float32)]
        for item in data[2:]:
            input_list.append(Tensor(item.astype(np.float32)))

        scaling_sens = Tensor(scale_manager.get_loss_scale(), dtype=mstype.float32)
        loss0, overflow, _ = train_net(*input_list, scaling_sens)
        overflow = np.all(overflow.asnumpy())
        scale_manager.update_loss_scale(overflow)
        args.logger.info('rank[{}], iter[{}], loss[{}], overflow:{}, loss_scale:{}, lr:{}, '
                         'batch_images:{}, batch_labels:{}'.format(
                             args.local_rank, i, loss0, overflow, scaling_sens, lr[i],
                             batch_images.shape, batch_labels.shape))

        # save ckpt
        cb_params.cur_step_num = i + 1  # current step number
        cb_params.batch_num = i + 2
        if args.local_rank == 0:
            ckpt_cb.step_end(run_context)

        # save log
        if i == 0:
            time_for_graph_compile = time.time() - create_network_start
            args.logger.important_info('Yolov3, graph compile time={:.2f}s'.format(time_for_graph_compile))
        if i % args.steps_per_epoch == 0:
            cb_params.cur_epoch_num += 1
        if i % args.log_interval == 0 and args.local_rank == 0:
            time_used = time.time() - t_end
            epoch = int(i / args.steps_per_epoch)
            fps = args.batch_size * (i - old_progress) * args.world_size / time_used
            args.logger.info('epoch[{}], iter[{}], loss:[{}], {:.2f} imgs/sec'.format(epoch, i, loss0, fps))
            t_end = time.time()
            old_progress = i
        if i % args.steps_per_epoch == 0 and args.local_rank == 0:
            epoch_time_used = time.time() - t_epoch
            epoch = int(i / args.steps_per_epoch)
            fps = args.batch_size * args.world_size * args.steps_per_epoch / epoch_time_used
            args.logger.info('=================================================')
            args.logger.info('epoch time: epoch[{}], iter[{}], {:.2f} imgs/sec'.format(epoch, i, fps))
            args.logger.info('=================================================')
            t_epoch = time.time()
        i = i + 1

    args.logger.info('=============yolov3 training finished==================')
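# The manual loop above drives DynamicLossScaleManager by hand: query the scale,
# run one step, then report whether the step overflowed. A pure-Python toy model
# of that policy (halve on overflow, double after `window` clean steps), to make
# the behavior concrete; the real manager may clamp or reset slightly differently:
class ToyLossScaler:
    """Toy model of the dynamic loss-scale policy used above."""
    def __init__(self, init_scale=2**10, factor=2, window=2000):
        self.scale, self.factor, self.window = float(init_scale), factor, window
        self.good_steps = 0

    def update(self, overflow):
        if overflow:
            self.scale = max(self.scale / self.factor, 1.0)
            self.good_steps = 0
        else:
            self.good_steps += 1
            if self.good_steps % self.window == 0:
                self.scale *= self.factor

scaler = ToyLossScaler(window=3)
for ov in [False, False, False, True, False]:
    scaler.update(ov)
print(scaler.scale)  # 1024.0: doubled to 2048 after 3 clean steps, halved back on overflow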
def run_train():
    '''run train function.'''
    config.local_rank = get_rank_id()
    config.world_size = get_device_num()

    log_path = os.path.join(config.ckpt_path, 'logs')
    config.logger = get_logger(log_path, config.local_rank)

    support_train_stage = ['base', 'beta']
    if config.train_stage.lower() not in support_train_stage:
        config.logger.info('your train stage is not supported.')
        raise ValueError('train stage not supported.')

    if not os.path.exists(config.data_dir):
        config.logger.info('ERROR, data_dir does not exist, please set data_dir in config.py')
        raise ValueError('ERROR, data_dir does not exist, please set data_dir in config.py')

    parallel_mode = ParallelMode.HYBRID_PARALLEL if config.is_distributed else ParallelMode.STAND_ALONE
    context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                      device_num=config.world_size, gradients_mean=True)
    if config.is_distributed:
        init()

    if config.local_rank % 8 == 0:
        if not os.path.exists(config.ckpt_path):
            os.makedirs(config.ckpt_path)

    de_dataset, steps_per_epoch, num_classes = get_de_dataset(config)
    config.logger.info('de_dataset: %d', de_dataset.get_dataset_size())
    config.steps_per_epoch = steps_per_epoch
    config.num_classes = num_classes
    config.lr_epochs = list(map(int, config.lr_epochs.split(',')))
    config.logger.info('config.num_classes: %d', config.num_classes)
    config.logger.info('config.world_size: %d', config.world_size)
    config.logger.info('config.local_rank: %d', config.local_rank)
    config.logger.info('config.lr: %f', config.lr)

    # pad the class count up to a multiple of 16 (or world_size * 16 under model parallel)
    if config.nc_16 == 1:
        if config.model_parallel == 0:
            if config.num_classes % 16 == 0:
                config.logger.info('data parallel already 16, nums: %d', config.num_classes)
            else:
                config.num_classes = (config.num_classes // 16 + 1) * 16
        else:
            if config.num_classes % (config.world_size * 16) == 0:
                config.logger.info('model parallel already 16, nums: %d', config.num_classes)
            else:
                config.num_classes = (config.num_classes // (config.world_size * 16) + 1) * config.world_size * 16

    config.logger.info('for D, loaded, class nums: %d', config.num_classes)
    config.logger.info('steps_per_epoch: %d', config.steps_per_epoch)
    config.logger.info('img_total_num: %d', config.steps_per_epoch * config.per_batch_size)

    config.logger.info('get_backbone----in----')
    _backbone = get_backbone(config)
    config.logger.info('get_backbone----out----')
    config.logger.info('get_metric_fc----in----')
    margin_fc_1 = get_metric_fc(config)
    config.logger.info('get_metric_fc----out----')
    config.logger.info('DistributedHelper----in----')
    network_1 = DistributedHelper(_backbone, margin_fc_1)
    config.logger.info('DistributedHelper----out----')
    config.logger.info('network fp16----in----')
    if config.fp16 == 1:
        network_1.add_flags_recursive(fp16=True)
    config.logger.info('network fp16----out----')

    criterion_1 = get_loss(config)
    if config.fp16 == 1 and config.model_parallel == 0:
        criterion_1.add_flags_recursive(fp32=True)

    network_1 = load_pretrain(config, network_1)
    train_net = BuildTrainNetwork(network_1, criterion_1, config)

    # warmup_step_list must be called after config.steps_per_epoch has been set
    config.lrs = warmup_step_list(config, gamma=0.1)
    lrs_gen = list_to_gen(config.lrs)
    opt = Momentum(params=train_net.trainable_params(), learning_rate=lrs_gen,
                   momentum=config.momentum, weight_decay=config.weight_decay)
    scale_manager = DynamicLossScaleManager(init_loss_scale=config.dynamic_init_loss_scale,
                                            scale_factor=2, scale_window=2000)
    model = Model(train_net, optimizer=opt, metrics=None, loss_scale_manager=scale_manager)

    save_checkpoint_steps = config.ckpt_steps
    config.logger.info('save_checkpoint_steps: %d', save_checkpoint_steps)
    if config.max_ckpts == -1:
        keep_checkpoint_max = int(config.steps_per_epoch * config.max_epoch / save_checkpoint_steps) + 5
    else:
        keep_checkpoint_max = config.max_ckpts
    config.logger.info('keep_checkpoint_max: %d', keep_checkpoint_max)
    ckpt_config = CheckpointConfig(save_checkpoint_steps=save_checkpoint_steps,
                                   keep_checkpoint_max=keep_checkpoint_max)
    config.logger.info('max_epoch_train: %d', config.max_epoch)
    ckpt_cb = ModelCheckpoint(config=ckpt_config, directory=config.ckpt_path,
                              prefix='{}'.format(config.local_rank))
    config.epoch_cnt = 0
    progress_cb = ProgressMonitor(config)
    new_epoch_train = config.max_epoch * steps_per_epoch // config.log_interval
    model.train(new_epoch_train, de_dataset, callbacks=[progress_cb, ckpt_cb],
                sink_size=config.log_interval)
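# The nc_16 branch above pads the class count so the final fully-connected layer
# splits evenly across devices. The same arithmetic as a small standalone helper
# (the function name is ours, introduced only for illustration):
def pad_class_num(num_classes, world_size=1, model_parallel=False):
    """Round num_classes up to a multiple of 16, or of world_size * 16
    under model parallelism, mirroring the nc_16 branch above."""
    unit = world_size * 16 if model_parallel else 16
    if num_classes % unit == 0:
        return num_classes
    return (num_classes // unit + 1) * unit

assert pad_class_num(1000) == 1008            # 16-aligned for data parallel
assert pad_class_num(1000, 8, True) == 1024   # 128-aligned across 8 devices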
def train(args):
    '''train'''
    print('=============yolov3 start training==================')
    devid = int(os.getenv('DEVICE_ID', '0')) if args.run_platform != 'CPU' else 0
    context.set_context(mode=context.GRAPH_MODE, device_target=args.run_platform,
                        save_graphs=False, device_id=devid)

    # init distributed
    if args.world_size != 1:
        init()
        args.local_rank = get_rank()
        args.world_size = get_group_size()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                          device_num=args.world_size, gradients_mean=True)
    args.logger = get_logger(args.outputs_dir, args.local_rank)

    # dataloader
    ds = create_dataset(args)

    args.logger.important_info('start create network')
    create_network_start = time.time()
    train_net = define_network(args)

    # checkpoint
    ckpt_max_num = args.max_epoch * args.steps_per_epoch // args.ckpt_interval
    train_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval,
                                    keep_checkpoint_max=ckpt_max_num)
    ckpt_cb = ModelCheckpoint(config=train_config, directory=args.outputs_dir,
                              prefix='{}'.format(args.local_rank))
    cb_params = _InternalCallbackParam()
    cb_params.train_network = train_net
    cb_params.epoch_num = ckpt_max_num
    cb_params.cur_epoch_num = 1
    run_context = RunContext(cb_params)
    ckpt_cb.begin(run_context)

    train_net.set_train()
    t_end = time.time()
    t_epoch = time.time()
    old_progress = -1
    i = 0
    if args.use_loss_scale:
        scale_manager = DynamicLossScaleManager(init_loss_scale=2**10, scale_factor=2,
                                                scale_window=2000)

    for data in ds.create_tuple_iterator(output_numpy=True):
        batch_images = data[0]
        batch_labels = data[1]
        input_list = [Tensor(batch_images, mstype.float32)]
        for idx in range(2, 26):
            input_list.append(Tensor(data[idx], mstype.float32))

        if args.use_loss_scale:
            scaling_sens = Tensor(scale_manager.get_loss_scale(), dtype=mstype.float32)
            loss0, overflow, _ = train_net(*input_list, scaling_sens)
            overflow = np.all(overflow.asnumpy())
            scale_manager.update_loss_scale(overflow)
            args.logger.info('rank[{}], iter[{}], loss[{}], overflow:{}, loss_scale:{}, lr:{}, '
                             'batch_images:{}, batch_labels:{}'.format(
                                 args.local_rank, i, loss0, overflow, scaling_sens, args.lr[i],
                                 batch_images.shape, batch_labels.shape))
        else:
            loss0 = train_net(*input_list)
            args.logger.info('rank[{}], iter[{}], loss[{}], lr:{}, batch_images:{}, '
                             'batch_labels:{}'.format(args.local_rank, i, loss0, args.lr[i],
                                                      batch_images.shape, batch_labels.shape))

        # save ckpt
        cb_params.cur_step_num = i + 1  # current step number
        cb_params.batch_num = i + 2
        if args.local_rank == 0:
            ckpt_cb.step_end(run_context)

        # save log
        if i == 0:
            time_for_graph_compile = time.time() - create_network_start
            args.logger.important_info('Yolov3, graph compile time={:.2f}s'.format(time_for_graph_compile))
        if i % args.steps_per_epoch == 0:
            cb_params.cur_epoch_num += 1
        if i % args.log_interval == 0 and args.local_rank == 0:
            time_used = time.time() - t_end
            epoch = int(i / args.steps_per_epoch)
            fps = args.batch_size * (i - old_progress) * args.world_size / time_used
            args.logger.info('epoch[{}], iter[{}], loss:[{}], {:.2f} imgs/sec'.format(epoch, i, loss0, fps))
            t_end = time.time()
            old_progress = i
        if i % args.steps_per_epoch == 0 and args.local_rank == 0:
            epoch_time_used = time.time() - t_epoch
            epoch = int(i / args.steps_per_epoch)
            fps = args.batch_size * args.world_size * args.steps_per_epoch / epoch_time_used
            args.logger.info('=================================================')
            args.logger.info('epoch time: epoch[{}], iter[{}], {:.2f} imgs/sec'.format(epoch, i, fps))
            args.logger.info('=================================================')
            t_epoch = time.time()
        i = i + 1

    args.logger.info('=============yolov3 training finished==================')
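# Both logging branches above report throughput as global images per second:
# per-device batch size times steps, multiplied by the number of ranks, divided
# by elapsed wall-clock time. Distilled into a tiny helper (name is ours):
def imgs_per_sec(batch_size, steps, world_size, seconds):
    """Throughput formula used in the fps log lines above."""
    return batch_size * steps * world_size / seconds

print(imgs_per_sec(32, 100, 8, 40.0))  # 640.0 imgs/sec for 100 steps on 8 devices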
def train(cloud_args=None):
    """training process"""
    args = parse_args(cloud_args)
    context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                        device_target=args.platform, save_graphs=False)
    if os.getenv('DEVICE_ID', "not_set").isdigit():
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    # init distributed
    if args.is_distributed:
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode,
                                          device_num=args.group_size, gradients_mean=True)

    # dataloader
    de_dataset = classification_dataset(args.data_dir, args.image_size,
                                        args.per_batch_size, 1, args.rank,
                                        args.group_size, num_parallel_workers=8)
    de_dataset.map_model = 4  # !!!important
    args.steps_per_epoch = de_dataset.get_dataset_size()

    args.logger.save_args(args)

    # network
    args.logger.important_info('start create network')
    # get network and init
    network = get_network(args.backbone, num_classes=args.num_classes, platform=args.platform)
    if network is None:
        raise NotImplementedError('not implement {}'.format(args.backbone))

    load_pretrain_model(args.pretrained, network, args)

    # lr scheduler
    lr = get_lr(args)

    # optimizer
    opt = Momentum(params=get_param_groups(network), learning_rate=Tensor(lr),
                   momentum=args.momentum, weight_decay=args.weight_decay,
                   loss_scale=args.loss_scale)

    # loss
    if not args.label_smooth:
        args.label_smooth_factor = 0.0
    loss = CrossEntropy(smooth_factor=args.label_smooth_factor, num_classes=args.num_classes)

    if args.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536,
                                                     scale_factor=2, scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(args.loss_scale, drop_overflow_update=False)

    model = Model(network, loss_fn=loss, optimizer=opt,
                  loss_scale_manager=loss_scale_manager, metrics={'acc'}, amp_level="O3")

    # checkpoint save
    progress_cb = ProgressMonitor(args)
    callbacks = [progress_cb,]
    if args.rank_save_ckpt_flag:
        ckpt_config = CheckpointConfig(save_checkpoint_steps=args.ckpt_interval * args.steps_per_epoch,
                                       keep_checkpoint_max=args.ckpt_save_max)
        save_ckpt_path = os.path.join(args.outputs_dir, 'ckpt_' + str(args.rank) + '/')
        ckpt_cb = ModelCheckpoint(config=ckpt_config, directory=save_ckpt_path,
                                  prefix='{}'.format(args.rank))
        callbacks.append(ckpt_cb)

    model.train(args.max_epoch, de_dataset, callbacks=callbacks, dataset_sink_mode=True)
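# Note the pairing in the fixed-scale branch above: with drop_overflow_update=False
# the manager never skips optimizer steps, so the optimizer itself must un-scale
# gradients via the same loss_scale value. A minimal standalone sketch of that
# pairing (the Dense net and the numeric values are placeholders for illustration;
# the import path matches the older MindSpore API used in this file):
import mindspore.nn as nn
from mindspore.train.loss_scale_manager import FixedLossScaleManager

toy_net = nn.Dense(16, 10)   # stand-in network for the sketch
fixed_scale = 1024.0         # placeholder value
opt_sketch = nn.Momentum(toy_net.trainable_params(), learning_rate=0.1,
                         momentum=0.9, loss_scale=fixed_scale)
manager_sketch = FixedLossScaleManager(fixed_scale, drop_overflow_update=False)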
def test_bert_tdt():
    """test bert tdt"""
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
                        reserve_class_name_in_scope=False)
    context.set_context(enable_task_sink=True)
    context.set_context(enable_loop_sink=True)
    context.set_context(enable_mem_reuse=True)
    ds = me_de_train_dataset()
    version = os.getenv('VERSION', 'large')
    batch_size = int(os.getenv('BATCH_SIZE', '16'))
    config = get_config(version=version, batch_size=batch_size)
    netwithloss = BertNetworkWithLoss(config, True)
    optimizer = Momentum(netwithloss.trainable_params(), learning_rate=2e-5, momentum=0.9)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2**32, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(
        netwithloss, optimizer=optimizer,
        scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()

    params = netwithloss.trainable_params()
    for param in params:
        value = param.default_input
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info("***************** BERT param name is 1 {}".format(name))
                    param.default_input = weight_variable(value.asnumpy().shape)
                else:
                    logger.info("***************** BERT param name is 2 {}".format(name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.default_input = Tensor(np.transpose(weight_value, [1, 0]))
            else:
                logger.info("***************** BERT param name is 3 {}".format(name))
                param.default_input = weight_variable(value.asnumpy().shape)

    model.train(ds.get_repeat_count(), ds, callbacks=callback, dataset_sink_mode=False)

    # the assertions fail if the loss_scale trace is wrong: an overflow must halve
    # the scale, and scale_window consecutive overflow-free steps must double it
    count = 0
    for i in range(len(callback.overflow_list)):
        if callback.overflow_list[i] == Tensor(True, mstype.bool_) and i > 0:
            count = 0
            assert callback.lossscale_list[i] == callback.lossscale_list[i - 1] * Tensor(0.5, mstype.float32)
        if callback.overflow_list[i] == Tensor(False, mstype.bool_):
            count = count + 1
            if count == scale_window:
                count = 0
                assert callback.lossscale_list[i] == callback.lossscale_list[i - 1] * Tensor(2.0, mstype.float32)
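# The assertion loop above can be read as a predicate over the recorded traces.
# The same check as a pure function on plain Python lists, useful for testing the
# policy in isolation (function name and the sample trace are ours):
def check_scale_trace(overflows, scales, window):
    """True iff every overflow halves the scale and every run of
    `window` clean steps doubles it, mirroring the loop above."""
    count = 0
    for i, overflow in enumerate(overflows):
        if overflow and i > 0:
            count = 0
            if scales[i] != scales[i - 1] * 0.5:
                return False
        if not overflow:
            count += 1
            if count == window:
                count = 0
                if scales[i] != scales[i - 1] * 2.0:
                    return False
    return True

# three clean steps after an initial overflow should double the scale at step 3
assert check_scale_trace([True, False, False, False],
                         [2.0, 2.0, 2.0, 4.0], window=3)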