def run_transformer_train():
    """
    Transformer training entry point.

    Parses CLI args, configures the Ascend graph-mode context, optionally sets
    up data-parallel distributed training, builds the dataset / network / LR
    schedule / optimizer, and runs ``Model.train`` with optional checkpointing
    and dynamic loss scaling.
    """
    parser = argparse_init()
    # parse_known_args: tolerate extra flags injected by the launcher.
    args, _ = parser.parse_known_args()
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", device_id=args.device_id)
    context.set_context(reserve_class_name_in_scope=False, enable_auto_mixed_precision=False)
    if args.distribute == "true":
        # Data-parallel run: every device gets a shard of the dataset.
        device_num = args.device_num
        context.reset_auto_parallel_context()
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True,
                                          parameter_broadcast=True, device_num=device_num)
        D.init()
        # NOTE(review): rank derived from device_id modulo device_num — assumes
        # device_id is globally unique across hosts; confirm for multi-host runs.
        rank_id = args.device_id % device_num
    else:
        device_num = 1
        rank_id = 0
    dataset = create_transformer_dataset(epoch_count=1, rank_size=device_num,
                                         rank_id=rank_id, do_shuffle=args.do_shuffle,
                                         enable_data_sink=args.enable_data_sink,
                                         dataset_path=args.data_path)
    netwithloss = TransformerNetworkWithLoss(transformer_net_cfg, True)
    if args.checkpoint_path:
        # Resume / warm-start from an existing checkpoint.
        parameter_dict = load_checkpoint(args.checkpoint_path)
        load_param_into_net(netwithloss, parameter_dict)
    # Precomputed per-step LR schedule: warmup followed by rsqrt decay.
    lr = Tensor(create_dynamic_lr(schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay",
                                  training_steps=dataset.get_dataset_size()*args.epoch_size,
                                  learning_rate=cfg.lr_schedule.learning_rate,
                                  warmup_steps=cfg.lr_schedule.warmup_steps,
                                  hidden_size=transformer_net_cfg.hidden_size,
                                  start_decay_step=cfg.lr_schedule.start_decay_step,
                                  min_lr=cfg.lr_schedule.min_lr), mstype.float32)
    optimizer = Adam(netwithloss.trainable_params(), lr)
    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack()]
    if args.enable_save_ckpt == "true":
        # Only rank 0 saves checkpoints in a distributed run.
        if device_num == 1 or (device_num > 1 and rank_id == 0):
            ckpt_config = CheckpointConfig(save_checkpoint_steps=args.save_checkpoint_steps,
                                           keep_checkpoint_max=args.save_checkpoint_num)
            ckpoint_cb = ModelCheckpoint(prefix='transformer', directory=args.save_checkpoint_path,
                                         config=ckpt_config)
            callbacks.append(ckpoint_cb)
    if args.enable_lossscale == "true":
        # Dynamic loss scaling for mixed-precision stability.
        scale_manager = DynamicLossScaleManager(init_loss_scale=cfg.init_loss_scale_value,
                                                scale_factor=cfg.scale_factor, scale_window=cfg.scale_window)
        update_cell = scale_manager.get_update_cell()
        netwithgrads = TransformerTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                                scale_update_cell=update_cell)
    else:
        netwithgrads = TransformerTrainOneStepCell(netwithloss, optimizer=optimizer)
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    model.train(args.epoch_size, dataset, callbacks=callbacks,
                dataset_sink_mode=(args.enable_data_sink == "true"))
def test_bert_tdt():
    """
    BERT TDT regression test.

    Trains a fixed-seed large BERT for one epoch and asserts the exact
    per-step loss values, overflow flags, and loss-scale trajectory.
    Any change to RNG consumption order will shift the expected values.
    """
    # Fixed seed: the expected loss values below depend on this exact
    # weight-initialization order.
    np.random.seed(0)
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False)
    context.set_context(enable_graph_kernel=True)
    ds = me_de_train_dataset()
    config = get_config(version='large', batch_size=16)
    netwithloss = BertNetworkWithLoss(config, True)
    optimizer = Lamb(netwithloss.trainable_params(),
                     decay_steps=ds.get_dataset_size()*ds.get_repeat_count(),
                     start_learning_rate=5e-5, end_learning_rate=1e-9, power=10.0,
                     warmup_steps=0, weight_decay=0.01)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(262144, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                     scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    # Re-initialize every trainable weight deterministically so the run is
    # reproducible regardless of the network's default initializers.
    params = netwithloss.trainable_params()
    for param in params:
        param.init_data()
        value = param.default_input
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info("***************** BERT param name is 1 {}".format(name))
                    param.default_input = weight_variable(value.asnumpy().shape)
                else:
                    # Dense weights: draw with transposed shape, then transpose
                    # back — keeps parity with a reference init scheme.
                    logger.info("***************** BERT param name is 2 {}".format(name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.default_input = Tensor(np.transpose(weight_value, [1, 0]))
            else:
                logger.info("***************** BERT param name is 3 {}".format(name))
                param.default_input = weight_variable(value.asnumpy().shape)
    model.train(1, ds, callbacks=callback, dataset_sink_mode=False)
    # assertion occurs while the loss value, overflow state or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    expect_loss_value = [12.559319, 12.333815, 12.339806, 12.350235, 12.343947, 12.830965, 12.375336, 12.973715,
                         12.57929, 12.7766905]
    error = loss_value - expect_loss_value
    print("loss value: {}".format(loss_value))
    print("error value: {}".format(error))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)
    overflow = np.array(callback.overflow_list)
    expect_overflow = [True, True, True, True, False, False, False, True, False, False]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()
    # Loss scale halves on overflow, doubles after scale_window clean steps.
    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [131072.0, 65536.0, 32768.0, 16384.0, 16384.0, 16384.0, 32768.0, 16384.0, 16384.0, 16384.0]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)
def test_compile_fp16_lr_overflow_dynamic_graph():
    """Run one training step of an FP16 net with a tensor learning rate under dynamic loss scaling."""
    features = Tensor(np.ones([16, 16]).astype(np.float32))
    target = Tensor(np.zeros([16, 16]).astype(np.float32))
    tensor_lr = Tensor(np.ones([1], np.float32) * 0.1)

    backbone = NetFP16(16, 16)
    criterion = MSELoss()
    momentum_opt = Momentum(backbone.trainable_params(), learning_rate=tensor_lr, momentum=0.9)

    loss_net = WithLossCell(backbone, criterion)
    manager = DynamicLossScaleManager()
    train_cell = TrainOneStepWithLossScaleCell(loss_net, momentum_opt,
                                               scale_update_cell=manager.get_update_cell())
    train_cell.set_train()

    result = train_cell(features, target)
    print("the result is ", result)
def test_compile_grad_error():
    """Assert that running the loss-scaled train step on these inputs raises TypeError."""
    features = Tensor(np.ones([16, 16]).astype(np.float32))
    target = Tensor(np.zeros([16, 16]).astype(np.float32))
    tensor_lr = Tensor(np.ones([1], np.float32) * 0.1)

    backbone = NetFP16(16, 16)
    criterion = MSELoss()
    momentum_opt = Momentum(backbone.trainable_params(), learning_rate=tensor_lr, momentum=0.9)

    loss_net = WithLossCell(backbone, criterion)
    manager = DynamicLossScaleManager()
    train_cell = TrainOneStepWithLossScaleCell(loss_net, momentum_opt,
                                               scale_update_cell=manager.get_update_cell())
    train_cell.set_train()

    with pytest.raises(TypeError) as e:
        train_cell(features, target)
    print(e)
def _build_training_pipeline(config: GNMTConfig,
                             pre_training_dataset=None,
                             fine_tune_dataset=None,
                             test_dataset=None):
    """
    Build training pipeline.

    Args:
        config (GNMTConfig): Config of GNMT model.
        pre_training_dataset (Dataset): Pre-training dataset.
        fine_tune_dataset (Dataset): Fine-tune dataset.
        test_dataset (Dataset): Test dataset.

    Raises:
        ValueError: If neither a pre-training nor a fine-tuning dataset is provided.
    """
    net_with_loss = GNMTNetworkWithLoss(config, is_training=True, use_one_hot_embeddings=True)
    net_with_loss.init_parameters_data()
    _load_checkpoint_to_net(config, net_with_loss)

    # Pre-training dataset takes priority when both are supplied.
    dataset = pre_training_dataset if pre_training_dataset is not None \
        else fine_tune_dataset
    if dataset is None:
        raise ValueError(
            "pre-training dataset or fine-tuning dataset must be provided one."
        )

    update_steps = config.epochs * dataset.get_dataset_size()
    lr = _get_lr(config, update_steps)
    optimizer = _get_optimizer(config, net_with_loss, lr)

    # Dynamic loss scale for mixed-precision stability.
    scale_manager = DynamicLossScaleManager(init_loss_scale=config.init_loss_scale,
                                            scale_factor=config.loss_scale_factor,
                                            scale_window=config.scale_window)
    net_with_grads = GNMTTrainOneStepWithLossScaleCell(
        network=net_with_loss, optimizer=optimizer,
        scale_update_cell=scale_manager.get_update_cell())
    net_with_grads.set_train(True)
    model = Model(net_with_grads)
    loss_monitor = LossCallBack(config)
    dataset_size = dataset.get_dataset_size()
    time_cb = TimeMonitor(data_size=dataset_size)
    ckpt_config = CheckpointConfig(save_checkpoint_steps=config.save_ckpt_steps,
                                   keep_checkpoint_max=config.keep_ckpt_max)
    callbacks = [time_cb, loss_monitor]

    # Checkpoints/summaries are written either by the single device of a
    # standalone run, or by the first device of each 8-card group in a
    # distributed run.  The two previously copy-pasted branches are folded
    # into one; the short-circuit keeps MultiAscend.get_rank() from being
    # called in standalone mode, exactly as before.
    rank_size = os.getenv('RANK_SIZE')
    is_standalone = rank_size is None or int(rank_size) == 1
    is_group_leader = (not is_standalone and int(rank_size) > 1
                       and MultiAscend.get_rank() % 8 == 0)
    if is_standalone or is_group_leader:
        ckpt_callback = ModelCheckpoint(
            prefix=config.ckpt_prefix,
            directory=os.path.join(config.ckpt_path,
                                   'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
            config=ckpt_config)
        callbacks.append(ckpt_callback)
        callbacks.append(SummaryCollector(summary_dir="./summary", collect_freq=50))

    # Plain string: the f-prefix carried no placeholders (F541).
    print(" | ALL SET, PREPARE TO TRAIN.")
    _train(model=model, config=config,
           pre_training_dataset=pre_training_dataset,
           fine_tune_dataset=fine_tune_dataset,
           test_dataset=test_dataset,
           callbacks=callbacks)
def test_bert_precision(enable_graph_kernel=False):
    """
    BERT precision regression test.

    Trains large BERT with Lamb + dynamic loss scaling for one pass and
    asserts exact per-step loss, overflow flags, and loss-scale values.

    Args:
        enable_graph_kernel (bool): Enable graph-kernel fusion; expected loss
            values differ between the fused and unfused paths.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False)
    if enable_graph_kernel:
        context.set_context(enable_graph_kernel=True)
    data_set, new_repeat_count, _ = me_de_train_dataset()
    version = os.getenv('VERSION', 'large')
    config = get_config(version=version)
    netwithloss = BertNetworkWithLoss(config, True)
    lr = BertLearningRate(decay_steps=data_set.get_dataset_size() * new_repeat_count,
                          learning_rate=5e-5, end_learning_rate=1e-9, power=10.0, warmup_steps=0)

    # PEP 8 (E731): named functions instead of lambda assignments.
    def decay_filter(param):
        """True for params that receive weight decay (not layernorm/bias)."""
        return 'layernorm' not in param.name.lower() and 'bias' not in param.name.lower()

    def no_decay_filter(param):
        """True for layernorm and bias params, which are excluded from decay."""
        return 'layernorm' in param.name.lower() or 'bias' in param.name.lower()

    # Hoist the parameter list: trainable_params() was called four times.
    params = netwithloss.trainable_params()
    decay_params = list(filter(decay_filter, params))
    other_params = list(filter(no_decay_filter, params))
    group_params = [{'params': decay_params, 'weight_decay': 0.01},
                    {'params': other_params},
                    {'order_params': params}]
    optimizer = Lamb(group_params, lr)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2 ** 16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(netwithloss, optimizer=optimizer,
                                                     scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    # Deterministic re-initialization of the trainable weights.
    for param in params:
        value = param.data
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info("***************** BERT param name is 1 {}".format(name))
                    param.set_data(weight_variable(value.asnumpy().shape))
                else:
                    # Draw with transposed shape, then transpose back —
                    # parity with a reference init scheme.
                    logger.info("***************** BERT param name is 2 {}".format(name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.set_data(Tensor(np.transpose(weight_value, [1, 0])))
            else:
                logger.info("***************** BERT param name is 3 {}".format(name))
                param.set_data(weight_variable(value.asnumpy().shape))
    model.train(new_repeat_count, data_set, callbacks=callback, dataset_sink_mode=False)
    # assertion occurs while the loss value, overflow state or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    if enable_graph_kernel:
        expect_loss_value = [12.206627, 11.840489, 11.798470, 11.796345, 11.790964, 12.366766, 11.971539,
                             12.576565, 12.185522, 12.386192]
    else:
        assert np.allclose(loss_value[0], 12.2066, 0, 0.0005)
        expect_loss_value = [12.206587, 11.966410, 11.965916, 11.975922, 11.970262, 12.608881, 12.174048,
                             12.840656, 12.407923, 12.631133]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)
    overflow = np.array(callback.overflow_list)
    expect_overflow = [False, False, False, True, False, False, False, True, False, False]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()
    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [65536.0, 65536.0, 131072.0, 65536.0, 65536.0, 65536.0, 131072.0, 65536.0,
                         65536.0, 65536.0]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)
def test_bert_performance():
    """
    BERT sink-mode performance test.

    Trains large BERT with data sinking enabled and asserts exact loss /
    overflow / loss-scale values plus wall-clock upper bounds for epoch and
    per-step time (measured on the third epoch).
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", reserve_class_name_in_scope=False)
    data_set, new_repeat_count, sink_size = me_de_train_dataset(sink_mode=True)
    version = os.getenv('VERSION', 'large')
    config = get_config(version=version)
    netwithloss = BertNetworkWithLoss(config, True)
    lr = BertLearningRate(decay_steps=sink_size * new_repeat_count,
                          learning_rate=5e-5,
                          end_learning_rate=1e-9,
                          power=10.0,
                          warmup_steps=0)
    # Layernorm and bias parameters are excluded from weight decay.
    decay_filter = lambda x: 'layernorm' not in x.name.lower(
    ) and 'bias' not in x.name.lower()
    no_decay_filter = lambda x: 'layernorm' in x.name.lower(
    ) or 'bias' in x.name.lower()
    decay_params = list(filter(decay_filter, netwithloss.trainable_params()))
    other_params = list(filter(no_decay_filter,
                               netwithloss.trainable_params()))
    group_params = [{
        'params': decay_params,
        'weight_decay': 0.01
    }, {
        'params': other_params
    }, {
        'order_params': netwithloss.trainable_params()
    }]
    optimizer = Lamb(group_params, lr)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2**16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(
        netwithloss,
        optimizer=optimizer,
        scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    params = netwithloss.trainable_params()
    # Deterministically re-initialize weights; has_init skips params whose
    # data has not been materialized yet.
    for param in params:
        value = param.data
        name = param.name
        if isinstance(value, Tensor) and not value.has_init:
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info(
                        "***************** BERT param name is 1 {}".format(
                            name))
                    param.set_data(weight_variable(value.asnumpy().shape))
                else:
                    # Draw with transposed shape, then transpose back.
                    logger.info(
                        "***************** BERT param name is 2 {}".format(
                            name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.set_data(Tensor(np.transpose(weight_value, [1, 0])))
            else:
                logger.info(
                    "***************** BERT param name is 3 {}".format(name))
                param.set_data(weight_variable(value.asnumpy().shape))
    time_monitor_callback = TimeMonitor(sink_size)
    model.train(new_repeat_count,
                data_set,
                callbacks=[time_monitor_callback, callback],
                dataset_sink_mode=True,
                sink_size=sink_size)
    # assertion occurs while the loss value, overflow state or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    expect_loss_value = [11.3660, 11.3265, 11.3264]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)
    overflow = np.array(callback.overflow_list)
    expect_overflow = [True, True, True]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()
    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [65536.0, 65536.0, 65536.0]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)
    # Timing bounds use index [2] (third epoch) to skip first-run compilation
    # overhead; thresholds are hardware-specific.
    epoch_mseconds = np.array(time_monitor_callback.epoch_mseconds_list)[2]
    expect_epoch_mseconds = 1400
    print("epoch mseconds: {}".format(epoch_mseconds))
    assert epoch_mseconds <= expect_epoch_mseconds + 5
    per_step_mseconds = np.array(
        time_monitor_callback.per_step_mseconds_list)[2]
    expect_per_step_mseconds = 14
    print("per step mseconds: {}".format(per_step_mseconds))
    assert per_step_mseconds <= expect_per_step_mseconds + 1
def test_transformer():
    """
    Transformer training regression test.

    Trains with a fixed seed and asserts exact loss / overflow / loss-scale
    trajectories for the first 10 steps, plus epoch and per-step timing
    bounds (third epoch).
    """
    np.random.seed(1)
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
    context.set_context(reserve_class_name_in_scope=False, enable_auto_mixed_precision=False)
    version = os.getenv('VERSION', 'large')
    batch_size = 96
    epoch_size = 3
    config = get_config(version=version, batch_size=batch_size)
    # NOTE(review): the dataset is batched with transformer_net_cfg.batch_size
    # while `config` is built with the local batch_size (96) — presumably the
    # two agree; confirm, otherwise steps-per-epoch and the LR schedule drift.
    dataset = load_test_data(batch_size=transformer_net_cfg.batch_size, data_file=DATA_DIR)
    netwithloss = TransformerNetworkWithLoss(config, True)
    lr = Tensor(
        create_dynamic_lr(
            schedule="constant*rsqrt_hidden*linear_warmup*rsqrt_decay",
            training_steps=dataset.get_dataset_size() * epoch_size,
            learning_rate=cfg.lr_schedule.learning_rate,
            warmup_steps=cfg.lr_schedule.warmup_steps,
            hidden_size=config.hidden_size), mstype.float32)
    optimizer = Adam(netwithloss.trainable_params(), lr)
    callback = ModelCallback()
    scale_manager = DynamicLossScaleManager(init_loss_scale=4194304,
                                            scale_factor=cfg.scale_factor,
                                            scale_window=3)
    update_cell = scale_manager.get_update_cell()
    netwithgrads = TransformerTrainOneStepWithLossScaleCell(
        netwithloss, optimizer=optimizer, scale_update_cell=update_cell)
    netwithgrads.set_train(True)
    time_monitor_callback = TimeMonitor(dataset.get_dataset_size())
    model = Model(netwithgrads)
    model.train(epoch_size, dataset,
                callbacks=[time_monitor_callback, callback],
                dataset_sink_mode=False)
    # assertion occurs while the loss value, overflow state or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    # First-step loss pinned tightly; remaining steps get a looser tolerance.
    assert np.allclose(loss_value[0], 11.241606, 0, 0.000005)
    expect_loss_value = [
        11.241606, 11.243232, 11.217459, 11.204157, 11.213804, 11.215373,
        11.190564, 11.150393, 11.191823, 11.160045
    ]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value[0:10], expect_loss_value, 0, 0.0005)
    overflow = np.array(callback.overflow_list)
    expect_overflow = [
        False, False, False, True, False, False, False, True, False, False
    ]
    print("overflow: {}".format(overflow))
    assert (overflow[0:10] == expect_overflow).all()
    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [
        4194304.0, 4194304.0, 8388608.0, 4194304.0, 4194304.0, 4194304.0,
        8388608.0, 4194304.0, 4194304.0, 4194304.0
    ]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale[0:10], expect_loss_scale, 0, 0)
    # Timing bounds use the third epoch to skip graph-compilation overhead.
    epoch_mseconds = np.array(time_monitor_callback.epoch_mseconds_list)[2]
    expect_epoch_mseconds = 2400
    print("epoch mseconds: {}".format(epoch_mseconds))
    assert epoch_mseconds <= expect_epoch_mseconds + 20
    per_step_mseconds = np.array(
        time_monitor_callback.per_step_mseconds_list)[2]
    expect_per_step_mseconds = 240
    print("per step mseconds: {}".format(per_step_mseconds))
    assert per_step_mseconds <= expect_per_step_mseconds + 2
# --- Fragment of a GRU seq2seq training routine; the enclosing function's
# definition is outside this view.  `config`, `mindrecord_file`, `device_num`,
# `rank`, and `args` come from that enclosing scope — TODO confirm.
dataset = create_gru_dataset(epoch_count=config.num_epochs, batch_size=config.batch_size,
                             dataset_path=mindrecord_file, rank_size=device_num, rank_id=rank)
dataset_size = dataset.get_dataset_size()
print("dataset size is {}".format(dataset_size))
# Seq2Seq backbone wrapped with its loss cell.
network = Seq2Seq(config)
network = GRUWithLossCell(network)
lr = dynamic_lr(config, dataset_size)
opt = Adam(network.trainable_params(), learning_rate=lr)
# Dynamic loss scale for mixed-precision stability.
scale_manager = DynamicLossScaleManager(
    init_loss_scale=config.init_loss_scale_value,
    scale_factor=config.scale_factor,
    scale_window=config.scale_window)
update_cell = scale_manager.get_update_cell()
netwithgrads = GRUTrainOneStepWithLossScaleCell(network, opt, update_cell)
time_cb = TimeMonitor(data_size=dataset_size)
loss_cb = LossCallBack(rank_id=rank)
cb = [time_cb, loss_cb]
# Save checkpoints into a per-rank subdirectory.
if config.save_checkpoint:
    ckpt_config = CheckpointConfig(
        save_checkpoint_steps=config.ckpt_epoch * dataset_size,
        keep_checkpoint_max=config.keep_checkpoint_max)
    save_ckpt_path = os.path.join(args.outputs_dir, 'ckpt_' + str(args.rank_id) + '/')
    ckpt_cb = ModelCheckpoint(config=ckpt_config,
                              directory=save_ckpt_path,
                              prefix='{}'.format(args.rank_id))
def _build_training_pipeline(config: TransformerConfig,
                             pre_training_dataset=None,
                             fine_tune_dataset=None,
                             test_dataset=None):
    """
    Build training pipeline.

    Args:
        config (TransformerConfig): Config of mass model.
        pre_training_dataset (Dataset): Pre-training dataset.
        fine_tune_dataset (Dataset): Fine-tune dataset.
        test_dataset (Dataset): Test dataset.

    Raises:
        ValueError: If a checkpoint parameter is missing, no dataset is
            provided, or the optimizer name is unsupported.
    """
    net_with_loss = TransformerNetworkWithLoss(config, is_training=True)
    net_with_loss.init_parameters_data()
    if config.existed_ckpt:
        # Warm-start: load weights from an .npz archive or a MindSpore ckpt.
        if config.existed_ckpt.endswith(".npz"):
            weights = np.load(config.existed_ckpt)
        else:
            weights = load_checkpoint(config.existed_ckpt)
        for param in net_with_loss.trainable_params():
            weights_name = param.name
            if weights_name not in weights:
                raise ValueError(
                    f"Param {weights_name} is not found in ckpt file.")
            if isinstance(weights[weights_name], Parameter):
                param.default_input = weights[weights_name].default_input
            elif isinstance(weights[weights_name], Tensor):
                param.default_input = Tensor(weights[weights_name].asnumpy(),
                                             config.dtype)
            elif isinstance(weights[weights_name], np.ndarray):
                param.default_input = Tensor(weights[weights_name],
                                             config.dtype)
            else:
                param.default_input = weights[weights_name]
    else:
        # Fresh init: ones for layernorm gamma, zeros for beta/bias,
        # random for everything else.
        for param in net_with_loss.trainable_params():
            name = param.name
            value = param.default_input
            if isinstance(value, Tensor):
                if name.endswith(".gamma"):
                    param.default_input = one_weight(value.asnumpy().shape)
                elif name.endswith(".beta") or name.endswith(".bias"):
                    param.default_input = zero_weight(value.asnumpy().shape)
                else:
                    param.default_input = weight_variable(
                        value.asnumpy().shape)

    # Pre-training dataset takes priority when both are supplied.
    dataset = pre_training_dataset if pre_training_dataset is not None \
        else fine_tune_dataset
    if dataset is None:
        raise ValueError(
            "pre-training dataset or fine-tuning dataset must be provided one."
        )

    update_steps = dataset.get_repeat_count() * dataset.get_dataset_size()
    if config.lr_scheduler == "isr":
        lr = Tensor(square_root_schedule(
            lr=config.lr,
            update_num=update_steps,
            decay_start_step=config.decay_start_step,
            warmup_steps=config.warmup_steps,
            min_lr=config.min_lr), dtype=mstype.float32)
    elif config.lr_scheduler == "poly":
        lr = Tensor(polynomial_decay_scheduler(
            lr=config.lr,
            min_lr=config.min_lr,
            decay_steps=config.decay_steps,
            total_update_num=update_steps,
            warmup_steps=config.warmup_steps,
            power=config.poly_lr_scheduler_power), dtype=mstype.float32)
    else:
        lr = config.lr

    if config.optimizer.lower() == "adam":
        optimizer = Adam(net_with_loss.trainable_params(), lr,
                         beta1=0.9, beta2=0.98)
    elif config.optimizer.lower() == "lamb":
        # Lamb uses its own LR schedule; layernorm/bias excluded from decay.
        lr = BertLearningRate(decay_steps=12000, learning_rate=config.lr,
                              end_learning_rate=config.min_lr,
                              power=10.0, warmup_steps=config.warmup_steps)
        decay_params = list(
            filter(
                lambda x: 'layernorm' not in x.name.lower() and 'bias' not in
                x.name.lower(), net_with_loss.trainable_params()))
        other_params = list(
            filter(
                lambda x: 'layernorm' in x.name.lower() or 'bias' in x.name.
                lower(), net_with_loss.trainable_params()))
        group_params = [{
            'params': decay_params,
            'weight_decay': 0.01
        }, {
            'params': other_params
        }]
        optimizer = Lamb(group_params, lr, eps=1e-6)
    elif config.optimizer.lower() == "momentum":
        optimizer = Momentum(net_with_loss.trainable_params(), lr,
                             momentum=0.9)
    else:
        # Fixed message: `lamb` is supported too (the old text omitted it),
        # and the needless f-prefix is dropped.
        raise ValueError(
            "optimizer only support `adam`, `lamb` and `momentum` now.")

    # Dynamic loss scale for mixed-precision stability.
    scale_manager = DynamicLossScaleManager(
        init_loss_scale=config.init_loss_scale,
        scale_factor=config.loss_scale_factor,
        scale_window=config.scale_window)
    net_with_grads = TransformerTrainOneStepWithLossScaleCell(
        network=net_with_loss, optimizer=optimizer,
        scale_update_cell=scale_manager.get_update_cell())
    net_with_grads.set_train(True)
    model = Model(net_with_grads)
    loss_monitor = LossCallBack(config)
    ckpt_config = CheckpointConfig(
        save_checkpoint_steps=config.save_ckpt_steps,
        keep_checkpoint_max=config.keep_ckpt_max)
    callbacks = [loss_monitor]

    # Checkpoints are written either by the single device of a standalone
    # run, or by the first device of each 8-card group in a distributed run.
    # The two previously copy-pasted branches are folded into one; the
    # short-circuit keeps MultiAscend.get_rank() from being called in
    # standalone mode, exactly as before.
    rank_size = os.getenv('RANK_SIZE')
    is_standalone = rank_size is None or int(rank_size) == 1
    is_group_leader = (not is_standalone and int(rank_size) > 1
                       and MultiAscend.get_rank() % 8 == 0)
    if is_standalone or is_group_leader:
        ckpt_callback = ModelCheckpoint(
            prefix=config.ckpt_prefix,
            directory=os.path.join(config.ckpt_path,
                                   'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
            config=ckpt_config)
        callbacks.append(ckpt_callback)

    # Plain string: the f-prefix carried no placeholders (F541).
    print(" | ALL SET, PREPARE TO TRAIN.")
    _train(model=model, config=config,
           pre_training_dataset=pre_training_dataset,
           fine_tune_dataset=fine_tune_dataset,
           test_dataset=test_dataset,
           callbacks=callbacks)
def test_bert_tdt():
    """
    BERT TDT regression test (Lamb optimizer, env-configurable batch size).

    Trains for the dataset's repeat count and asserts exact per-step loss,
    overflow flags, and loss-scale values.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
                        reserve_class_name_in_scope=False)
    ds = me_de_train_dataset()
    version = os.getenv('VERSION', 'large')
    batch_size = int(os.getenv('BATCH_SIZE', '16'))
    config = get_config(version=version, batch_size=batch_size)
    netwithloss = BertNetworkWithLoss(config, True)
    optimizer = Lamb(netwithloss.trainable_params(),
                     decay_steps=ds.get_dataset_size() * ds.get_repeat_count(),
                     start_learning_rate=5e-5,
                     end_learning_rate=1e-9,
                     power=10.0,
                     warmup_steps=0,
                     weight_decay=0.01)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2**16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(
        netwithloss, optimizer=optimizer,
        scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    # Deterministically re-initialize every trainable weight.
    params = netwithloss.trainable_params()
    for param in params:
        param.init_data()
        value = param.default_input
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info(
                        "***************** BERT param name is 1 {}".format(
                            name))
                    param.default_input = weight_variable(
                        value.asnumpy().shape)
                else:
                    # Draw with transposed shape, then transpose back.
                    logger.info(
                        "***************** BERT param name is 2 {}".format(
                            name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.default_input = Tensor(
                        np.transpose(weight_value, [1, 0]))
            else:
                logger.info(
                    "***************** BERT param name is 3 {}".format(name))
                param.default_input = weight_variable(value.asnumpy().shape)
    model.train(ds.get_repeat_count(), ds, callbacks=callback,
                dataset_sink_mode=False)
    # assertion occurs while the loss value, overflow state or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    expect_loss_value = [
        12.207198, 11.980881, 11.984844, 11.879381, 11.832978, 12.411333,
        12.009284, 12.621277, 12.223178, 12.427385
    ]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)
    overflow = np.array(callback.overflow_list)
    expect_overflow = [
        True, True, False, False, False, True, False, False, False, True
    ]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()
    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [
        32768.0, 16384.0, 16384.0, 16384.0, 32768.0, 16384.0, 16384.0,
        16384.0, 32768.0, 16384.0
    ]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)
def test_bert_performance():
    """
    BERT sink-mode performance test (Lamb, fixed batch size 16).

    Asserts exact loss / overflow / loss-scale values and wall-clock upper
    bounds for epoch and per-step time (third epoch).
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
                        reserve_class_name_in_scope=False)
    ds, new_repeat_count = me_de_train_dataset(sink_mode=True)
    version = os.getenv('VERSION', 'large')
    batch_size = 16
    config = get_config(version=version, batch_size=batch_size)
    netwithloss = BertNetworkWithLoss(config, True)
    optimizer = Lamb(netwithloss.trainable_params(),
                     decay_steps=ds.get_dataset_size() * new_repeat_count,
                     start_learning_rate=5e-5,
                     end_learning_rate=1e-9,
                     power=10.0,
                     warmup_steps=0,
                     weight_decay=0.01)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2**16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(
        netwithloss, optimizer=optimizer,
        scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    # Deterministically re-initialize every trainable weight.
    params = netwithloss.trainable_params()
    for param in params:
        param.init_data()
        value = param.default_input
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info(
                        "***************** BERT param name is 1 {}".format(
                            name))
                    param.default_input = weight_variable(
                        value.asnumpy().shape)
                else:
                    # Draw with transposed shape, then transpose back.
                    logger.info(
                        "***************** BERT param name is 2 {}".format(
                            name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.default_input = Tensor(
                        np.transpose(weight_value, [1, 0]))
            else:
                logger.info(
                    "***************** BERT param name is 3 {}".format(name))
                param.default_input = weight_variable(value.asnumpy().shape)
    time_monitor_callback = TimeMonitor(ds.get_dataset_size())
    model.train(new_repeat_count, ds,
                callbacks=[time_monitor_callback, callback],
                dataset_sink_mode=True)
    # assertion occurs while the loss value, overflow state or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    expect_loss_value = [10.235566, 10.207392, 10.206976]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)
    overflow = np.array(callback.overflow_list)
    expect_overflow = [True, True, True]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()
    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [262144.0, 262144.0, 262144.0]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)
    # Timing bounds use the third epoch to skip graph-compilation overhead;
    # thresholds are hardware-specific.
    epoch_mseconds = np.array(time_monitor_callback.epoch_mseconds_list)[2]
    expect_epoch_mseconds = 1600
    print("epoch mseconds: {}".format(epoch_mseconds))
    assert epoch_mseconds <= expect_epoch_mseconds + 5
    per_step_mseconds = np.array(
        time_monitor_callback.per_step_mseconds_list)[2]
    expect_per_step_mseconds = 16
    print("per step mseconds: {}".format(per_step_mseconds))
    assert per_step_mseconds <= expect_per_step_mseconds + 1
def test_bert_percision():
    """
    BERT precision regression test.

    (Name keeps the historical "percision" spelling; renaming would break
    external test selection.)  Trains with Lamb + grouped weight decay and
    asserts exact loss / overflow / loss-scale values.
    """
    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
                        reserve_class_name_in_scope=False)
    ds, new_repeat_count, _ = me_de_train_dataset()
    version = os.getenv('VERSION', 'large')
    batch_size = 16
    config = get_config(version=version, batch_size=batch_size)
    netwithloss = BertNetworkWithLoss(config, True)
    lr = BertLearningRate(decay_steps=ds.get_dataset_size() * new_repeat_count,
                          learning_rate=5e-5,
                          end_learning_rate=1e-9,
                          power=10.0,
                          warmup_steps=0)
    # Layernorm and bias parameters are excluded from weight decay.
    decay_filter = lambda x: 'layernorm' not in x.name.lower(
    ) and 'bias' not in x.name.lower()
    no_decay_filter = lambda x: 'layernorm' in x.name.lower(
    ) or 'bias' in x.name.lower()
    decay_params = list(filter(decay_filter, netwithloss.trainable_params()))
    other_params = list(filter(no_decay_filter,
                               netwithloss.trainable_params()))
    group_params = [{
        'params': decay_params,
        'weight_decay': 0.01
    }, {
        'params': other_params
    }, {
        'order_params': netwithloss.trainable_params()
    }]
    optimizer = Lamb(group_params, lr)
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2**16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(
        netwithloss, optimizer=optimizer,
        scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    # Deterministically re-initialize every trainable weight.
    params = netwithloss.trainable_params()
    for param in params:
        param.init_data()
        value = param.default_input
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info(
                        "***************** BERT param name is 1 {}".format(
                            name))
                    param.default_input = weight_variable(
                        value.asnumpy().shape)
                else:
                    # Draw with transposed shape, then transpose back.
                    logger.info(
                        "***************** BERT param name is 2 {}".format(
                            name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.default_input = Tensor(
                        np.transpose(weight_value, [1, 0]))
            else:
                logger.info(
                    "***************** BERT param name is 3 {}".format(name))
                param.default_input = weight_variable(value.asnumpy().shape)
    model.train(new_repeat_count, ds, callbacks=callback,
                dataset_sink_mode=False)
    # assertion occurs while the loss value, overflow state or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    # First-step loss pinned tightly; remaining steps get a looser tolerance.
    assert np.allclose(loss_value[0], 12.206575, 0, 0.000001)
    expect_loss_value = [
        12.206575, 11.865044, 11.828129, 11.826707, 11.82108, 12.407423,
        12.005459, 12.621225, 12.222903, 12.427446
    ]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0, 0.0005)
    overflow = np.array(callback.overflow_list)
    expect_overflow = [
        False, False, False, True, False, False, False, True, False, False
    ]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()
    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [
        65536.0, 65536.0, 131072.0, 65536.0, 65536.0, 65536.0, 131072.0,
        65536.0, 65536.0, 65536.0
    ]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0, 0)
def test_bert_tdt():
    """Test BERT one-step training with dynamic loss scaling via the TDT channel.

    Trains BERT with Momentum and a DynamicLossScaleManager (init 2**16,
    factor 2, window 3) on Ascend, then checks the recorded loss, overflow
    and loss-scale histories against golden values.

    NOTE(review): a second function also named ``test_bert_tdt`` is defined
    later in this file; the later ``def`` rebinds the module-level name, so
    this test is shadowed and will not be collected by pytest — confirm and
    rename one of the two.
    """
    # Graph mode on Ascend; drop class names from graph scopes to shorten IR names.
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        reserve_class_name_in_scope=False)
    context.set_context(enable_loop_sink=True)
    context.set_context(enable_mem_reuse=True)
    # NOTE(review): here the dataset helper's return is used directly, while
    # test_bert_percision unpacks a 3-tuple from the same helper — verify the
    # two call sites target the same helper signature.
    ds = me_de_train_dataset()
    version = os.getenv('VERSION', 'large')
    batch_size = int(os.getenv('BATCH_SIZE', '16'))
    config = get_config(version=version, batch_size=batch_size)
    netwithloss = BertNetworkWithLoss(config, True)
    optimizer = Momentum(netwithloss.trainable_params(),
                         learning_rate=2e-5,
                         momentum=0.9)
    # Dynamic loss scale: start at 2**16, halve on overflow, double after
    # scale_window consecutive non-overflow steps.
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2**16, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(
        netwithloss,
        optimizer=optimizer,
        scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    # Re-randomize every weight so the golden values below are reproducible
    # regardless of default initialization; log tags 1/2/3 mark the branches.
    params = netwithloss.trainable_params()
    for param in params:
        param.init_data()
        value = param.default_input
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info(
                        "***************** BERT param name is 1 {}".format(
                            name))
                    param.default_input = weight_variable(
                        value.asnumpy().shape)
                else:
                    # Sample with swapped dims, then transpose back to the
                    # parameter's shape — presumably intentional transposed
                    # sampling; confirm against the init scheme.
                    logger.info(
                        "***************** BERT param name is 2 {}".format(
                            name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.default_input = Tensor(
                        np.transpose(weight_value, [1, 0]))
            else:
                logger.info(
                    "***************** BERT param name is 3 {}".format(name))
                param.default_input = weight_variable(value.asnumpy().shape)
    model.train(ds.get_repeat_count(), ds, callbacks=callback, dataset_sink_mode=False)

    # assertion occurs while the loss value, overflow state or loss_scale value is wrong
    loss_value = np.array(callback.loss_list)
    expect_loss_value = [
        12.1918125, 11.966035, 11.972114, 11.982188, 11.974092, 12.610916,
        12.17565, 12.840416, 12.40291, 12.621661
    ]
    print("loss value: {}".format(loss_value))
    assert np.allclose(loss_value, expect_loss_value, 0.00001, 0.00001)

    overflow = np.array(callback.overflow_list)
    expect_overflow = [
        True, True, False, False, False, True, False, False, False, True
    ]
    print("overflow: {}".format(overflow))
    assert (overflow == expect_overflow).all()

    loss_scale = np.array(callback.lossscale_list)
    expect_loss_scale = [
        32768.0, 16384.0, 16384.0, 16384.0, 32768.0, 16384.0, 16384.0,
        16384.0, 32768.0, 16384.0
    ]
    print("loss scale: {}".format(loss_scale))
    assert np.allclose(loss_scale, expect_loss_scale, 0.00001, 0.00001)
def test_bert_tdt():
    """Test the dynamic loss-scale update protocol during BERT training.

    Unlike the value-pinning variant above, this test does not compare golden
    loss values; it only verifies the DynamicLossScaleManager contract over the
    recorded history: the scale halves on an overflow step and doubles after
    ``scale_window`` consecutive non-overflow steps.

    NOTE(review): this is the second definition of ``test_bert_tdt`` in the
    file — it rebinds the name and shadows the earlier one; rename one of them
    so both tests run.
    """
    # Graph mode on Ascend with task/loop sink and memory reuse enabled.
    context.set_context(mode=context.GRAPH_MODE,
                        device_target="Ascend",
                        reserve_class_name_in_scope=False)
    context.set_context(enable_task_sink=True)
    context.set_context(enable_loop_sink=True)
    context.set_context(enable_mem_reuse=True)
    ds = me_de_train_dataset()
    version = os.getenv('VERSION', 'large')
    batch_size = int(os.getenv('BATCH_SIZE', '16'))
    config = get_config(version=version, batch_size=batch_size)
    netwithloss = BertNetworkWithLoss(config, True)
    optimizer = Momentum(netwithloss.trainable_params(),
                         learning_rate=2e-5,
                         momentum=0.9)
    # Deliberately huge initial scale (2**32) so early steps overflow and the
    # halving path is exercised — presumably; confirm against manager limits.
    scale_window = 3
    scale_manager = DynamicLossScaleManager(2**32, 2, scale_window)
    netwithgrads = BertTrainOneStepWithLossScaleCell(
        netwithloss,
        optimizer=optimizer,
        scale_update_cell=scale_manager.get_update_cell())
    netwithgrads.set_train(True)
    model = Model(netwithgrads)
    callback = ModelCallback()
    # Re-randomize weights for a deterministic run. NOTE(review): unlike the
    # sibling tests there is no param.init_data() call here — confirm whether
    # parameters are guaranteed materialized before default_input is read.
    params = netwithloss.trainable_params()
    for param in params:
        value = param.default_input
        name = param.name
        if isinstance(value, Tensor):
            if name.split('.')[-1] in ['weight']:
                if name.split('.')[-3] in ['cls2']:
                    logger.info(
                        "***************** BERT param name is 1 {}".format(
                            name))
                    param.default_input = weight_variable(
                        value.asnumpy().shape)
                else:
                    # Sample with swapped dims, then transpose back to the
                    # parameter's shape.
                    logger.info(
                        "***************** BERT param name is 2 {}".format(
                            name))
                    tempshape = value.asnumpy().shape
                    shape = (tempshape[1], tempshape[0])
                    weight_value = weight_variable(shape).asnumpy()
                    param.default_input = Tensor(
                        np.transpose(weight_value, [1, 0]))
            else:
                logger.info(
                    "***************** BERT param name is 3 {}".format(name))
                param.default_input = weight_variable(value.asnumpy().shape)
    model.train(ds.get_repeat_count(), ds, callbacks=callback, dataset_sink_mode=False)

    # assertion occurs while the loss_scale value is wrong
    # Walk the overflow history and check the scale-update protocol:
    #   - overflow at step i (i > 0): scale must have halved vs step i-1;
    #   - scale_window consecutive clean steps: scale must have doubled.
    # NOTE(review): the comparisons are Tensor == Tensor; this relies on
    # MindSpore Tensor equality being truthy for scalar tensors — confirm.
    count = 0
    for i in range(len(callback.overflow_list)):
        if callback.overflow_list[i] == Tensor(True, mstype.bool_) and i > 0:
            count = 0
            assert callback.lossscale_list[i] == callback.lossscale_list[
                i - 1] * Tensor(0.5, mstype.float32)
        if callback.overflow_list[i] == Tensor(False, mstype.bool_):
            count = count + 1
            if count == scale_window:
                count = 0
                assert callback.lossscale_list[i] == callback.lossscale_list[
                    i - 1] * Tensor(2.0, mstype.float32)
def _build_training_pipeline(config: TransformerConfig,
                             pre_training_dataset=None,
                             fine_tune_dataset=None,
                             test_dataset=None,
                             platform="Ascend"):
    """
    Build training pipeline.

    Args:
        config (TransformerConfig): Config of mass model.
        pre_training_dataset (Dataset): Pre-training dataset.
        fine_tune_dataset (Dataset): Fine-tune dataset.
        test_dataset (Dataset): Test dataset.
        platform (str): Target device tag; kept for interface compatibility
            (not read in this function).

    Raises:
        ValueError: If neither a pre-training nor a fine-tuning dataset
            is provided.
    """
    net_with_loss = TransformerNetworkWithLoss(config, is_training=True)
    net_with_loss.init_parameters_data()
    _load_checkpoint_to_net(config, net_with_loss)

    # Pre-training takes precedence when both datasets are supplied.
    dataset = pre_training_dataset if pre_training_dataset is not None \
        else fine_tune_dataset
    if dataset is None:
        raise ValueError("pre-training dataset or fine-tuning dataset must be provided one.")

    update_steps = config.epochs * dataset.get_dataset_size()
    lr = _get_lr(config, update_steps)
    optimizer = _get_optimizer(config, net_with_loss, lr)

    # Loss scale: dynamic mode adapts the scale to overflows; fixed mode keeps
    # it constant and drops updates on overflow.
    if config.loss_scale_mode == "dynamic":
        scale_manager = DynamicLossScaleManager(init_loss_scale=config.init_loss_scale,
                                                scale_factor=config.loss_scale_factor,
                                                scale_window=config.scale_window)
    else:
        scale_manager = FixedLossScaleManager(loss_scale=config.init_loss_scale,
                                              drop_overflow_update=True)
    net_with_grads = TransformerTrainOneStepWithLossScaleCell(
        network=net_with_loss,
        optimizer=optimizer,
        scale_update_cell=scale_manager.get_update_cell())
    net_with_grads.set_train(True)
    model = Model(net_with_grads)

    time_cb = TimeMonitor(data_size=dataset.get_dataset_size())
    ckpt_config = CheckpointConfig(save_checkpoint_steps=config.save_ckpt_steps,
                                   keep_checkpoint_max=config.keep_ckpt_max)
    rank_size = os.getenv('RANK_SIZE')
    callbacks = [time_cb]
    if rank_size is not None and int(rank_size) > 1:
        # Distributed run: every rank reports loss, but only one rank per
        # 8-device server saves checkpoints to avoid redundant writes.
        loss_monitor = LossCallBack(config, rank_id=MultiAscend.get_rank())
        callbacks.append(loss_monitor)
        if MultiAscend.get_rank() % 8 == 0:
            ckpt_callback = ModelCheckpoint(
                prefix=config.ckpt_prefix,
                directory=os.path.join(config.ckpt_path,
                                       'ckpt_{}'.format(MultiAscend.get_rank())),
                config=ckpt_config)
            callbacks.append(ckpt_callback)
    if rank_size is None or int(rank_size) == 1:
        # Single-device run: checkpoint directory is keyed by DEVICE_ID.
        ckpt_callback = ModelCheckpoint(
            prefix=config.ckpt_prefix,
            directory=os.path.join(config.ckpt_path,
                                   'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
            config=ckpt_config)
        loss_monitor = LossCallBack(config, rank_id=os.getenv('DEVICE_ID'))
        callbacks.append(loss_monitor)
        callbacks.append(ckpt_callback)

    # Plain string literal: the original f-string had no placeholders (F541).
    print(" | ALL SET, PREPARE TO TRAIN.")
    _train(model=model, config=config,
           pre_training_dataset=pre_training_dataset,
           fine_tune_dataset=fine_tune_dataset,
           test_dataset=test_dataset,
           callbacks=callbacks)