def test_with_asp(self):
    fleet.init(is_collective=True)

    self.optimizer = paddle.incubate.asp.decorate(self.optimizer)
    paddle.incubate.asp.prune_model(self.layer)

    self.optimizer = fleet.distributed_optimizer(self.optimizer)
    self.layer = fleet.distributed_model(self.layer)

    imgs = paddle.to_tensor(np.random.randn(64, 32),
                            dtype='float32',
                            place=self.place,
                            stop_gradient=False)
    labels = paddle.to_tensor(np.random.randint(10, size=(64, 1)),
                              dtype='float32',
                              place=self.place,
                              stop_gradient=False)

    loss_fn = paddle.nn.MSELoss(reduction='mean')

    output = self.layer(imgs)
    loss = loss_fn(output, labels)
    loss.backward()
    self.optimizer.step()
    self.optimizer.clear_grad()

    for param in self.layer.parameters():
        if ASPHelper._is_supported_layer(
                paddle.static.default_main_program(), param.name):
            mat = param.numpy()
            self.assertTrue(
                paddle.fluid.contrib.sparsity.check_sparsity(mat.T, n=2, m=4))
def build_model_optimizer(self, Optimizer="adam"):
    hcg = fleet.get_hybrid_communicate_group()
    world_size = hcg.get_model_parallel_world_size()
    sharding_id = hcg.get_sharding_parallel_rank()
    dp_id = hcg.get_data_parallel_rank()
    rank_id = dist.get_rank()

    np_fc1 = np.random.random_sample((hidden_size, inner_size))
    np_fc2 = np.random.random_sample((inner_size, hidden_size))

    model_a = SimpleDPNet(vocab_size, hidden_size, inner_size, output_size,
                          np_fc1, np_fc2)
    optimizer_a = self.build_optimizer(model_a,
                                       strategy=self.strategy,
                                       is_sharding=True,
                                       Optimizer=Optimizer)
    model_a = fleet.distributed_model(model_a)
    optimizer_a = fleet.distributed_optimizer(optimizer_a)

    model_b = SimpleDPNet(vocab_size, hidden_size, inner_size, output_size,
                          np_fc1, np_fc2)
    optimizer_b = self.build_optimizer(model_b,
                                       strategy=self.strategy,
                                       is_sharding=False,
                                       Optimizer=Optimizer)

    return model_a, optimizer_a, model_b, optimizer_b
def test_pp_model(self):
    hcg = fleet.get_hybrid_communicate_group()
    world_size = hcg.get_model_parallel_world_size()
    dp_id = hcg.get_data_parallel_rank()
    pp_id = hcg.get_stage_id()
    rank_id = dist.get_rank()
    topology = hcg.topology()
    set_random_seed(1024, dp_id, rank_id)

    model = ModelPipe(topology)
    scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[2],
                                                   values=[0.001, 0.002],
                                                   verbose=True)
    optimizer = paddle.optimizer.SGD(learning_rate=scheduler,
                                     parameters=model.parameters())

    model = fleet.distributed_model(model)
    optimizer = fleet.distributed_optimizer(optimizer)

    for step_id in range(5):
        x_data = np.random.randint(0, vocab_size, size=[batch_size, length])
        x = paddle.to_tensor(x_data)
        x.stop_gradient = True

        e_loss = model.eval_batch([x, x], True)
        loss = model.train_batch([x, x], optimizer, scheduler)

        # TODO(shenliang03) add utest for loss
        if pp_id != 0:
            np.testing.assert_allclose(loss.numpy(), e_loss.numpy())
def test_pp_model(self):
    hcg = fleet.get_hybrid_communicate_group()
    world_size = hcg.get_model_parallel_world_size()
    dp_id = hcg.get_data_parallel_rank()
    pp_id = hcg.get_stage_id()
    rank_id = dist.get_rank()
    topology = hcg.topology()
    set_random_seed(1024, dp_id, rank_id)

    model = ModelPipe(topology)
    scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[2],
                                                   values=[0.001, 0.002],
                                                   verbose=True)
    optimizer = paddle.optimizer.SGD(learning_rate=scheduler,
                                     parameters=model.parameters())

    model = fleet.distributed_model(model)
    optimizer = fleet.distributed_optimizer(optimizer)
    output_dir = tempfile.mkdtemp()

    # warmup steps
    for step_id in range(2):
        x_data = np.random.randint(0, vocab_size, size=[batch_size, length])
        x = paddle.to_tensor(x_data)
        x.stop_gradient = True
        loss = model.train_batch([x, x], optimizer, scheduler)

    model._layers.save_state_dict(output_dir)
    paddle.save(optimizer.state_dict(),
                os.path.join(output_dir, "model_state.pdopt"))

    # construct data
    test_steps = 5
    np_data = np.random.randint(0, vocab_size,
                                size=[test_steps, batch_size, length])

    origin_loss = []
    for step_id in range(5):
        x_data = np_data[step_id, :]
        x = paddle.to_tensor(x_data)
        x.stop_gradient = True
        loss = model.train_batch([x, x], optimizer, scheduler)
        origin_loss.append(loss.numpy())

    # test steps: reload the checkpoint and replay the same data
    model._layers.set_state_dir(output_dir)
    opt_dict = paddle.load(os.path.join(output_dir, "model_state.pdopt"))
    optimizer.set_state_dict(opt_dict)

    for step_id in range(5):
        x_data = np_data[step_id, :]
        x = paddle.to_tensor(x_data)
        x.stop_gradient = True
        loss = model.train_batch([x, x], optimizer, scheduler)
        print("origin loss: ", origin_loss[step_id], "current loss: ",
              loss.numpy())
        np.testing.assert_allclose(loss.numpy(), origin_loss[step_id])

    # finally, remove the model/optimizer path
    shutil.rmtree(output_dir)
def test_pp_model(self):
    hcg = fleet.get_hybrid_communicate_group()
    world_size = hcg.get_model_parallel_world_size()
    dp_id = hcg.get_data_parallel_rank()
    pp_id = hcg.get_stage_id()
    rank_id = dist.get_rank()
    set_random_seed(1024, dp_id, rank_id)

    # construct model a
    model_a = AlexNet(10)
    scheduler_a, optimizer_a = self.build_optimizer(model_a)

    param_len = len(model_a.parameters())
    parameters = []
    for param in model_a.parameters():
        parameters.append(param.numpy())

    # construct model b
    model_b = AlexNetPipeDesc(num_stages=self.pipeline_parallel_size)
    scheduler_b, optimizer_b = self.build_optimizer(model_b)
    model_b = fleet.distributed_model(model_b)
    optimizer_b = fleet.distributed_optimizer(optimizer_b)

    for idx, param in enumerate(model_b.parameters()):
        param.set_value(parameters[idx + pp_id * (param_len // 2)])

    # construct reader
    train_reader = paddle.batch(paddle.dataset.mnist.train(),
                                batch_size=batch_size,
                                drop_last=True)

    for step_id, data in enumerate(train_reader()):
        x_data = np.array([x[0] for x in data]).astype('float32').reshape(
            batch_size, 1, 28, 28)
        y_data = np.array([x[1] for x in data]).astype('int64').reshape(
            batch_size, 1)
        img = paddle.to_tensor(x_data)
        label = paddle.to_tensor(y_data)
        img.stop_gradient = True
        label.stop_gradient = True

        if step_id >= 5:
            return True

        loss_a = model_a(img, label)
        loss_a.backward()
        optimizer_a.step()
        optimizer_a.clear_grad()
        scheduler_a.step()

        loss_b = model_b.train_batch([img, label], optimizer_b, scheduler_b)

        print("loss: ", loss_a.numpy(), loss_b.numpy())
        np.testing.assert_allclose(loss_a.numpy(), loss_b.numpy(), rtol=5e-5)
def test_dygraph_method(self):
    paddle.disable_static()
    value = np.arange(26).reshape(2, 13).astype("float32")
    a = fluid.dygraph.to_variable(value)
    layer = paddle.nn.Linear(13, 5)
    adam = paddle.optimizer.Adam(learning_rate=0.01,
                                 parameters=layer.parameters())
    # fleet.init() is omitted because this unit test cannot launch a
    # distributed task
    adam = fleet.distributed_optimizer(adam)
    dp_layer = fleet.distributed_model(layer)
    lr = 0.001
    adam.set_lr(lr)
    cur_lr = adam.get_lr()
    assert lr == cur_lr
    state_dict = adam.state_dict()
    adam.set_state_dict(state_dict)
def test_dygraph_single(self):
    paddle.disable_static()
    fleet.init(is_collective=True)

    layer = LinearNet()
    loss_fn = nn.MSELoss()
    adam = paddle.optimizer.Adam(learning_rate=0.001,
                                 parameters=layer.parameters())

    adam = fleet.distributed_optimizer(adam)
    dp_layer = fleet.distributed_model(layer)
    for step in range(2):
        inputs = paddle.randn([10, 10], 'float32')
        outputs = dp_layer(inputs)
        labels = paddle.randn([10, 1], 'float32')
        loss = loss_fn(outputs, labels)
        loss.backward()
        adam.step()
        adam.clear_grad()
def main(args):
    """main function"""
    model_config = json.load(open(args.model_config, 'r'))

    paddle.set_device("gpu")
    strategy = fleet.DistributedStrategy()
    fleet.init(is_collective=True, strategy=strategy)

    eval_loader = create_dataloader(data_dir=args.eval_data,
                                    model_config=model_config)

    encoder_model = ProteinEncoderModel(model_config, name='protein')
    model = ProteinModel(encoder_model, model_config)
    model = fleet.distributed_model(model)
    model.load_dict(paddle.load(args.eval_model))

    criterion = ProteinCriterion(model_config)
    metric = get_metric(model_config['task'])

    eval_cur_loss = eval(model, eval_loader, criterion, metric)
def build_model_optimizer(self):
    hcg = fleet.get_hybrid_communicate_group()
    world_size = hcg.get_model_parallel_world_size()
    mp_id = hcg.get_model_parallel_rank()
    dp_id = hcg.get_data_parallel_rank()
    rank_id = dist.get_rank()
    set_random_seed(1024, dp_id, rank_id)

    np_fc1 = np.random.random_sample((hidden_size, inner_size))
    np_fc2 = np.random.random_sample((inner_size, hidden_size))

    model_a = SimpleMPNet(vocab_size, hidden_size, inner_size, output_size,
                          np_fc1, np_fc2, mp_id)
    optimizer_a = self.build_optimizer(model_a)
    model_a = fleet.distributed_model(model_a)
    optimizer_a = fleet.distributed_optimizer(optimizer_a)

    model_b = SimpleDPNet(vocab_size, hidden_size, inner_size, output_size,
                          np_fc1, np_fc2)
    optimizer_b = self.build_optimizer(model_b)

    return model_a, optimizer_a, model_b, optimizer_b
# 1. enable dynamic mode
paddle.disable_static()

# 2. initialize fleet environment
fleet.init(is_collective=True)

# 3. create layer & optimizer
layer = paddle.nn.Linear(10, 10)
adam = paddle.optimizer.Adam(learning_rate=0.001,
                             parameters=layer.parameters())

# 4. get data_parallel model using fleet
adam = fleet.distributed_optimizer(adam)
dp_layer = fleet.distributed_model(layer)

# 5. run layer
for step in range(1):
    inputs = paddle.randn([10, 10], 'float32')
    outputs = dp_layer(inputs)
    loss = paddle.mean(outputs)
    print("step:{}\tloss:{}".format(step, loss.numpy()))

    loss = dp_layer.scale_loss(loss)
    loss.backward()
    dp_layer.apply_collective_grads()

    adam.step()
    adam.clear_grad()
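# A minimal launch sketch for the data-parallel example above (hedged: the
# script name `dygraph_dp_demo.py` and the GPU ids are assumptions, not from
# the original). Collective jobs are started with one process per device via
# the paddle.distributed.launch module; fleet.init(is_collective=True) then
# builds the process group from the environment variables the launcher sets:
#
#   python -m paddle.distributed.launch --gpus "0,1" dygraph_dp_demo.py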
def main(args):
    paddle.seed(12345)
    # load config
    config = load_yaml(args.config_yaml)
    dy_model_class = load_dy_model_class(args.abs_dir)
    config["config_abs_dir"] = args.abs_dir
    # modify config from the command line
    if args.opt:
        for parameter in args.opt:
            parameter = parameter.strip()
            key, value = parameter.split("=")
            if type(config.get(key)) is int:
                value = int(value)
            if type(config.get(key)) is bool:
                value = (True if value.lower() == "true" else False)
            config[key] = value

    # tools.vars
    use_gpu = config.get("runner.use_gpu", True)
    use_xpu = config.get("runner.use_xpu", False)
    use_visual = config.get("runner.use_visual", False)
    train_data_dir = config.get("runner.train_data_dir", None)
    epochs = config.get("runner.epochs", None)
    print_interval = config.get("runner.print_interval", None)
    train_batch_size = config.get("runner.train_batch_size", None)
    model_save_path = config.get("runner.model_save_path", "model_output")
    model_init_path = config.get("runner.model_init_path", None)
    use_fleet = config.get("runner.use_fleet", False)

    logger.info("**************common.configs**********")
    logger.info(
        "use_gpu: {}, use_xpu: {}, use_visual: {}, train_batch_size: {}, "
        "train_data_dir: {}, epochs: {}, print_interval: {}, model_save_path: {}"
        .format(use_gpu, use_xpu, use_visual, train_batch_size,
                train_data_dir, epochs, print_interval, model_save_path))
    logger.info("**************common.configs**********")

    if use_xpu:
        xpu_device = 'xpu:{0}'.format(os.getenv('FLAGS_selected_xpus', 0))
        place = paddle.set_device(xpu_device)
    else:
        place = paddle.set_device('gpu' if use_gpu else 'cpu')

    dy_model = dy_model_class.create_model(config)

    # Create a LogWriter object and store the visual data under this path
    if use_visual:
        from visualdl import LogWriter
        log_visual = LogWriter(args.abs_dir + "/visualDL_log/train")

    if model_init_path is not None:
        load_model(model_init_path, dy_model)

    # TODO: add optimizer function
    optimizer = dy_model_class.create_optimizer(dy_model, config)

    # use fleet to run collective training
    if use_fleet:
        from paddle.distributed import fleet
        strategy = fleet.DistributedStrategy()
        fleet.init(is_collective=True, strategy=strategy)
        optimizer = fleet.distributed_optimizer(optimizer)
        dy_model = fleet.distributed_model(dy_model)

    logger.info("read data")
    train_dataloader = create_data_loader(config=config, place=place)

    last_epoch_id = config.get("last_epoch", -1)
    step_num = 0

    for epoch_id in range(last_epoch_id + 1, epochs):
        # set train mode
        dy_model.train()
        metric_list, metric_list_name = dy_model_class.create_metrics()
        # auc_metric = paddle.metric.Auc("ROC")
        epoch_begin = time.time()
        interval_begin = time.time()
        train_reader_cost = 0.0
        train_run_cost = 0.0
        total_samples = 0
        reader_start = time.time()

        for batch_id, batch in enumerate(train_dataloader()):
            train_reader_cost += time.time() - reader_start
            optimizer.clear_grad()
            train_start = time.time()
            batch_size = len(batch[0])

            loss, metric_list, tensor_print_dict = dy_model_class.train_forward(
                dy_model, metric_list, batch, config)

            loss.backward()
            optimizer.step()
            train_run_cost += time.time() - train_start
            total_samples += batch_size

            if batch_id % print_interval == 0:
                metric_str = ""
                for metric_id in range(len(metric_list_name)):
                    metric_str += (
                        metric_list_name[metric_id] +
                        ":{:.6f}, ".format(metric_list[metric_id].accumulate()))
                    if use_visual:
                        log_visual.add_scalar(
                            tag="train/" + metric_list_name[metric_id],
                            step=step_num,
                            value=metric_list[metric_id].accumulate())
                tensor_print_str = ""
                if tensor_print_dict is not None:
                    for var_name, var in tensor_print_dict.items():
                        tensor_print_str += ("{}:".format(var_name) +
                                             str(var.numpy()) + ",")
                        if use_visual:
                            log_visual.add_scalar(tag="train/" + var_name,
                                                  step=step_num,
                                                  value=var.numpy())
                logger.info(
                    "epoch: {}, batch_id: {}, ".format(epoch_id, batch_id) +
                    metric_str + tensor_print_str +
                    " avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, "
                    "avg_samples: {:.5f}, ips: {:.5f} ins/s".format(
                        train_reader_cost / print_interval,
                        (train_reader_cost + train_run_cost) / print_interval,
                        total_samples / print_interval,
                        total_samples / (train_reader_cost + train_run_cost)))
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0
            reader_start = time.time()
            step_num = step_num + 1

        metric_str = ""
        for metric_id in range(len(metric_list_name)):
            metric_str += (
                metric_list_name[metric_id] +
                ": {:.6f},".format(metric_list[metric_id].accumulate()))
        tensor_print_str = ""
        if tensor_print_dict is not None:
            for var_name, var in tensor_print_dict.items():
                tensor_print_str += ("{}:".format(var_name) +
                                     str(var.numpy()) + ",")

        logger.info("epoch: {} done, ".format(epoch_id) + metric_str +
                    tensor_print_str +
                    " epoch time: {:.2f} s".format(time.time() - epoch_begin))

        if use_fleet:
            trainer_id = paddle.distributed.get_rank()
            if trainer_id == 0:
                save_model(dy_model, optimizer, model_save_path, epoch_id,
                           prefix='rec')
        else:
            save_model(dy_model, optimizer, model_save_path, epoch_id,
                       prefix='rec')
def do_train(args):
    paddle.set_device(args.device)
    nranks = paddle.distributed.get_world_size()
    strategy = fleet.DistributedStrategy()
    strategy.hybrid_configs = {
        "dp_degree": args.dp_degree,
        "mp_degree": args.mp_degree,
        "pp_degree": args.pp_degree,
        "sharding_degree": args.sharding_degree
    }

    accumulate_steps = args.local_batch_size // args.micro_batch_size
    strategy.pipeline_configs = {
        "accumulate_steps": accumulate_steps,
        "micro_batch_size": args.micro_batch_size
    }

    # set seed control for tensor parallel
    strategy.tensor_parallel_configs = {"tensor_init_seed": args.seed}

    fleet.init(is_collective=True, strategy=strategy)

    # obtain the rank information of hybrid parallel
    hcg = fleet.get_hybrid_communicate_group()
    global_rank = hcg.get_global_rank()
    mp_rank = hcg.get_model_parallel_rank()
    pp_rank = hcg.get_stage_id()
    dp_rank = hcg.get_data_parallel_rank()
    sharding_rank = hcg.get_sharding_parallel_rank()

    # sharding stage2/3 does not support hybrid parallel yet
    if args.sharding_stage in [2, 3]:
        assert args.dp_degree == args.mp_degree == args.pp_degree == 1, \
            "sharding stage2/3 will support hybrid parallel later"

    sharding_size = hcg.get_sharding_parallel_world_size()
    data_world_rank = dp_rank * sharding_size + sharding_rank
    data_world_size = args.dp_degree * args.sharding_degree
    local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0))

    # seed control in hybrid parallel
    set_hyrbid_parallel_seed(args.seed, data_world_rank, mp_rank, pp_rank)

    default_global_tokens_num = args.global_batch_size * args.max_seq_len

    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    # Define log writer
    log_writer_path = os.path.join(
        args.output_dir, "train_log",
        "{}_globalbsz_{}_pure_fp16_{}_recompute_{}_card_{}".format(
            args.model_name_or_path, args.global_batch_size,
            args.use_pure_fp16, False, global_rank).lower())

    if os.path.exists(log_writer_path):
        import shutil
        shutil.rmtree(log_writer_path)

    log_writer = LogWriter(log_writer_path)

    pretrained_models_list = list(
        model_class.pretrained_init_configuration.keys())

    if args.model_name_or_path in pretrained_models_list:
        model_config = model_class.pretrained_init_configuration[
            args.model_name_or_path]
        model_config["hidden_dropout_prob"] = args.hidden_dropout_prob
        model_config[
            "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob
        model_config['num_partitions'] = args.mp_degree
        model_config['use_recompute'] = args.use_recompute
        if args.pp_degree == 1:
            model = GPTForPretraining(GPTModel(**model_config))
        else:
            model_config['topology'] = hcg.topology()
            model = GPTForPretrainingPipe(**model_config)
    else:
        model = GPTForPretraining.from_pretrained(
            args.model_name_or_path,
            hidden_dropout_prob=args.hidden_dropout_prob,
            attention_probs_dropout_prob=args.attention_probs_dropout_prob)

    # Create the criterion for the gpt model
    criterion = GPTPretrainingCriterion()

    if args.decay_steps is None:
        args.decay_steps = args.max_steps
    warmup_step = args.warmup_rate * args.decay_steps

    lr_scheduler = None
    if args.lr_decay_style == "none":
        lr_scheduler = None
    elif args.lr_decay_style == "cosine":
        lr_scheduler = lr.CosineAnnealingWithWarmupDecay(
            max_lr=args.max_lr,
            min_lr=args.min_lr,
            warmup_step=warmup_step,
            decay_step=args.decay_steps)

    clip = None
    if args.grad_clip > 0:
        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=args.grad_clip)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]

    if args.sharding_stage == 1 and args.sharding_degree > 1:
        optimizer = DygraphShardingOptimizer(
            hcg=fleet.get_hybrid_communicate_group(),
            user_defined_strategy=strategy,
            params=model.parameters(),
            inner_optimizer_class=paddle.optimizer.AdamW,
            learning_rate=lr_scheduler
            if lr_scheduler is not None else args.max_lr,
            beta1=args.adam_beta1,
            beta2=args.adam_beta2,
            epsilon=args.adam_epsilon,
            weight_decay=args.weight_decay,
            grad_clip=clip,
            apply_decay_param_fun=lambda x: x in decay_params)
    else:
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler
            if lr_scheduler is not None else args.max_lr,
            beta1=args.adam_beta1,
            beta2=args.adam_beta2,
            epsilon=args.adam_epsilon,
            parameters=model.parameters(),
            weight_decay=args.weight_decay,
            grad_clip=clip,
            apply_decay_param_fun=lambda x: x in decay_params,
            # TODO: remove 'multi_precision' from the optimizer definition
            # and add it to 'paddle.amp.decorate'
            multi_precision=args.use_pure_fp16)

    if args.use_pure_fp16:
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
        if args.sharding_stage not in [2, 3]:
            scaler = fleet.distributed_scaler(scaler)
        # level O2 means converting the network to FP16
        model = paddle.amp.decorate(models=model,
                                    level='O2',
                                    save_dtype='float32')

    # wrap sharding stage2/3 and add the collective group
    # TODO(Baibaifan): combine ShardingStage1/2/3 and fleet.distributed_model in the future
    if args.sharding_stage in [2, 3]:
        scaler = scaler if args.use_pure_fp16 else None
        model, optimizer, scaler = wrap_sharding_2_3(model, optimizer, scaler,
                                                     args.sharding_offload)
    elif paddle.distributed.get_world_size() > 1:
        model = fleet.distributed_model(model)
        optimizer = fleet.distributed_optimizer(optimizer)

    if args.model_name_or_path not in pretrained_models_list:
        logger.info("Try to load checkpoint from %s " % args.model_name_or_path)
        opt_path = os.path.join(args.model_name_or_path, "model_state.pdopt")
        if os.path.exists(opt_path):
            opt_dict = paddle.load(opt_path)
            optimizer.set_state_dict(opt_dict)
        else:
            logger.warning("No optimizer checkpoint file found in %s." %
                           opt_path)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        files = get_train_data_file(args)
        files.sort()
        num_files = len(files)
        for f_id in range(num_files):
            data_file = files[f_id]
            train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset(
                args, [data_file],
                local_rank=local_rank,
                data_world_size=data_world_size,
                data_world_rank=data_world_rank,
                eos_id=tokenizer.eos_token_id)
            # Bug fix: if valid_data_loader is not called here, the later
            # enumerate would construct it many times and start a new random
            # dataloader each time.
            valid_data_loader = valid_data_loader()
            test_data_loader = test_data_loader()

            # time count
            train_reader_cost = 0.0
            train_run_cost = 0.0
            reader_start = time.time()
            for step, batch in enumerate(train_data_loader()):
                train_reader_cost += time.time() - reader_start
                train_start = time.time()

                global_step += 1
                tokens, loss_mask, position_ids, labels = batch

                loss_mask.stop_gradient = True
                labels.stop_gradient = True
                position_ids.stop_gradient = True

                if args.pp_degree == 1:
                    # In ParallelMode of DataParallel, 'no_sync' can be used
                    # to improve performance with gradient accumulation.
                    loss = 0.0
                    for i in range(accumulate_steps):
                        start_index = i * args.micro_batch_size
                        end_index = start_index + args.micro_batch_size
                        with paddle.amp.auto_cast(
                                args.use_pure_fp16,
                                custom_black_list=[
                                    "reduce_sum",
                                    "c_softmax_with_cross_entropy",
                                    "elementwise_div"
                                ],
                                level='O2'):
                            preds = model(
                                tokens[start_index:end_index, :],
                                position_ids[start_index:end_index, :])
                            loss_mbs = criterion(
                                preds, labels[start_index:end_index, :],
                                loss_mask[start_index:end_index, :])
                        loss_mbs = loss_mbs / accumulate_steps
                        if args.use_pure_fp16:
                            scaler.scale(loss_mbs).backward()
                        else:
                            loss_mbs.backward()
                        loss = loss + loss_mbs

                    if args.use_pure_fp16:
                        if args.sharding_stage in [2, 3]:
                            scaler.step(optimizer)
                            scaler.update()
                        else:
                            scaler.minimize(optimizer, loss)
                    else:
                        optimizer.step()

                    if lr_scheduler is not None:
                        lr_scheduler.step()

                    optimizer.clear_grad()
                else:
                    data = [(tokens, position_ids), (labels, loss_mask)]
                    with paddle.amp.auto_cast(
                            args.use_pure_fp16,
                            custom_black_list=[
                                "reduce_sum", "c_softmax_with_cross_entropy",
                                "elementwise_div"
                            ],
                            level='O2'):
                        loss = model.train_batch(
                            data,
                            optimizer=optimizer,
                            lr_scheduler=lr_scheduler,
                            scaler=scaler if args.use_pure_fp16 else None)

                # Sync for profile time; deleting it may be a little faster
                paddle.device.cuda.synchronize()
                train_run_cost += time.time() - train_start
                # Profile for model benchmark
                profiler.add_profiler_step(args.profiler_options)

                if global_step % args.logging_freq == 0:
                    avg_loss = loss.numpy()
                    speed = args.logging_freq / (train_reader_cost +
                                                 train_run_cost)
                    avg_reader_cost = train_reader_cost / args.logging_freq

                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %.9f, "
                        "avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, "
                        "speed: %.2f step/s, ips: %.0f tokens/s, "
                        "ips_per_card: %.0f tokens/s, learning rate: %.5e" %
                        (global_step, epoch, step, avg_loss, avg_reader_cost,
                         1. / speed, speed, speed * default_global_tokens_num,
                         speed * default_global_tokens_num / nranks,
                         optimizer.get_lr()))
                    log_writer.add_scalar("loss", float(loss), global_step)
                    log_writer.add_scalar("learning_rate", optimizer.get_lr(),
                                          global_step)

                    tic_train = time.time()
                    train_reader_cost = 0.0
                    train_run_cost = 0.0

                if args.check_accuracy:
                    if global_step >= args.max_steps:
                        return
                    else:
                        continue

                if global_step % args.eval_freq == 0:
                    # Since the valid data is broadcast to all devices, the
                    # evaluation runs on every device.
                    run_evaluate(args, valid_data_loader, model, criterion,
                                 args.eval_iters, log_writer, global_step,
                                 epoch, "valid")

                # TODO: 1. merge parameters while saving the model.
                #       2. ensure that the model is saved and loaded correctly.
                # Only dp_rank = 0 saves the model.
                if (global_step % args.save_steps == 0
                        or global_step >= args.max_steps) and dp_rank == 0:
                    model_to_save = model._layers if paddle.distributed.get_world_size(
                    ) > 1 and args.sharding_stage not in [2, 3] else model
                    output_dir = os.path.join(args.output_dir,
                                              "step_%d" % global_step)
                    os.makedirs(output_dir, exist_ok=True)

                    logger.info("Save model to %s" % output_dir)

                    if args.pp_degree > 1:
                        if mp_rank == 0 and sharding_rank == 0 and pp_rank == 0:
                            tokenizer.save_pretrained(output_dir)
                        model_to_save.save_state_dict(output_dir)
                        paddle.save(
                            optimizer.state_dict(),
                            os.path.join(
                                output_dir,
                                "model_state_mp_{:0>2d}_sharding_{:0>2d}_pp_{:0>2d}.pdopt"
                                .format(mp_rank, sharding_rank, pp_rank)))
                    else:
                        if args.sharding_stage == 3:
                            # If parameters need to be moved to cpu, pass convert2cpu=True
                            model_to_save.get_all_parameters(convert2cpu=False)
                        if mp_rank == 0 and sharding_rank == 0:
                            tokenizer.save_pretrained(output_dir)
                            model_to_save.save_pretrained(output_dir)
                        paddle.save(
                            optimizer.state_dict(),
                            os.path.join(
                                output_dir,
                                "model_state_mp_{:0>2d}_sharding_{:0>2d}.pdopt".
                                format(mp_rank, sharding_rank)))

                if global_step >= args.max_steps:
                    run_evaluate(args, test_data_loader, model, criterion,
                                 args.test_iters, log_writer, global_step,
                                 epoch, "test")
                    logger.info("The training process is complete.")
                    del train_data_loader
                    return

                reader_start = time.time()

            del train_data_loader
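# A hedged launch sketch for the hybrid-parallel script above. The script name
# `run_pretrain.py` and the 8-GPU 2x2x2 layout are assumptions; the flag names
# mirror the args consumed by do_train. The product of the parallel degrees
# must equal the number of launched processes:
#
#   python -m paddle.distributed.launch --gpus "0,1,2,3,4,5,6,7" run_pretrain.py \
#       --dp_degree 2 --mp_degree 2 --pp_degree 2 --sharding_degree 1 \
#       --micro_batch_size 8 --local_batch_size 32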
def train(self, validate=False):
    assert self.mode == 'train', "Model not in 'train' mode"
    Init_mark = False

    model = self.model
    if self.cfg.get('fleet', False):
        model = fleet.distributed_model(model)
        self.optimizer = fleet.distributed_optimizer(self.optimizer)
    elif self._nranks > 1:
        find_unused_parameters = self.cfg[
            'find_unused_parameters'] if 'find_unused_parameters' in self.cfg else False
        model = paddle.DataParallel(
            self.model, find_unused_parameters=find_unused_parameters)

    # initialize fp16
    if self.cfg.get('fp16', False):
        scaler = amp.GradScaler(enable=self.cfg.use_gpu,
                                init_loss_scaling=1024)

    self.status.update({
        'epoch_id': self.start_epoch,
        'step_id': 0,
        'steps_per_epoch': len(self.loader)
    })

    self.status['batch_time'] = stats.SmoothedValue(self.cfg.log_iter,
                                                    fmt='{avg:.4f}')
    self.status['data_time'] = stats.SmoothedValue(self.cfg.log_iter,
                                                   fmt='{avg:.4f}')
    self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)

    if self.cfg.get('print_flops', False):
        self._flops(self.loader)
    profiler_options = self.cfg.get('profiler_options', None)

    self._compose_callback.on_train_begin(self.status)

    for epoch_id in range(self.start_epoch, self.cfg.epoch):
        self.status['mode'] = 'train'
        self.status['epoch_id'] = epoch_id
        self._compose_callback.on_epoch_begin(self.status)
        self.loader.dataset.set_epoch(epoch_id)
        model.train()
        iter_tic = time.time()
        for step_id, data in enumerate(self.loader):
            self.status['data_time'].update(time.time() - iter_tic)
            self.status['step_id'] = step_id
            profiler.add_profiler_step(profiler_options)
            self._compose_callback.on_step_begin(self.status)
            data['epoch_id'] = epoch_id

            if self.cfg.get('fp16', False):
                with amp.auto_cast(enable=self.cfg.use_gpu):
                    # model forward
                    outputs = model(data)
                    loss = outputs['loss']

                # model backward
                scaled_loss = scaler.scale(loss)
                scaled_loss.backward()
                # in dygraph mode, optimizer.minimize is equal to optimizer.step
                scaler.minimize(self.optimizer, scaled_loss)
            else:
                # model forward
                outputs = model(data)
                loss = outputs['loss']
                # model backward
                loss.backward()
                self.optimizer.step()
            curr_lr = self.optimizer.get_lr()
            self.lr.step()
            if self.cfg.get('unstructured_prune'):
                self.pruner.step()
            self.optimizer.clear_grad()
            self.status['learning_rate'] = curr_lr

            if self._nranks < 2 or self._local_rank == 0:
                self.status['training_staus'].update(outputs)

            self.status['batch_time'].update(time.time() - iter_tic)
            self._compose_callback.on_step_end(self.status)
            if self.use_ema:
                self.ema.update(self.model)
            iter_tic = time.time()

        # apply ema weights on the model
        if self.use_ema:
            weight = copy.deepcopy(self.model.state_dict())
            self.model.set_dict(self.ema.apply())
        if self.cfg.get('unstructured_prune'):
            self.pruner.update_params()

        self._compose_callback.on_epoch_end(self.status)

        if validate and (self._nranks < 2 or self._local_rank == 0) \
                and ((epoch_id + 1) % self.cfg.snapshot_epoch == 0 \
                     or epoch_id == self.end_epoch - 1):
            if not hasattr(self, '_eval_loader'):
                # build evaluation dataset and loader
                self._eval_dataset = self.cfg.EvalDataset
                self._eval_batch_sampler = \
                    paddle.io.BatchSampler(
                        self._eval_dataset,
                        batch_size=self.cfg.EvalReader['batch_size'])
                self._eval_loader = create('EvalReader')(
                    self._eval_dataset,
                    self.cfg.worker_num,
                    batch_sampler=self._eval_batch_sampler)
            # if validation in training is enabled, metrics should be re-initialized;
            # Init_mark makes sure this code only executes once
            if validate and Init_mark == False:
                Init_mark = True
                self._init_metrics(validate=validate)
                self._reset_metrics()
            with paddle.no_grad():
                self.status['save_best_model'] = True
                self._eval_with_loader(self._eval_loader)

        # restore the original weights on the model
        if self.use_ema:
            self.model.set_dict(weight)

    self._compose_callback.on_train_end(self.status)
    verbose=True)

clip = paddle.nn.ClipGradByValue(min=-CLIP, max=CLIP)
strategy = fleet.DistributedStrategy()
OPTIMIZER_decay = optim.Momentum(parameters=backbone_paras_wo_bn +
                                 head_paras_wo_bn,
                                 learning_rate=scheduler,
                                 weight_decay=WEIGHT_DECAY,
                                 momentum=MOMENTUM)
OPTIMIZER_decay = fleet.distributed_optimizer(optimizer=OPTIMIZER_decay,
                                              strategy=strategy)
OPTIMIZER = optim.Momentum(parameters=backbone_paras_only_bn,
                           learning_rate=scheduler,
                           momentum=MOMENTUM)
OPTIMIZER = fleet.distributed_optimizer(optimizer=OPTIMIZER, strategy=strategy)
BACKBONE = fleet.distributed_model(BACKBONE)
HEAD = fleet.distributed_model(HEAD)

logger.info("=" * 60)
logger.info(OPTIMIZER)
logger.info("Optimizer Generated")
logger.info("=" * 60)

# optionally resume from a checkpoint
if BACKBONE_RESUME_ROOT and HEAD_RESUME_ROOT:
    logger.info("=" * 60)
    if os.path.isfile(BACKBONE_RESUME_ROOT) and os.path.isfile(
            HEAD_RESUME_ROOT):
        logger.info("Loading Backbone Checkpoint '{}'".format(
            BACKBONE_RESUME_ROOT))
        load_weight(model=BACKBONE, weight_path=BACKBONE_RESUME_ROOT)
        logger.info(
ema = None
if cfg.use_ema:
    ema = ExponentialMovingAverage(model, cfg.ema_decay)
    ema.register()

# Distributed training and mixed-precision training.
# If in doubt, see the docs:
# https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/06_distributed_training/cluster_quick_start_cn.html
_nranks = dist.get_world_size()
_local_rank = dist.get_rank()
use_fleet = cfg.train_cfg.get('fleet', False)
use_fp16 = cfg.train_cfg.get('fp16', False)
if use_fleet:
    # initialize the Fleet environment
    fleet.init(is_collective=True)
    # obtain the distributed model through the Fleet API to support
    # distributed training
    model = fleet.distributed_model(model)
    optimizer = fleet.distributed_optimizer(optimizer)
elif _nranks > 1:
    find_unused_parameters = cfg.train_cfg['find_unused_parameters'] \
        if 'find_unused_parameters' in cfg.train_cfg else False
    model = paddle.DataParallel(model,
                                find_unused_parameters=find_unused_parameters)

if use_fp16:
    # scaler = amp.GradScaler(enable=use_gpu, init_loss_scaling=2.**16,
    #                         incr_every_n_steps=2000, use_dynamic_loss_scaling=True)
    scaler = amp.GradScaler(enable=use_gpu, init_loss_scaling=1024)

print('\n=============== fleet and fp16 ===============')
print('use_fleet: %d' % use_fleet)
print('use_fp16: %d' % use_fp16)
print('_nranks: %d' % _nranks)
def test_pp_model(self):
    hcg = fleet.get_hybrid_communicate_group()
    world_size = hcg.get_model_parallel_world_size()
    dp_id = hcg.get_data_parallel_rank()
    pp_id = hcg.get_stage_id()
    rank_id = dist.get_rank()
    set_random_seed(1024, dp_id, rank_id)

    # construct model a
    model_a = SimpleNet()
    scheduler_a = paddle.optimizer.lr.PiecewiseDecay(
        boundaries=[2, 3, 4], values=[0.01, 0.02, 0.03, 0.04], verbose=True)
    optimizer_a = paddle.optimizer.SGD(learning_rate=scheduler_a,
                                       parameters=model_a.parameters())

    model_b = SimpleNetPipe(topology=hcg.topology())
    scheduler_b = paddle.optimizer.lr.PiecewiseDecay(
        boundaries=[2, 3, 4], values=[0.01, 0.02, 0.03, 0.04], verbose=True)
    optimizer_b = paddle.optimizer.SGD(learning_rate=scheduler_b,
                                       parameters=model_b.parameters())
    model_b = fleet.distributed_model(model_b)
    optimizer_b = fleet.distributed_optimizer(optimizer_b)

    param_len = len(model_a.parameters())

    parameters = []
    for param in model_a.parameters():
        parameters.append(param.numpy())

    model_b_params = model_b.parameters()

    if pp_id == 0:
        model_b_params[0].set_value(parameters[2])
        model_b_params[1].set_value(parameters[0])
    else:
        model_b_params[0].set_value(parameters[2])
        model_b_params[1].set_value(parameters[1])

    for step in range(5):
        x1_data = np.random.randint(0, vocab_size, size=[batch_size, 1])
        x2_data = np.random.randint(0, vocab_size, size=[batch_size, 1])
        y1_data = np.random.randint(0, hidden_size, size=[batch_size, 1])

        x1 = paddle.to_tensor(x1_data)
        x2 = paddle.to_tensor(x2_data)
        y1 = paddle.to_tensor(y1_data)

        x1.stop_gradient = True
        x2.stop_gradient = True
        y1.stop_gradient = True

        loss_a = model_a(x1, x2, y1)
        loss_a.backward()
        optimizer_a.step()
        optimizer_a.clear_grad()
        scheduler_a.step()

        loss_b = model_b.train_batch([(x1, x2), (y1, )], optimizer_b,
                                     scheduler_b)

        print("loss", loss_a.numpy(), loss_b.numpy())
        np.testing.assert_allclose(loss_a.numpy(), loss_b.numpy())
def do_train(args):
    paddle.set_device(args.device)

    worker_index = paddle.distributed.get_rank()
    worker_num = paddle.distributed.get_world_size()
    local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0))

    if worker_num > 1:
        paddle.distributed.init_parallel_env()

    if args.dp_degree * args.sharding_degree == 1:
        args.dp_degree = worker_num
        args.sharding_degree = 1

    args_post_process(args, worker_num)

    logger.info('{:20}:{}'.format("paddle commit id", paddle.version.commit))
    for arg in vars(args):
        logger.info('{:20}:{}'.format(arg, getattr(args, arg)))

    strategy = fleet.DistributedStrategy()
    strategy.hybrid_configs = {
        "dp_degree": args.dp_degree,
        "mp_degree": 1,
        "pp_degree": 1,
        "sharding_degree": 1
    }

    fleet.init(is_collective=True, strategy=strategy)
    hcg = fleet.get_hybrid_communicate_group()

    # Create the random seed for the worker
    set_seed(args)

    assert args.dp_degree * args.sharding_degree == worker_num, \
        "The product of degree num should be equal to worker_num."

    # Create the log writer
    log_writer = None
    if worker_index == 0:
        log_writer = LogWriter(os.path.join(args.output_dir, default_logdir()))

    # Define the input data in the static mode
    base_class, model_class, criterion_class, tokenizer_class = MODEL_CLASSES[
        args.model_type]
    pretrained_models_list = list(
        model_class.pretrained_init_configuration.keys())

    # load config from the checkpoint
    global_step = 0
    consumed_samples = 0
    checkpoint_dir = os.path.join(args.output_dir, "model_last")
    if os.path.exists(checkpoint_dir):
        if os.path.isfile(os.path.join(checkpoint_dir, "./config.yml")):
            with open(os.path.join(checkpoint_dir, "./config.yml"), "r") as f:
                step_config = yaml.load(f, Loader=yaml.FullLoader)
                assert step_config[
                    "global_batch_size"] == args.global_batch_size, \
                    "Please ensure checkpoint global batch size is the same. Folder: {}".format(
                        checkpoint_dir)
                consumed_samples = step_config["consumed_samples"]
                global_step = step_config["global_step"]

    if args.model_name_or_path in pretrained_models_list:
        model_config = model_class.pretrained_init_configuration[
            args.model_name_or_path]
        model_config["hidden_dropout_prob"] = args.hidden_dropout_prob
        model_config[
            "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob
        model = model_class(base_class(**model_config))
    else:
        model = model_class.from_pretrained(
            args.model_name_or_path,
            hidden_dropout_prob=args.hidden_dropout_prob,
            attention_probs_dropout_prob=args.attention_probs_dropout_prob)

    criterion = criterion_class()

    if worker_index == 0:
        # log the model config and args
        model_config_json = json.dumps(model.get_model_config(),
                                       ensure_ascii=False,
                                       indent=2)
        log_writer.add_text("model_config", model_config_json)
        args_dict = {"paddle commit id": str(paddle.version.commit)}
        for arg in vars(args):
            args_dict[arg] = str(getattr(args, arg))
        log_writer.add_text("args", json.dumps(args_dict, indent=2))

    # Create the learning_rate scheduler and optimizer
    if args.decay_steps is None:
        args.decay_steps = args.max_steps

    assert args.warmup_rate <= 1.0 and args.warmup_rate >= 0.0, \
        "warmup_rate should be in [0, 1]"
    args.warmup_steps = args.warmup_rate * args.max_steps

    lr_scheduler = LinearAnnealingWithWarmupDecay(args.max_lr,
                                                  args.min_lr,
                                                  warmup_step=args.warmup_steps,
                                                  decay_step=args.decay_steps,
                                                  last_epoch=global_step)

    clip = None
    if args.grad_clip > 0:
        clip = paddle.fluid.clip.GradientClipByGlobalNorm(
            clip_norm=args.grad_clip)

    decay_param = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]

    logger.info("Using paddle.optimizer.AdamW.")
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler
        if lr_scheduler is not None else args.max_lr,
        beta1=args.adam_beta1,
        beta2=args.adam_beta2,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=clip,
        apply_decay_param_fun=lambda x: x in decay_param,
        multi_precision=args.use_amp)

    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
        scaler = fleet.distributed_scaler(scaler)
        model = paddle.amp.decorate(models=model,
                                    level='O2',
                                    save_dtype='float32')

    if paddle.distributed.get_world_size() > 1:
        model = fleet.distributed_model(model)
        optimizer = fleet.distributed_optimizer(optimizer)

    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    data_file = get_train_data_file(args)
    train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset(
        args,
        data_file,
        tokenizer,
        data_world_size=worker_num,
        data_world_rank=worker_index,
        max_seq_len=args.max_seq_len,
        current_step=global_step)

    # load checkpoint vars
    if os.path.exists(checkpoint_dir):
        if os.path.isfile(os.path.join(checkpoint_dir, "./config.yml")):
            logger.info("Try to load checkpoint from %s " % checkpoint_dir)
            opt_path = os.path.join(checkpoint_dir, "model_state.pdopt")
            params_path = os.path.join(checkpoint_dir, "model_state.pdparams")
            if os.path.exists(opt_path):
                opt_dict = paddle.load(opt_path)
                optimizer.set_state_dict(opt_dict)
                model_dict = paddle.load(params_path)
                model.set_state_dict(model_dict)
            else:
                logger.warning("No optimizer checkpoint file found in %s." %
                               opt_path)

    logger.info("Checkpoint loaded from global step: {}".format(global_step))

    loss_global = {
        "loss": paddle.to_tensor(0.0),
        "lm_loss": paddle.to_tensor(0.0),
        "sop_loss": paddle.to_tensor(0.0),
    }
    tic_train = time.time()
    while True:
        # If valid_data_loader is not called here, the later enumerate would
        # construct it many times and start a new random dataloader each time.
        valid_data_loader = valid_data_loader()
        test_data_loader = test_data_loader()

        # time count
        train_reader_cost = 0.0
        train_run_cost = 0.0
        reader_start = time.time()

        for step, batch in enumerate(train_data_loader()):
            train_reader_cost += time.time() - reader_start
            train_start = time.time()

            # 0. input_ids,
            # 1. segment_ids,
            # 2. input_mask,
            # 3. masked_lm_positions,
            # 4. masked_lm_labels,
            # 5. next_sentence_labels
            input_ids, segment_ids, input_mask, masked_lm_positions, \
                masked_lm_labels, next_sentence_labels = batch

            with paddle.amp.auto_cast(args.use_amp,
                                      custom_black_list=[
                                          "reduce_sum",
                                          "c_softmax_with_cross_entropy",
                                          "elementwise_div"
                                      ],
                                      level='O2'):
                # run the ernie pretraining forward pass
                prediction_scores, seq_relationship_score = model(
                    input_ids=input_ids,
                    token_type_ids=segment_ids,
                    position_ids=None,
                    attention_mask=input_mask,
                    masked_positions=masked_lm_positions)

                lm_loss, sop_loss = criterion(prediction_scores,
                                              seq_relationship_score,
                                              masked_lm_labels,
                                              next_sentence_labels)
                loss = lm_loss + sop_loss

            if args.use_amp:
                scaler.scale(loss).backward()
                scaler.minimize(optimizer, loss)
            else:
                loss.backward()
                optimizer.step()

            optimizer.clear_grad()
            train_run_cost += time.time() - train_start

            # Skip until accumulate_steps micro steps form a global step
            if (step + 1) % args.accumulate_steps != 0:
                continue

            global_step += 1

            loss_global["loss"] += loss.detach()
            loss_global["lm_loss"] += lm_loss.detach()
            loss_global["sop_loss"] += sop_loss.detach()

            if global_step % args.logging_freq == 0:
                log_info_dict = dict()
                log_info_dict["global_step"] = global_step
                for k, v in loss_global.items():
                    log_info_dict[k] = all_gather(v) / args.logging_freq
                    v.subtract_(v)
                if worker_index == 0:
                    speed = args.logging_freq / (time.time() - tic_train)
                    log_info_dict["learning_rate"] = lr_scheduler.get_lr()
                    log_info_dict["steps_per_second"] = speed
                    log_info_dict[
                        "samples_per_second"] = speed * args.global_batch_size

                    for k, v in log_info_dict.items():
                        log_writer.add_scalar("train/%s" % k, v, global_step)

                    common_loginfo = (
                        "global step %d, loss: %.9f, lm_loss: %.6f, "
                        "sop_loss: %.6f, speed: %.2f steps/s, ips: %.2f seqs/s, "
                        "learning rate: %.5e" %
                        (global_step, log_info_dict["loss"],
                         log_info_dict["lm_loss"], log_info_dict["sop_loss"],
                         speed, log_info_dict["samples_per_second"],
                         log_info_dict["learning_rate"]))
                    addition_info = ""
                    if args.use_amp:
                        amp_info = {
                            "loss_scaling": scaler._scale.item(),
                            "incr_count": scaler._incr_count,
                            "decr_count": scaler._decr_count
                        }
                        addition_info = ", ".join(
                            "%s: %d" % (k, v) for k, v in amp_info.items())
                        addition_info = " " + addition_info
                        for k, v in amp_info.items():
                            log_writer.add_scalar("amp/%s" % k, v, global_step)
                    logger.info(common_loginfo + addition_info)

                tic_train = time.time()

            if lr_scheduler is not None:
                lr_scheduler.step()

            if global_step % args.eval_freq == 0:
                # TODO, check the input data of validation
                run_evaluate(valid_data_loader,
                             model,
                             criterion,
                             args.eval_iters,
                             log_writer,
                             global_step,
                             args,
                             task_name="valid")
                tic_train = time.time()

            def save_ckpt(output_dir, model, tokenizer, args, global_step):
                step_config = {
                    "model_name": args.model_name_or_path,
                    "global_step": global_step,
                    "global_batch_size": args.global_batch_size,
                    "consumed_samples": global_step * args.global_batch_size,
                }

                logger.debug("saving models to {}".format(output_dir))
                model_to_save = model._layers if isinstance(
                    model, paddle.DataParallel) else model

                model_to_save.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                paddle.save(optimizer.state_dict(),
                            os.path.join(output_dir, "model_state.pdopt"))

                with open(os.path.join(output_dir, "config.yml"), "w") as f:
                    yaml.dump(step_config,
                              f,
                              encoding='utf-8',
                              allow_unicode=True)

            if global_step % args.save_steps == 0 or global_step >= args.max_steps:
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % global_step)
                if worker_index == 0:
                    save_ckpt(output_dir, model, tokenizer, args, global_step)
                if worker_num > 1:
                    paddle.distributed.barrier()
                tic_train = time.time()

            if global_step % args.checkpoint_steps == 0:
                output_dir = os.path.join(args.output_dir, "model_last")
                if worker_index == 0:
                    if not os.path.exists(output_dir):
                        os.mkdir(output_dir)
                    output_dir_bak = os.path.join(args.output_dir,
                                                  "model_last_bak")
                    if os.path.exists(output_dir):
                        if os.path.exists(output_dir_bak):
                            shutil.rmtree(output_dir_bak)
                        shutil.move(output_dir, output_dir_bak)
                        os.mkdir(output_dir)
                    save_ckpt(output_dir, model, tokenizer, args, global_step)
                if worker_num > 1:
                    paddle.distributed.barrier()

            if global_step >= args.max_steps:
                run_evaluate(test_data_loader,
                             model,
                             criterion,
                             args.test_iters,
                             log_writer,
                             global_step,
                             args,
                             task_name="test")
                del train_data_loader
                return
def train(self, validate=False):
    assert self.mode == 'train', "Model not in 'train' mode"

    # if no given weights loaded, load backbone pretrain weights as default
    if not self._weights_loaded:
        self.load_weights(self.cfg.pretrain_weights)

    model = self.model
    if self.cfg.fleet:
        model = fleet.distributed_model(model)
        self.optimizer = fleet.distributed_optimizer(
            self.optimizer).user_defined_optimizer
    elif self._nranks > 1:
        model = paddle.DataParallel(self.model)

    # initialize fp16
    if self.cfg.fp16:
        scaler = amp.GradScaler(enable=self.cfg.use_gpu,
                                init_loss_scaling=1024)

    self.status.update({
        'epoch_id': self.start_epoch,
        'step_id': 0,
        'steps_per_epoch': len(self.loader)
    })

    self.status['batch_time'] = stats.SmoothedValue(self.cfg.log_iter,
                                                    fmt='{avg:.4f}')
    self.status['data_time'] = stats.SmoothedValue(self.cfg.log_iter,
                                                   fmt='{avg:.4f}')
    self.status['training_staus'] = stats.TrainingStats(self.cfg.log_iter)

    for epoch_id in range(self.start_epoch, self.cfg.epoch):
        self.status['mode'] = 'train'
        self.status['epoch_id'] = epoch_id
        self._compose_callback.on_epoch_begin(self.status)
        self.loader.dataset.set_epoch(epoch_id)
        model.train()
        iter_tic = time.time()
        for step_id, data in enumerate(self.loader):
            self.status['data_time'].update(time.time() - iter_tic)
            self.status['step_id'] = step_id
            self._compose_callback.on_step_begin(self.status)

            if self.cfg.fp16:
                with amp.auto_cast(enable=self.cfg.use_gpu):
                    # model forward
                    outputs = model(data)
                    loss = outputs['loss']

                # model backward
                scaled_loss = scaler.scale(loss)
                scaled_loss.backward()
                # in dygraph mode, optimizer.minimize is equal to optimizer.step
                scaler.minimize(self.optimizer, scaled_loss)
            else:
                # model forward
                outputs = model(data)
                loss = outputs['loss']
                # model backward
                loss.backward()
                self.optimizer.step()

            curr_lr = self.optimizer.get_lr()
            self.lr.step()
            self.optimizer.clear_grad()
            self.status['learning_rate'] = curr_lr

            if self._nranks < 2 or self._local_rank == 0:
                self.status['training_staus'].update(outputs)

            self.status['batch_time'].update(time.time() - iter_tic)
            self._compose_callback.on_step_end(self.status)
            iter_tic = time.time()

        self._compose_callback.on_epoch_end(self.status)

        if validate and (self._nranks < 2 or self._local_rank == 0) \
                and (epoch_id % self.cfg.snapshot_epoch == 0 \
                     or epoch_id == self.end_epoch - 1):
            if not hasattr(self, '_eval_loader'):
                # build evaluation dataset and loader
                self._eval_dataset = self.cfg.EvalDataset
                self._eval_batch_sampler = \
                    paddle.io.BatchSampler(
                        self._eval_dataset,
                        batch_size=self.cfg.EvalReader['batch_size'])
                self._eval_loader = create('EvalReader')(
                    self._eval_dataset,
                    self.cfg.worker_num,
                    batch_sampler=self._eval_batch_sampler)
            with paddle.no_grad():
                self._eval_with_loader(self._eval_loader)
def train_mlp(model,
              sharding_stage,
              batch_size=100,
              use_pure_fp16=False,
              accumulate_grad=False,
              opt_group=False,
              save_model=False):
    if sharding_stage == "dp":
        hcg = fleet.get_hybrid_communicate_group()
        group = hcg.get_check_parallel_group()
    else:
        group = paddle.distributed.new_group([0, 1])
    if opt_group:
        optimizer = optimizer_setting(model=model,
                                      use_pure_fp16=use_pure_fp16,
                                      opt_group=opt_group)
    else:
        optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16)

    if sharding_stage == 2:
        optimizer = ShardingOptimizerStage2(params=model.parameters(),
                                            optim=optimizer,
                                            group=group)
        model = ShardingStage2(model,
                               optimizer,
                               group=group,
                               buffer_max_size=2**21)
    else:
        optimizer = fleet.distributed_optimizer(optimizer)
        model = fleet.distributed_model(model)

    train_reader = paddle.batch(reader_decorator(),
                                batch_size=batch_size,
                                drop_last=True)

    train_loader = paddle.io.DataLoader.from_generator(capacity=32,
                                                       use_double_buffer=True,
                                                       iterable=True,
                                                       return_list=True,
                                                       use_multiprocess=True)
    train_loader.set_sample_list_generator(train_reader)

    if sharding_stage == 2:
        model.to(device="gpu")

    for eop in range(epoch):
        model.train()

        for batch_id, data in enumerate(train_loader()):
            img, label = data
            label.stop_gradient = True
            img.stop_gradient = True

            out = model(img)
            loss = paddle.nn.functional.cross_entropy(input=out, label=label)

            avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32))
            if batch_size == 20:
                avg_loss = avg_loss / 5
            avg_loss.backward()

            if not accumulate_grad:
                optimizer.step()
                optimizer.clear_grad()

        if accumulate_grad:
            optimizer.step()
            optimizer.clear_grad()

    if save_model:
        return model, optimizer
    return model.parameters()
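# A hedged usage sketch for train_mlp: comparing sharding stage2 against plain
# data parallel on identically initialized copies. The `MLP` layer class and
# the tolerance are assumptions about how such a test is driven; train_mlp
# itself is taken from above.
#
# mlp = MLP()
# state = mlp.state_dict()
# mlp_dp, mlp_st2 = MLP(), MLP()
# mlp_dp.set_state_dict(state)      # same initial weights for both runs
# mlp_st2.set_state_dict(state)
# dp_params = train_mlp(mlp_dp, sharding_stage="dp")
# st2_params = train_mlp(mlp_st2, sharding_stage=2)
# for p_dp, p_st2 in zip(dp_params, st2_params):
#     np.testing.assert_allclose(p_dp.numpy(), p_st2.numpy(), rtol=1e-6)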
if __name__ == '__main__':
    print(config.config)
    train_dataset = Loader(path=config.dataset)
    name_datasets = config.dataset.split('/')[-1]
    Recmodel = NGCF(config.config, train_dataset)
    if config.config['multigpu']:
        print('using fleet multigpu training', Recmodel)
        dist.init_parallel_env()
        Recmodel = paddle.DataParallel(Recmodel)
    if config.config['multicpu']:
        fleet.init(is_collective=True)
        optimizer = fleet.distributed_optimizer(optimizer)
        Recmodel = fleet.distributed_model(Recmodel)
        print('using fleet multicpu training', Recmodel)
    Neg_k = 1
    bpr = BPRLoss(Recmodel, config.config)

    f = open(f'logger/train_logger_{name_datasets}.txt', 'w')
    f_test = open(f'logger/test_logger_{name_datasets}.txt', 'w')
    for epoch in range(config.TRAIN_epochs):
        if epoch % 10 == 0:
            cprint("[TEST]")
            preds = predict(train_dataset,
                            Recmodel,
                            epoch,
                            multigpu=config.config['multigpu'],
                            multicpu=config.config['multicpu'])
            result = Test(train_dataset,
def test_pp_model(self):
    hcg = fleet.get_hybrid_communicate_group()
    world_size = hcg.get_model_parallel_world_size()
    dp_id = hcg.get_data_parallel_rank()
    pp_id = hcg.get_stage_id()
    rank_id = dist.get_rank()
    set_random_seed(1024, dp_id, rank_id)

    grad_clip = paddle.nn.ClipGradByGlobalNorm(1.0)

    # construct model a
    model_a = AlexNet(10)
    scheduler_a = paddle.optimizer.lr.PiecewiseDecay(boundaries=[2],
                                                     values=[0.001, 0.002],
                                                     verbose=True)
    optimizer_a = paddle.optimizer.SGD(learning_rate=scheduler_a,
                                       grad_clip=grad_clip,
                                       parameters=model_a.parameters())

    scaler_a = paddle.amp.GradScaler(init_loss_scaling=2**5)

    param_len = len(model_a.parameters())
    parameters = []
    for param in model_a.parameters():
        parameters.append(param.numpy())

    # construct model b
    model_b = AlexNetPipeDesc(num_stages=self.pipeline_parallel_size)
    scheduler_b = paddle.optimizer.lr.PiecewiseDecay(boundaries=[2],
                                                     values=[0.001, 0.002],
                                                     verbose=True)
    optimizer_b = paddle.optimizer.SGD(learning_rate=scheduler_b,
                                       grad_clip=grad_clip,
                                       parameters=model_b.parameters())

    model_b = fleet.distributed_model(model_b)
    optimizer_b = fleet.distributed_optimizer(optimizer_b)
    scaler_b = paddle.amp.GradScaler(init_loss_scaling=2**5)
    scaler_b = fleet.distributed_scaler(scaler_b)

    for idx, param in enumerate(model_b.parameters()):
        param.set_value(parameters[idx + pp_id * (param_len // 2)])

    # construct reader
    train_reader = paddle.batch(paddle.dataset.mnist.train(),
                                batch_size=batch_size,
                                drop_last=True)

    for step_id, data in enumerate(train_reader()):
        x_data = np.array([x[0] for x in data]).astype('float32').reshape(
            batch_size, 1, 28, 28)
        y_data = np.array([x[1] for x in data]).astype('int64').reshape(
            batch_size, 1)
        img = paddle.to_tensor(x_data)
        label = paddle.to_tensor(y_data)
        img.stop_gradient = True
        label.stop_gradient = True

        if step_id >= 5:
            return True

        with paddle.amp.auto_cast():
            loss_a = model_a(img, label)
        scaler_a.scale(loss_a).backward()
        scaler_a.minimize(optimizer_a, loss_a)
        optimizer_a.clear_grad()
        scheduler_a.step()

        with paddle.amp.auto_cast():
            loss_b = model_b.train_batch([img, label],
                                         optimizer_b,
                                         scheduler_b,
                                         scaler=scaler_b)

        print("loss: ", loss_a.numpy(), loss_b.numpy())
        np.testing.assert_allclose(loss_a.numpy(), loss_b.numpy(), rtol=5e-5)
def distill_train(distill_model, train_dataset, val_dataset=None, optimizer=None, save_dir='output', iters=10000, batch_size=2, resume_model=None, save_interval=1000, log_iters=10, num_workers=0, use_vdl=False, losses=None, distill_losses=None, keep_checkpoint_max=5, test_config=None, fp16=False): """ Launch training. Args: distill_model (nn.Layer): A distill model. train_dataset (paddle.io.Dataset): Used to read and process training datasets. val_dataset (paddle.io.Dataset, optional): Used to read and process validation datasets. optimizer (paddle.optimizer.Optimizer): The optimizer. save_dir (str, optional): The directory for saving the model snapshot. Default: 'output'. iters (int, optional): How may iters to train the model. Defualt: 10000. batch_size (int, optional): Mini batch size of one gpu or cpu. Default: 2. resume_model (str, optional): The path of resume model. save_interval (int, optional): How many iters to save a model snapshot once during training. Default: 1000. log_iters (int, optional): Display logging information at every log_iters. Default: 10. num_workers (int, optional): Num workers for data loader. Default: 0. use_vdl (bool, optional): Whether to record the data to VisualDL during training. Default: False. losses (dict): A dict including 'types' and 'coef'. The length of coef should equal to 1 or len(losses['types']). The 'types' item is a list of object of paddleseg.models.losses while the 'coef' item is a list of the relevant coefficient. distill_losses (dict): A dict including 'types' and 'coef'. The format of distill_losses is the same as losses. keep_checkpoint_max (int, optional): Maximum number of checkpoints to save. Default: 5. test_config(dict, optional): Evaluation config. fp16 (bool, optional): Whether to use amp. Not support for now. 
""" if fp16: raise RuntimeError("Distillation doesn't support amp training.") nranks = paddle.distributed.ParallelEnv().nranks local_rank = paddle.distributed.ParallelEnv().local_rank student_model = distill_model._student_models start_iter = 0 if resume_model is not None: start_iter = resume(student_model, optimizer, resume_model) if not os.path.isdir(save_dir): if os.path.exists(save_dir): os.remove(save_dir) os.makedirs(save_dir) if nranks > 1: strategy = fleet.DistributedStrategy() strategy.find_unused_parameters = True fleet.init(is_collective=True, strategy=strategy) optimizer = fleet.distributed_optimizer( optimizer) # The return is Fleet object ddp_distill_model = fleet.distributed_model(distill_model) batch_sampler = paddle.io.DistributedBatchSampler(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True) loader = paddle.io.DataLoader( train_dataset, batch_sampler=batch_sampler, num_workers=num_workers, return_list=True, worker_init_fn=worker_init_fn, ) if fp16: logger.info('use amp to train') scaler = paddle.amp.GradScaler(init_loss_scaling=1024) if use_vdl: from visualdl import LogWriter log_writer = LogWriter(save_dir) avg_loss = 0.0 avg_out_loss = 0.0 avg_out_distill_loss = 0.0 avg_feature_distill_loss = 0.0 avg_out_loss_list = [] iters_per_epoch = len(batch_sampler) best_mean_iou = -1.0 best_model_iter = -1 reader_cost_averager = TimeAverager() batch_cost_averager = TimeAverager() save_models = deque() batch_start = time.time() iter = start_iter while iter < iters: for data in loader: iter += 1 if iter > iters: break reader_cost_averager.record(time.time() - batch_start) images = data[0] labels = data[1].astype('int64') edges = None if len(data) == 3: edges = data[2].astype('int64') if hasattr(distill_model, 'data_format') and distill_model.data_format == 'NHWC': images = images.transpose((0, 2, 3, 1)) if fp16: with paddle.amp.auto_cast( enable=True, custom_white_list={ "elementwise_add", "batch_norm", "sync_batch_norm" }, custom_black_list={'bilinear_interp_v2'}): if nranks > 1: logits_list = ddp_distill_model(images) else: logits_list = distill_model(images) loss_list = loss_computation(logits_list=logits_list, labels=labels, losses=losses, edges=edges) loss = sum(loss_list) scaled = scaler.scale(loss) # scale the loss scaled.backward() # do backward if isinstance(optimizer, fleet.Fleet): scaler.minimize(optimizer.user_defined_optimizer, scaled) else: scaler.minimize(optimizer, scaled) # update parameters else: if nranks > 1: s_logits_list, t_logits_list, feature_distill_loss = ddp_distill_model( images) else: s_logits_list, t_logits_list, feature_distill_loss = distill_model( images) out_loss_list = loss_computation(logits_list=s_logits_list, labels=labels, losses=losses, edges=edges) out_loss = sum(out_loss_list) out_distill_loss_list = distill_loss_computation( student_logits_list=s_logits_list, teacher_logits_list=t_logits_list, labels=labels, losses=distill_losses, edges=edges) out_distill_loss = sum(out_distill_loss_list) loss = out_loss + out_distill_loss + feature_distill_loss loss.backward() optimizer.step() lr = optimizer.get_lr() # update lr if isinstance(optimizer, fleet.Fleet): lr_sche = optimizer.user_defined_optimizer._learning_rate else: lr_sche = optimizer._learning_rate if isinstance(lr_sche, paddle.optimizer.lr.LRScheduler): lr_sche.step() distill_model.clear_gradients() avg_loss += loss.numpy()[0] avg_out_loss += out_loss.numpy()[0] avg_out_distill_loss += out_distill_loss.numpy()[0] avg_feature_distill_loss += feature_distill_loss.numpy()[0] 
if not avg_out_loss_list: avg_out_loss_list = [l.numpy() for l in out_loss_list] else: for i in range(len(out_loss_list)): avg_out_loss_list[i] += out_loss_list[i].numpy() batch_cost_averager.record(time.time() - batch_start, num_samples=batch_size) if (iter) % log_iters == 0 and local_rank == 0: avg_loss /= log_iters avg_out_loss /= log_iters avg_out_distill_loss /= log_iters avg_feature_distill_loss /= log_iters avg_out_loss_list = [ l[0] / log_iters for l in avg_out_loss_list ] remain_iters = iters - iter avg_train_batch_cost = batch_cost_averager.get_average() avg_train_reader_cost = reader_cost_averager.get_average() eta = calculate_eta(remain_iters, avg_train_batch_cost) logger.info( "[TRAIN] epoch: {}, iter: {}/{}, loss: {:.4f}, out_loss: {:.4f}, out_distill_loss: {:.4f}, feature_distill_loss: {:.4f}, lr: {:.6f}, batch_cost: {:.4f}, reader_cost: {:.5f}, ips: {:.4f} samples/sec | ETA {}" .format((iter - 1) // iters_per_epoch + 1, iter, iters, avg_loss, avg_out_loss, avg_out_distill_loss, avg_feature_distill_loss, lr, avg_train_batch_cost, avg_train_reader_cost, batch_cost_averager.get_ips_average(), eta)) if use_vdl: log_writer.add_scalar('Train/loss', avg_loss, iter) # Record all losses if there are more than 2 losses. if len(avg_out_loss_list) > 1: avg_loss_dict = {} for i, value in enumerate(avg_out_loss_list): avg_loss_dict['loss_' + str(i)] = value for key, value in avg_loss_dict.items(): log_tag = 'Train/' + key log_writer.add_scalar(log_tag, value, iter) log_writer.add_scalar('Train/lr', lr, iter) log_writer.add_scalar('Train/batch_cost', avg_train_batch_cost, iter) log_writer.add_scalar('Train/reader_cost', avg_train_reader_cost, iter) avg_loss = 0.0 avg_out_loss = 0.0 avg_out_distill_loss = 0.0 avg_feature_distill_loss = 0.0 avg_out_loss_list = [] reader_cost_averager.reset() batch_cost_averager.reset() if (iter % save_interval == 0 or iter == iters) and (val_dataset is not None): num_workers = 1 if num_workers > 0 else 0 if test_config is None: test_config = {} mean_iou, acc, _, _, _ = evaluate(student_model, val_dataset, num_workers=num_workers, **test_config) student_model.train() if (iter % save_interval == 0 or iter == iters) and local_rank == 0: current_save_dir = os.path.join(save_dir, "iter_{}".format(iter)) if not os.path.isdir(current_save_dir): os.makedirs(current_save_dir) paddle.save(student_model.state_dict(), os.path.join(current_save_dir, 'model.pdparams')) paddle.save(optimizer.state_dict(), os.path.join(current_save_dir, 'model.pdopt')) save_models.append(current_save_dir) if len(save_models) > keep_checkpoint_max > 0: model_to_remove = save_models.popleft() shutil.rmtree(model_to_remove) if val_dataset is not None: if mean_iou > best_mean_iou: best_mean_iou = mean_iou best_model_iter = iter best_model_dir = os.path.join(save_dir, "best_model") paddle.save( student_model.state_dict(), os.path.join(best_model_dir, 'model.pdparams')) logger.info( '[EVAL] The model with the best validation mIoU ({:.4f}) was saved at iter {}.' .format(best_mean_iou, best_model_iter)) if use_vdl: log_writer.add_scalar('Evaluate/mIoU', mean_iou, iter) log_writer.add_scalar('Evaluate/Acc', acc, iter) batch_start = time.time() # Calculate flops. if local_rank == 0: def count_syncbn(m, x, y): x = x[0] nelements = x.numel() m.total_ops += int(2 * nelements) _, c, h, w = images.shape flops = paddle.flops( student_model, [1, c, h, w], custom_ops={paddle.nn.SyncBatchNorm: count_syncbn}) # Sleep for half a second to let dataloader release resources. 
    time.sleep(0.5)
    if use_vdl:
        log_writer.close()
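The loop above has to reach through the Fleet wrapper to step the LR scheduler, because fleet.distributed_optimizer returns a Fleet object rather than the optimizer itself. Below is a minimal sketch of that unwrap pattern in isolation; the toy layer, scheduler, and data are hypothetical, and it assumes a collective environment started via paddle.distributed.launch.

import paddle
from paddle.distributed import fleet


def step_lr_scheduler(optimizer):
    # Reach the real optimizer through the Fleet wrapper, if present.
    inner = optimizer.user_defined_optimizer if isinstance(
        optimizer, fleet.Fleet) else optimizer
    if isinstance(inner._learning_rate, paddle.optimizer.lr.LRScheduler):
        inner._learning_rate.step()


fleet.init(is_collective=True)
layer = paddle.nn.Linear(8, 1)
scheduler = paddle.optimizer.lr.PolynomialDecay(
    learning_rate=0.01, decay_steps=100)
optimizer = paddle.optimizer.Momentum(
    learning_rate=scheduler, parameters=layer.parameters())
optimizer = fleet.distributed_optimizer(optimizer)  # returns a Fleet object
model = fleet.distributed_model(layer)

loss = paddle.nn.functional.mse_loss(
    model(paddle.randn([4, 8])), paddle.zeros([4, 1]))
loss.backward()
optimizer.step()
step_lr_scheduler(optimizer)  # steps the scheduler through the wrapper
optimizer.clear_grad()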
def main(args):
    """main function"""
    model_config = json.load(open(args.model_config, 'r'))

    if args.use_cuda:
        paddle.set_device("gpu")
    else:
        paddle.set_device("cpu")

    if args.is_distributed:
        strategy = fleet.DistributedStrategy()
        fleet.init(is_collective=args.use_cuda, strategy=strategy)

    train_loader = create_dataloader(
        data_dir=args.train_data, model_config=model_config)
    valid_loader = create_dataloader(
        data_dir=args.valid_data, model_config=model_config)

    encoder_model = ProteinEncoderModel(model_config, name='protein')
    model = ProteinModel(encoder_model, model_config)
    if args.is_distributed:
        model = fleet.distributed_model(model)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    grad_clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
    optimizer = paddle.optimizer.AdamW(
        learning_rate=1e-4,
        epsilon=1e-06,
        weight_decay=0.01,
        parameters=model.parameters(),
        apply_decay_param_fun=lambda x: x in decay_params)
    if args.is_distributed:
        optimizer = fleet.distributed_optimizer(optimizer)

    criterion = ProteinCriterion(model_config)
    metric = get_metric(model_config['task'])

    if args.init_model:
        print("load init_model")
        if args.hot_start == 'hot_start':
            # hot start: resume the full model
            model.load_dict(paddle.load(args.init_model))
        else:
            # pre-train: only load the encoder weights
            encoder_model.load_dict(paddle.load(args.init_model))

    train_sum_loss = 0
    valid_min_loss = 10000
    steps_per_epoch = 20
    cur_step = 0
    while True:
        model.train()
        for (text, pos, label) in train_loader:
            cur_step += 1
            pred = model(text, pos)
            label = label.reshape([-1, 1])
            pred = pred.reshape([-1, pred.shape[-1]])
            loss = criterion.cal_loss(pred, label)
            train_sum_loss += loss.numpy()
            loss.backward()
            optimizer.minimize(loss)
            model.clear_gradients()

            pred = pred.numpy()
            label = label.numpy()
            loss = loss.numpy()
            metric.update(pred, label, loss)
            if cur_step % 10 == 0:
                print('step %d, avg loss %.5f' %
                      (cur_step, train_sum_loss / 10))
                metric.show()
                train_sum_loss = 0
                metric.clear()

            # save the best model
            if cur_step % steps_per_epoch == 0:
                print("eval begin_time: ",
                      time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
                valid_cur_loss = eval(model, valid_loader, criterion, metric)
                print("valid_cur_loss: ", valid_cur_loss)
                print("eval end_time: ",
                      time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
                if valid_cur_loss < valid_min_loss:
                    print("%s Save best model step_%d." %
                          (time.strftime("%Y-%m-%d %H:%M:%S",
                                         time.localtime()), cur_step))
                    paddle.save(encoder_model.state_dict(),
                                'models/epoch_best_encoder.pdparams')
                    paddle.save(model.state_dict(),
                                'models/epoch_best.pdparams')
                    valid_min_loss = valid_cur_loss
                    os.system(
                        "cp -rf models/epoch_best.pdparams models/step_%d.pdparams"
                        % cur_step)
                    os.system(
                        "cp -rf models/epoch_best_encoder.pdparams models/step_%d_encoder.pdparams"
                        % cur_step)
                model.train()
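Both main above and do_train below exclude bias and LayerNorm parameters from weight decay by filtering on attribute names. A minimal self-contained sketch of that filter, with a hypothetical two-layer block standing in for the real model:

import paddle


class TinyBlock(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.linear = paddle.nn.Linear(16, 16)
        self.norm = paddle.nn.LayerNorm(16)

    def forward(self, x):
        return self.norm(self.linear(x))


model = TinyBlock()
# named_parameters() yields ("linear.weight", ...), ("linear.bias", ...),
# ("norm.weight", ...) and ("norm.bias", ...); only linear.weight passes
# the filter, so only it receives weight decay.
decay_params = [
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]
optimizer = paddle.optimizer.AdamW(
    learning_rate=1e-4,
    weight_decay=0.01,
    parameters=model.parameters(),
    # AdamW asks this function about every parameter name and decays only
    # those for which it returns True.
    apply_decay_param_fun=lambda x: x in decay_params)

Filtering on the attribute name n rather than the framework name p.name is what makes "norm" match: Paddle's auto-generated parameter names (linear_0.w_0 and the like) carry no layer-type hint.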
def do_train(args):
    paddle.set_device(args.device)
    strategy = fleet.DistributedStrategy()
    strategy.hybrid_configs = {
        "dp_degree": args.dp_degree,
        "mp_degree": args.mp_degree,
        "pp_degree": args.pp_degree
    }

    strategy.pipeline_configs = {
        "accumulate_steps": args.local_batch_size // args.micro_batch_size,
        "micro_batch_size": args.micro_batch_size
    }

    fleet.init(is_collective=True, strategy=strategy)

    # obtain rank message of hybrid parallel
    hcg = fleet.get_hybrid_communicate_group()
    global_rank = hcg.get_global_rank()
    mp_rank = hcg.get_model_parallel_rank()
    pp_rank = hcg.get_stage_id()
    dp_rank = hcg.get_data_parallel_rank()
    local_rank = int(os.getenv("PADDLE_RANK_IN_NODE", 0))

    # seed control in hybrid parallel
    set_hyrbid_parallel_seed(args.seed, dp_rank, mp_rank, pp_rank)

    default_global_tokens_num = args.global_batch_size * args.max_seq_len

    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

    # Define the log writer.
    log_writer_path = os.path.join(
        args.output_dir, "train_log",
        "{}_globalbsz_{}_amp_{}_recompute_{}_card_{}".format(
            args.model_name_or_path, args.global_batch_size, args.use_amp,
            False, global_rank).lower())

    if os.path.exists(log_writer_path):
        import shutil
        shutil.rmtree(log_writer_path)

    log_writer = LogWriter(log_writer_path)

    pretrained_models_list = list(
        model_class.pretrained_init_configuration.keys())

    if args.model_name_or_path in pretrained_models_list:
        model_config = model_class.pretrained_init_configuration[
            args.model_name_or_path]
        model_config["hidden_dropout_prob"] = args.hidden_dropout_prob
        model_config[
            "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob
        model_config['num_partitions'] = args.mp_degree
        if args.pp_degree == 1:
            model = GPTForPretraining(GPTModel(**model_config))
        else:
            model_config['topology'] = hcg.topology()
            model_config["recompute_interval"] = 1 if args.use_recompute else 0
            model = GPTForPretrainingPipe(**model_config)
    else:
        model = GPTForPretraining.from_pretrained(
            args.model_name_or_path,
            hidden_dropout_prob=args.hidden_dropout_prob,
            attention_probs_dropout_prob=args.attention_probs_dropout_prob)

    # Create the criterion for the GPT model.
    criterion = GPTPretrainingCriterion()

    if args.decay_steps is None:
        args.decay_steps = args.max_steps
    warmup_step = args.warmup_rate * args.decay_steps

    lr_scheduler = None
    if args.lr_decay_style == "none":
        lr_scheduler = None
    elif args.lr_decay_style == "cosine":
        lr_scheduler = lr.CosineAnnealingWithWarmupDecay(
            max_lr=args.max_lr,
            min_lr=args.min_lr,
            warmup_step=warmup_step,
            decay_step=args.decay_steps)

    clip = None
    if args.grad_clip > 0:
        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=args.grad_clip)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler if lr_scheduler is not None else args.max_lr,
        beta1=args.adam_beta1,
        beta2=args.adam_beta2,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=clip,
        apply_decay_param_fun=lambda x: x in decay_params)

    if paddle.distributed.get_world_size() > 1:
        model = fleet.distributed_model(model)
        optimizer = fleet.distributed_optimizer(optimizer)

    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
        scaler = fleet.distributed_scaler(scaler)

    if args.model_name_or_path not in pretrained_models_list:
        logger.info("Try to load checkpoint from %s " %
                    args.model_name_or_path)
        opt_path = os.path.join(args.model_name_or_path, "model_state.pdopt")
        if os.path.exists(opt_path):
            opt_dict = paddle.load(opt_path)
            optimizer.set_state_dict(opt_dict)
        else:
            logger.warning("No optimizer checkpoint file found in %s." %
                           opt_path)

    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        files = [
            os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
            if (os.path.isfile(os.path.join(args.input_dir, f)) and
                "npz_" not in str(f))
        ]
        files.sort()
        num_files = len(files)
        for f_id in range(num_files):
            data_file = files[f_id]
            train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset(
                args,
                data_file,
                local_rank=local_rank,
                data_world_size=args.dp_degree,
                data_world_rank=dp_rank,
                eos_id=tokenizer.eos_token_id)
            # Bug fix: if valid_data_loader is not called here, each enumerate
            # would call valid_data_loader many times and start a new random
            # dataloader.
            valid_data_loader = valid_data_loader()
            test_data_loader = test_data_loader()

            for step, batch in enumerate(train_data_loader()):
                global_step += 1
                tokens, loss_mask, labels = batch
                loss_mask.stop_gradient = True
                labels.stop_gradient = True

                if args.pp_degree == 1:
                    with paddle.amp.auto_cast(
                            args.use_amp,
                            custom_white_list=[
                                "layer_norm", "softmax", "gelu"
                            ],
                            custom_black_list=[
                                "reduce_sum", "c_softmax_with_cross_entropy",
                                "c_embedding"
                            ]):
                        preds = model(tokens)
                        loss = criterion(preds, labels, loss_mask)

                    if args.use_amp:
                        scaler.scale(loss).backward()
                        scaler.minimize(optimizer, loss)
                    else:
                        loss.backward()
                        optimizer.step()

                    if lr_scheduler is not None:
                        lr_scheduler.step()
                    optimizer.clear_grad()
                else:
                    data = [tokens, (labels, loss_mask)]
                    with paddle.amp.auto_cast(
                            args.use_amp,
                            custom_white_list=[
                                "layer_norm", "softmax", "gelu"
                            ],
                            custom_black_list=[
                                "reduce_sum", "c_softmax_with_cross_entropy",
                                "c_embedding"
                            ]):
                        loss = model.train_batch(
                            data,
                            optimizer=optimizer,
                            lr_scheduler=lr_scheduler,
                            scaler=scaler if args.use_amp else None)

                if global_step % args.logging_freq == 0:
                    avg_loss = loss.numpy()
                    speed = args.logging_freq / (time.time() - tic_train)
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %.9f, speed: %.2f step/s, ips: %.0f tokens/s, learning rate: %.5e"
                        % (global_step, epoch, step, avg_loss, speed,
                           speed * default_global_tokens_num,
                           optimizer.get_lr()))
                    log_writer.add_scalar("loss", float(loss), global_step)
                    log_writer.add_scalar("learning_rate", optimizer.get_lr(),
                                          global_step)
                    tic_train = time.time()

                if args.check_accuracy:
                    if global_step >= args.max_steps:
                        return
                    else:
                        continue

                if global_step % args.eval_freq == 0:
                    # Since the valid data is broadcast to all devices, we
                    # evaluate on all devices.
                    run_evaluate(args, valid_data_loader, model, criterion,
                                 args.eval_iters, log_writer, global_step,
                                 epoch, "valid")

                # Only dp_rank 0 saves the model.
                if (global_step % args.save_steps == 0
                        or global_step >= args.max_steps) and dp_rank == 0:
                    model_to_save = model._layers if paddle.distributed.get_world_size(
                    ) > 1 else model
                    output_dir = os.path.join(args.output_dir,
                                              "step_%d" % global_step)
                    os.makedirs(output_dir, exist_ok=True)
                    logger.info("Save model to %s" % output_dir)

                    if args.pp_degree > 1:
                        model_to_save.save_state_dict(output_dir)
                        if mp_rank * pp_rank == 1:
                            tokenizer.save_pretrained(output_dir)
                        paddle.save(
                            optimizer.state_dict(),
                            os.path.join(
                                output_dir,
                                "model_state_mp_{:0>2d}_pp_{:0>2d}.pdopt".
                                format(mp_rank, pp_rank)))
                    else:
                        path = os.path.join(output_dir,
                                            'model_{:0>2d}'.format(mp_rank))
                        os.makedirs(path, exist_ok=True)
                        model_to_save.save_pretrained(path)
                        paddle.save(optimizer.state_dict(),
                                    os.path.join(path, "model_state.pdopt"))
                        tokenizer.save_pretrained(path)

                if global_step >= args.max_steps:
                    run_evaluate(args, test_data_loader, model, criterion,
                                 args.test_iters, log_writer, global_step,
                                 epoch, "test")
                    logger.info("The training process is complete.")
                    del train_data_loader
                    return

            del train_data_loader
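For reference, here is a minimal sketch of the hybrid-parallel wiring that do_train builds from its arguments, with hard-coded degrees in place of the argparse values. The degrees and batch sizes are illustrative assumptions: the product of the three degrees must equal the number of ranks started by paddle.distributed.launch, and accumulate_steps is derived exactly as above.

import paddle.distributed.fleet as fleet

# Illustrative degrees: 2 x 2 x 2 requires exactly 8 launched ranks.
dp_degree, mp_degree, pp_degree = 2, 2, 2
local_batch_size, micro_batch_size = 16, 4

strategy = fleet.DistributedStrategy()
strategy.hybrid_configs = {
    "dp_degree": dp_degree,
    "mp_degree": mp_degree,
    "pp_degree": pp_degree,
}
strategy.pipeline_configs = {
    # Each pipeline stage accumulates local_batch_size // micro_batch_size
    # micro-batches between optimizer steps.
    "accumulate_steps": local_batch_size // micro_batch_size,
    "micro_batch_size": micro_batch_size,
}
fleet.init(is_collective=True, strategy=strategy)

# Every rank can then ask the hybrid communicate group where it sits.
hcg = fleet.get_hybrid_communicate_group()
print("dp/mp/pp:", hcg.get_data_parallel_rank(),
      hcg.get_model_parallel_rank(), hcg.get_stage_id())

This per-rank position is also why the checkpoints above are sharded as model_state_mp_xx_pp_yy.pdopt: each (mp, pp) pair owns a distinct slice of the weights and optimizer state, so every such rank must save and reload its own file.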