def url_hash(u):
    """Return the hexadecimal SHA-1 digest of a URL.

    Args:
        u: The URL. A ``str`` is encoded to UTF-8 before hashing; a
            bytes-like object (``bytes``, ``bytearray``, ``memoryview``)
            is hashed as-is.

    Returns:
        The SHA-1 digest as a 40-character lowercase hexadecimal string.

    Raises:
        UnicodeError: If ``u`` is a ``str`` that cannot be encoded as UTF-8
            (e.g. it contains lone surrogates). The failure is logged and
            then re-raised.
    """
    if not isinstance(u, (bytes, bytearray, memoryview)):
        try:
            u = u.encode("utf-8")
        except UnicodeError:
            # str.encode() raises UnicodeEncodeError, which the former
            # `except UnicodeDecodeError` could never catch. Continuing
            # would also be pointless: hashlib rejects un-encoded str.
            # %r keeps the log line itself encodable.
            logger.error("Cannot hash url: %r", u)
            raise
    h = hashlib.sha1()
    h.update(u)
    return h.hexdigest()
def on_error(ws, error):
    """
    Error callback: write the received error to the module logger.

    :param ws: handle supplied by the caller; not used by this function
    :param error: the error payload (described as JSON by the original
        notes); it is converted with ``str`` before being logged
    :return: None
    """
    message = "error: " + str(error)
    logger.error(message)
def _find_files(self, dl_paths, publisher, url_dict):
    """Find files corresponding to urls.

    Lists the ``stories`` sub-directory of the publisher's download path and
    keeps the entries whose hash (as computed by
    ``self._get_hash_from_path``) is a key of ``url_dict``.

    Args:
        dl_paths: Mapping from publisher name (``"cnn"`` / ``"dailymail"``)
            to the directory that contains its ``stories`` folder.
        publisher: Either ``"cnn"`` or ``"dailymail"``.
        url_dict: Container of accepted hashes; only membership is tested.

    Returns:
        A list of matching file paths (``stories`` directory joined with the
        entry name), in sorted entry-name order.

    Raises:
        ValueError: If ``publisher`` is not supported. Previously this case
            only logged and then failed with ``UnboundLocalError`` because
            ``top_dir`` was never assigned.
    """
    if publisher not in ("cnn", "dailymail"):
        logger.error("Unsupported publisher: %s", publisher)
        raise ValueError(f"Unsupported publisher: {publisher}")

    # Both publishers use the same layout: <download dir>/stories/<file>.
    top_dir = os.path.join(dl_paths[publisher], "stories")
    return [
        os.path.join(top_dir, name)
        for name in sorted(os.listdir(top_dir))
        if self._get_hash_from_path(name) in url_dict
    ]
def do_text_formatting(model_name):
    """Create the corpus formatter(s) for ``model_name`` and return their outputs.

    The output directory is read from the free name ``args`` (it is not a
    parameter, so it must exist at module scope).

    Args:
        model_name: Name of the pretraining model. ``"bert-base-uncased"``
            selects the English Wikipedia formatter plus the books-corpus
            formatter; every other value selects the Chinese Wikipedia
            formatter. Names outside the three supported ones only trigger
            an error log; processing still continues on the Chinese path.

    Returns:
        For ``"bert-base-uncased"``: a list holding the ``formatted_file``
        of the wiki formatter followed by that of the books formatter.
        Otherwise: the wiki formatter's ``formatted_file`` attribute itself
        (not wrapped in a list).
    """
    supported_models = [
        "bert-base-uncased", "bert-base-chinese", "bert-wwm-chinese"
    ]
    if model_name not in supported_models:
        # The message used to contain a "%s" placeholder without any
        # argument, so the offending model name was never printed, and the
        # adjacent literals were concatenated without separating spaces.
        logger.error(
            "The implemented text formatting process only fits "
            "bert-base-uncased, bert-base-chinese and bert-wwm-chinese. "
            "For pretraining model %s you should format the corpus "
            "yourself first.", model_name)

    logger.info("=" * 50)
    logger.info("Start to text formatting.")
    if model_name == "bert-base-uncased":
        wiki_formatter = WikicorpusTextFormatter('en', args.output_dir)
        formatted_files = [wiki_formatter.formatted_file]

        book_formatter = BookscorpusTextFormatter(args.output_dir)
        formatted_files.append(book_formatter.formatted_file)
    else:
        wiki_formatter = WikicorpusTextFormatter('zh', args.output_dir)
        # NOTE(review): unlike the English branch this is not a list; kept
        # as-is because callers may depend on it.
        formatted_files = wiki_formatter.formatted_file

    logger.info("End to text formatting")
    return formatted_files
def _from_pretrained(cls,
                     pretrained_model_name_or_path,
                     task=None,
                     *model_args,
                     **kwargs):
    """Pick the concrete model class for a name/path and delegate loading to it.

    The identifier is tried, in order, as (1) a built-in model name listed
    in ``cls._pretrained_model_dict``, (2) a local directory, and (3) a
    community model whose config file is fetched with ``get_path_from_url``.
    In every case the class is looked up through ``cls._name_mapping``,
    imported from ``paddlenlp.transformers.<module>.modeling`` and its
    ``from_pretrained`` is called with ``*model_args`` / ``**kwargs``.

    Args:
        pretrained_model_name_or_path: Built-in model name, local directory
            path, or community model identifier.
        task: Optional task name. When truthy and ``cls._task_choice`` equals
            ``True``, ``cls._name_mapping`` is replaced (on the class) by
            ``get_name_mapping(task)``; otherwise a notice is printed.
        *model_args: Forwarded to the resolved class's ``from_pretrained``.
        **kwargs: Forwarded to the resolved class's ``from_pretrained``.

    Returns:
        Whatever the resolved class's ``from_pretrained`` returns. Falls
        through and returns ``None`` implicitly when the expected config
        file does not exist (local-dir and community branches).

    Raises:
        RuntimeError: If the community config cannot be downloaded.
    """
    if task:
        if cls._task_choice == True:
            # Side effect: the mapping is stored on the class itself.
            cls._name_mapping = get_name_mapping(task)
        else:
            print('We only support task choice for AutoModel.')

    # Flatten every built-in model name into one list for the membership
    # test below.
    all_model_names = []
    for pretrained_model_names, model_name in cls._pretrained_model_dict.items(
    ):
        for name in pretrained_model_names:
            all_model_names.append(name)

    # From built-in pretrained models
    if pretrained_model_name_or_path in all_model_names:
        for pretrained_model_names, model_name in cls._pretrained_model_dict.items(
        ):
            # From built-in pretrained models
            for pattern in pretrained_model_names:
                if pattern == pretrained_model_name_or_path:
                    # Two-step lookup: "<model_name>_Import_Class" -> class
                    # name, then class name -> module directory name.
                    init_class = cls._name_mapping[model_name +
                                                   '_Import_Class']
                    class_name = cls._name_mapping[init_class]
                    import_class = importlib.import_module(
                        f"paddlenlp.transformers.{class_name}.modeling")
                    model_class = getattr(import_class, init_class)
                    return model_class.from_pretrained(
                        pretrained_model_name_or_path, *model_args,
                        **kwargs)
    # From local dir path
    elif os.path.isdir(pretrained_model_name_or_path):
        config_file = os.path.join(pretrained_model_name_or_path,
                                   cls.model_config_file)
        if os.path.exists(config_file):
            with io.open(config_file, encoding="utf-8") as f:
                init_kwargs = json.load(f)
            # class name corresponds to this configuration
            init_class = init_kwargs.pop("init_class", None)
            # NOTE(review): `model_name` is still bound to the last value
            # from the loop over `cls._pretrained_model_dict` above. If no
            # MAPPING_NAMES entry matches below, that stale value appears to
            # be used for the lookup — confirm whether this is intended.
            if init_class:
                # Prefer the class recorded in the config: first flag that
                # is a substring of the recorded class name wins.
                for model_flag, name in MAPPING_NAMES.items():
                    if model_flag in init_class:
                        model_name = model_flag + 'Model'
                        break
            else:
                # From pretrained_model_name_or_path
                for model_flag, name in MAPPING_NAMES.items():
                    if name in pretrained_model_name_or_path.lower():
                        model_name = model_flag + 'Model'
                        break
            init_class = cls._name_mapping[model_name + '_Import_Class']
            class_name = cls._name_mapping[init_class]
            import_class = importlib.import_module(
                f"paddlenlp.transformers.{class_name}.modeling")
            # From here on `model_name` holds the class object, not a string.
            model_name = getattr(import_class, init_class)
            return model_name.from_pretrained(
                pretrained_model_name_or_path, *model_args, **kwargs)
        # No config file in the directory: falls through, returns None.
    # Assuming from community-contributed pretrained models
    else:
        community_config_path = os.path.join(COMMUNITY_MODEL_PREFIX,
                                             pretrained_model_name_or_path,
                                             cls.model_config_file)

        default_root = os.path.join(MODEL_HOME,
                                    pretrained_model_name_or_path)

        try:
            # Despite its name, this variable holds the path of the
            # downloaded *config* file.
            resolved_vocab_file = get_path_from_url(community_config_path,
                                                    default_root)
        except RuntimeError as err:
            logger.error(err)
            raise RuntimeError(
                f"Can't load weights for '{pretrained_model_name_or_path}'.\n"
                f"Please make sure that '{pretrained_model_name_or_path}' is:\n"
                "- a correct model-identifier of built-in pretrained models,\n"
                "- or a correct model-identifier of community-contributed pretrained models,\n"
                "- or the correct path to a directory containing relevant modeling files(model_weights and model_config).\n"
            )

        if os.path.exists(resolved_vocab_file):
            with io.open(resolved_vocab_file, encoding="utf-8") as f:
                init_kwargs = json.load(f)
            # class name corresponds to this configuration
            init_class = init_kwargs.pop("init_class", None)
            # Same resolution steps (and the same stale-`model_name`
            # caveat) as in the local-directory branch.
            if init_class:
                for model_flag, name in MAPPING_NAMES.items():
                    if model_flag in init_class:
                        model_name = model_flag + 'Model'
                        break
            else:
                # From pretrained_model_name_or_path
                for model_flag, name in MAPPING_NAMES.items():
                    if name in pretrained_model_name_or_path.lower():
                        model_name = model_flag + 'Model'
                        break
            init_class = cls._name_mapping[model_name + '_Import_Class']
            class_name = cls._name_mapping[init_class]
            import_class = importlib.import_module(
                f"paddlenlp.transformers.{class_name}.modeling")
            model_name = getattr(import_class, init_class)
            return model_name.from_pretrained(
                pretrained_model_name_or_path, *model_args, **kwargs)
def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
    """
    Creates an instance of `PretrainedModel`. Model weights are loaded
    by specifying name of a built-in pretrained model, or a community contributed model,
    or a local file directory path.

    Args:
        pretrained_model_name_or_path (str): Name of pretrained model or dir path
            to load from. The string can be:

            - Name of a built-in pretrained model
            - Name of a community-contributed pretrained model.
            - Local directory path which contains model weights file("model_state.pdparams")
              and model config file ("model_config.json").
        *args (tuple): Position arguments for model `__init__`. If provided,
            use these as position argument values for model initialization.
        **kwargs (dict): Keyword arguments for model `__init__`. If provided,
            use these to update pre-defined keyword argument values for model
            initialization. If the keyword is in `__init__` argument names of
            base model, update argument values of the base model; else update
            argument values of derived model.

    Returns:
        PretrainedModel: An instance of `PretrainedModel` when
        `paddle.in_dynamic_mode()` is true (the weights are set on the
        model before returning). Otherwise a tuple `(model, state_to_load)`
        is returned and the weights are NOT applied here.

    Raises:
        RuntimeError: If a required resource file cannot be downloaded.

    Example:
        .. code-block::

            from paddlenlp.transformers import BertForSequenceClassification

            # Name of built-in pretrained model
            model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

            # Name of community-contributed pretrained model
            model = BertForSequenceClassification.from_pretrained('yingyibiao/bert-base-uncased-sst-2-finetuned')

            # Load from local directory path
            model = BertForSequenceClassification.from_pretrained('./my_bert/')
    """
    pretrained_models = list(cls.pretrained_init_configuration.keys())
    resource_files = {}
    init_configuration = {}

    # ---- Step 1: collect the location of every resource file. ----
    # From built-in pretrained models
    if pretrained_model_name_or_path in pretrained_models:
        for file_id, map_list in cls.pretrained_resource_files_map.items():
            resource_files[file_id] = map_list[
                pretrained_model_name_or_path]
        # Deep copy so the class-level configuration is not mutated by the
        # pops/updates performed further down.
        init_configuration = copy.deepcopy(
            cls.
            pretrained_init_configuration[pretrained_model_name_or_path])
    # From local dir path
    elif os.path.isdir(pretrained_model_name_or_path):
        for file_id, file_name in cls.resource_files_names.items():
            full_file_name = os.path.join(pretrained_model_name_or_path,
                                          file_name)
            resource_files[file_id] = full_file_name
        resource_files["model_config_file"] = os.path.join(
            pretrained_model_name_or_path, cls.model_config_file)
    else:
        # Assuming from community-contributed pretrained models
        for file_id, file_name in cls.resource_files_names.items():
            full_file_name = os.path.join(COMMUNITY_MODEL_PREFIX,
                                          pretrained_model_name_or_path,
                                          file_name)
            resource_files[file_id] = full_file_name
        resource_files["model_config_file"] = os.path.join(
            COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path,
            cls.model_config_file)

    # ---- Step 2: resolve each location to a local file. ----
    default_root = os.path.join(MODEL_HOME, pretrained_model_name_or_path)
    resolved_resource_files = {}
    for file_id, file_path in resource_files.items():
        # Already a local file (or explicitly absent): use as-is.
        if file_path is None or os.path.isfile(file_path):
            resolved_resource_files[file_id] = file_path
            continue
        # Otherwise look for a cached copy under default_root, keyed by the
        # last '/'-separated component of the location.
        path = os.path.join(default_root, file_path.split('/')[-1])
        if os.path.exists(path):
            logger.info("Already cached %s" % path)
            resolved_resource_files[file_id] = path
        else:
            logger.info("Downloading %s and saved to %s" %
                        (file_path, default_root))
            try:
                resolved_resource_files[file_id] = get_path_from_url(
                    file_path, default_root)
            except RuntimeError as err:
                logger.error(err)
                raise RuntimeError(
                    f"Can't load weights for '{pretrained_model_name_or_path}'.\n"
                    f"Please make sure that '{pretrained_model_name_or_path}' is:\n"
                    "- a correct model-identifier of built-in pretrained models,\n"
                    "- or a correct model-identifier of community-contributed pretrained models,\n"
                    "- or the correct path to a directory containing relevant modeling files(model_weights and model_config).\n"
                )

    # ---- Step 3: build the constructor arguments. ----
    # Prepare model initialization kwargs
    # Did we saved some inputs and kwargs to reload ?
    model_config_file = resolved_resource_files.pop("model_config_file",
                                                    None)
    if model_config_file is not None:
        with io.open(model_config_file, encoding="utf-8") as f:
            init_kwargs = json.load(f)
    else:
        init_kwargs = init_configuration
    # position args are stored in kwargs, maybe better not include
    init_args = init_kwargs.pop("init_args", ())
    # class name corresponds to this configuration
    init_class = init_kwargs.pop("init_class",
                                 cls.base_model_class.__name__)

    # Check if the loaded config matches the current model class's __init__
    # arguments. If not match, the loaded config is for the base model class.
    if init_class == cls.base_model_class.__name__:
        base_args = init_args
        base_kwargs = init_kwargs
        derived_args = ()
        derived_kwargs = {}
        base_arg_index = None
    else:  # extract config for base model
        derived_args = list(init_args)
        # Alias, not a copy: later writes to derived_kwargs also modify
        # init_kwargs.
        derived_kwargs = init_kwargs
        base_arg = None
        # The nested base-model config is the first dict argument carrying
        # an "init_class" key; that key is removed while being checked.
        for i, arg in enumerate(init_args):
            if isinstance(arg, dict) and "init_class" in arg:
                assert arg.pop(
                    "init_class") == cls.base_model_class.__name__, (
                        "pretrained base model should be {}"
                    ).format(cls.base_model_class.__name__)
                base_arg_index = i
                base_arg = arg
                break
        # NOTE(review): this loop runs even if the positional scan above
        # already found the base config, and a match here stores a *key
        # name* in base_arg_index while derived_args is a list — indexing
        # it with that key below would fail. Confirm whether this path is
        # reachable in practice.
        for arg_name, arg in init_kwargs.items():
            if isinstance(arg, dict) and "init_class" in arg:
                assert arg.pop(
                    "init_class") == cls.base_model_class.__name__, (
                        "pretrained base model should be {}"
                    ).format(cls.base_model_class.__name__)
                base_arg_index = arg_name
                base_arg = arg
                break
        # NOTE(review): if neither loop matched, base_arg is still None and
        # the next line raises AttributeError.
        base_args = base_arg.pop("init_args", ())
        base_kwargs = base_arg

    # ---- Step 4: instantiate the model. ----
    if cls == cls.base_model_class:
        # Update with newly provided args and kwargs for base model
        base_args = base_args if not args else args
        base_kwargs.update(kwargs)
        model = cls(*base_args, **base_kwargs)
    else:
        # Update with newly provided args and kwargs for derived model
        base_parameters_dict = inspect.signature(
            cls.base_model_class.__init__).parameters
        # Only keywords accepted by the base model's __init__ are routed to
        # the base model.
        for k, v in kwargs.items():
            if k in base_parameters_dict:
                base_kwargs[k] = v
        base_model = cls.base_model_class(*base_args, **base_kwargs)
        if base_arg_index is not None:
            # Replace the nested config with the constructed base model.
            derived_args[base_arg_index] = base_model
        else:
            derived_args = (base_model, )  # assume at the first position
        # Caller-supplied positional args replace the stored ones entirely.
        derived_args = derived_args if not args else args
        derived_parameters_dict = inspect.signature(
            cls.__init__).parameters
        for k, v in kwargs.items():
            if k in derived_parameters_dict:
                derived_kwargs[k] = v
        model = cls(*derived_args, **derived_kwargs)

    # ---- Step 5: load the weights and reconcile key prefixes. ----
    # Maybe need more ways to load resources.
    weight_path = resolved_resource_files["model_state"]
    assert weight_path.endswith(
        ".pdparams"), "suffix of weight must be .pdparams"

    state_dict = paddle.load(weight_path)

    # Make sure we are able to load base models as well as derived models
    # (with heads)
    start_prefix = ""
    model_to_load = model
    state_to_load = state_dict
    unexpected_keys = []
    missing_keys = []
    if not hasattr(model, cls.base_model_prefix) and any(
            s.startswith(cls.base_model_prefix)
            for s in state_dict.keys()):
        # base model
        # The checkpoint comes from a derived model: strip the
        # "<prefix>." part from matching keys, record the rest as unused.
        state_to_load = {}
        start_prefix = cls.base_model_prefix + "."
        for k, v in state_dict.items():
            if k.startswith(cls.base_model_prefix):
                state_to_load[k[len(start_prefix):]] = v
            else:
                unexpected_keys.append(k)
    if hasattr(model, cls.base_model_prefix) and not any(
            s.startswith(cls.base_model_prefix)
            for s in state_dict.keys()):
        # derived model (base model with heads)
        # The checkpoint holds base-model weights only: load them into the
        # base sub-model and report the remaining parameters as missing.
        model_to_load = getattr(model, cls.base_model_prefix)
        for k in model.state_dict().keys():
            if not k.startswith(cls.base_model_prefix):
                missing_keys.append(k)
    if len(missing_keys) > 0:
        logger.info(
            "Weights of {} not initialized from pretrained model: {}".
            format(model.__class__.__name__, missing_keys))
    if len(unexpected_keys) > 0:
        logger.info("Weights from pretrained model not used in {}: {}".
                    format(model.__class__.__name__, unexpected_keys))
    if paddle.in_dynamic_mode():
        model_to_load.set_state_dict(state_to_load)
        return model
    # Not in dynamic mode: hand the state back to the caller instead of
    # applying it here.
    return model, state_to_load
def do_train(args):
    """Build the static-graph GPT pretraining program and run the training loop.

    Steps, in order: initialise paddle static mode and fleet, seed the RNGs,
    build the hybrid-parallel topology and distributed strategy, create the
    data holders / data loaders / model / loss / optimizer inside the
    program guard, dump the program descriptions to
    ``<output_dir>/program_desc``, optionally initialise parameters from
    ``args.model_name_or_path``, then train until ``args.max_steps`` with
    periodic logging, evaluation and saving.

    Args:
        args: Parsed run configuration. The attributes read here include
            ``seed``, ``device``, the ``*_degree`` parallelism settings,
            ``output_dir``, ``model_type``, ``model_name_or_path``,
            learning-rate / optimizer settings, and the ``logging_freq``,
            ``eval_freq``, ``save_steps`` and ``max_steps`` intervals.
            ``args.decay_steps`` is overwritten with ``args.max_steps`` when
            it is ``None``.

    Returns:
        None. The function returns once ``global_step`` reaches
        ``args.max_steps``.
    """
    # Initialize the paddle and paddle fleet execute environment
    paddle.enable_static()
    fleet.init(is_collective=True)

    # Create the random seed for the worker
    random.seed(args.seed)
    np.random.seed(args.seed)
    paddle.seed(args.seed)
    get_rng_state_tracker().add('global_seed', args.seed)
    get_rng_state_tracker().add('local_seed',
                                args.seed + fleet.worker_index() + 2021)

    assert args.device in [
        "cpu", "gpu", "xpu"
    ], "Invalid device! Available device should be cpu, gpu, or xpu."
    place = paddle.set_device(args.device)

    worker_num = fleet.worker_num()
    worker_index = fleet.worker_index()

    topo = Topology(device_rank=worker_index,
                    world_size=worker_num,
                    dp_degree=args.dp_degree,
                    pp_degree=args.pp_degree,
                    sharding_degree=args.sharding_degree,
                    mp_degree=args.mp_degree)

    logger.info("The topo of hybrid parallelism:\n{}".format(topo))

    dist_strategy = dist_optimizer(args, topo)

    # Create log write, train results show on last card of pipeline.
    # Bound on every rank: `log_writer` is passed to run_evaluate below
    # regardless of `topo.is_last`, which used to raise UnboundLocalError on
    # the other ranks.
    # NOTE(review): assumes run_evaluate only uses the writer when its
    # is_last argument is true — confirm.
    log_writer = None
    if topo.is_last:
        log_writer_path = os.path.join(
            args.output_dir, "train_log",
            "{}_globalbsz_{}_amp_{}_recompute_{}_card_{}".format(
                args.model_name_or_path, args.global_batch_size,
                args.use_amp, args.use_recompute, worker_index).lower())
        if os.path.exists(log_writer_path):
            import shutil
            shutil.rmtree(log_writer_path)
        log_writer = LogWriter(log_writer_path)

    # Define the input data in the static mode
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    pretrained_models_list = list(
        model_class.pretrained_init_configuration.keys())

    data_file = get_train_data_file(args)
    main_program = paddle.static.default_main_program()
    startup_program = paddle.static.default_startup_program()
    with paddle.static.program_guard(main_program, startup_program):
        with paddle.utils.unique_name.guard():
            with paddle.static.device_guard('gpu:0'):
                data_holders = create_data_holder(args)
                [tokens, loss_mask, attention_mask, position_ids,
                 labels] = data_holders

                tokenizer = tokenizer_class.from_pretrained(
                    args.model_name_or_path)
                eos_id = tokenizer.eos_token_id

                train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset(
                    args,
                    data_file,
                    data_world_size=topo.data_info.size,
                    data_world_rank=topo.data_info.rank,
                    eos_id=eos_id,
                    max_seq_len=args.max_seq_len,
                    places=paddle.static.cuda_places(),
                    data_holders=data_holders,
                    pipeline_mode=False,
                )

                if args.model_name_or_path in pretrained_models_list:
                    # Built-in name: build the model from its stock
                    # configuration (updated in place with the run
                    # settings); no pretrained weights are loaded here.
                    model_config = model_class.pretrained_init_configuration[
                        args.model_name_or_path]

                    model_config[
                        "hidden_dropout_prob"] = args.hidden_dropout_prob
                    model_config[
                        "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob
                    model_config["topo"] = topo

                    model = guard(f'gpu:{args.pp_degree -1}')(
                        GPTForPretraining)(
                            guard(f'gpu:0')(GPTModel)(**model_config))
                else:
                    model, _ = GPTForPretraining.from_pretrained(
                        args.model_name_or_path,
                        hidden_dropout_prob=args.hidden_dropout_prob,
                        attention_probs_dropout_prob=args.
                        attention_probs_dropout_prob,
                        topo=topo)

                # Create the model for the gpt pretrain
                preds = model(tokens, position_ids, attention_mask)

                criterion = guard(f'gpu:{args.pp_degree -1}')(
                    GPTPretrainingCriterion)(topo)
                loss = criterion(preds, labels, loss_mask)

            # Create the learning_rate sheduler and optimizer
            if args.decay_steps is None:
                args.decay_steps = args.max_steps
            warmup_step = args.warmup_rate * args.decay_steps

            # TODO @ZHUI Use paddle network to support lr scheduler
            lr_scheduler = lr.CosineAnnealingWithWarmupDecay(
                max_lr=args.max_lr,
                min_lr=args.min_lr,
                warmup_step=warmup_step,
                decay_step=args.decay_steps)

            clip = None
            if args.grad_clip > 0:
                clip = paddle.fluid.clip.GradientClipByGlobalNorm(
                    clip_norm=args.grad_clip)

            # Weight decay is applied only to parameters whose name contains
            # neither "bias" nor "norm".
            decay_param = [
                p.name for n, p in model.named_parameters()
                if not any(nd in n for nd in ["bias", "norm"])
            ]

            optimizer = paddle.optimizer.AdamW(
                learning_rate=lr_scheduler,
                beta1=args.adam_beta1,
                beta2=args.adam_beta2,
                epsilon=args.adam_epsilon,
                grad_clip=clip,
                weight_decay=args.weight_decay,
                apply_decay_param_fun=lambda x: x in decay_param)
            # alias
            optimizer.apply_optimize = optimizer._apply_optimize

            if args.use_recompute:
                dist_strategy.recompute = True
                dist_strategy.recompute_configs = {
                    "checkpoints": model.gpt.checkpoints
                }

            # Use the fleet api to compile the distributed optimizer
            optimizer = fleet.distributed_optimizer(optimizer,
                                                    strategy=dist_strategy)

            optimizer.minimize(loss)
            logger.info(f'final strategy: {fleet._final_strategy()}')
            logger.info("The training meta optimizer is/are %s" %
                        fleet._get_applied_meta_list())

    program_desc_dir = os.path.join(args.output_dir, "program_desc")
    # makedirs(exist_ok=True) also creates a missing output_dir and does not
    # fail when another worker creates the directory first, both of which
    # broke the former isdir()/mkdir() pair.
    os.makedirs(program_desc_dir, exist_ok=True)

    with open(program_desc_dir + "/main_program.txt.%d" % worker_index,
              'w') as f:
        f.write(str(main_program))

    with open(program_desc_dir + "/startup_program.txt.%d" % worker_index,
              'w') as f:
        f.write(str(startup_program))

    # Define the Executor for running the static model
    exe = paddle.static.Executor(place)
    exe.run(startup_program)
    test_program = main_program.clone(for_test=True)

    if args.model_name_or_path not in pretrained_models_list:
        logger.info("Try to load checkpoint from %s " %
                    args.model_name_or_path)
        dygrah_path = os.path.join(args.model_name_or_path,
                                   "model_state.pdparams")
        static_path = os.path.join(args.model_name_or_path, "static_vars")

        # Static vars are tried first; dygraph params are the fallback.
        flag_loaded = False
        if os.path.exists(static_path):
            if args.mp_degree > 1:
                logger.warning("MP should init with dygraph params")
            else:
                logger.info("Loading parameters from %s" % static_path)
                paddle.static.load(main_program, static_path, exe)
                flag_loaded = True

        if not flag_loaded and os.path.exists(dygrah_path):
            if args.sharding_degree > 1:
                logger.warning("Sharding should init with static vars")
            else:
                logger.info("Loading parameters from %s" % dygrah_path)
                init_static_with_params(
                    model, paddle.load(dygrah_path, return_numpy=True),
                    topo, main_program)
                flag_loaded = True

        if not flag_loaded:
            # Only reported; training continues with the parameters
            # produced by the startup program.
            logger.error("No checkpoint load.")

    global_step = 0
    tic_train = time.time()
    epoch = 0
    learning_rate = main_program.global_block().vars["learning_rate_0"]
    while True:
        # Only the last pipeline stage fetches loss / learning rate.
        fetchs = []
        if topo.is_last:
            fetchs = [loss, learning_rate]

        # Bug fix, if not call valid_data_loader, the enumerate will call valid_data_loader
        # many times. and start a new random dataloader.
        # NOTE(review): these names are rebound on every pass of the outer
        # loop, so a second epoch calls the object returned by the previous
        # call — confirm the loaders support that.
        valid_data_loader = valid_data_loader()
        test_data_loader = test_data_loader()

        for step, batch in enumerate(train_data_loader()):
            global_step += 1
            ret = exe.run(main_program,
                          feed=batch,
                          fetch_list=fetchs,
                          use_program_cache=True)
            # In the new 2.0 api, must call this function to change the learning_rate
            lr_scheduler.step()

            if global_step % args.logging_freq == 0:
                if topo.is_last:
                    loss_return, lr_return = ret
                    speed = args.logging_freq / (time.time() - tic_train)
                    logger.info(
                        "global step %d, epoch: %d, batch: %d, loss: %.9f, speed: %.2f steps/s, ips: %.0f tokens/s, learning rate: %.5e"
                        % (global_step, epoch, step, loss_return[0], speed,
                           speed * args.global_batch_size *
                           args.max_seq_len, lr_return[0]))
                    log_writer.add_scalar("loss", loss_return[0],
                                          global_step)
                    log_writer.add_scalar("learning_rate", lr_return[0],
                                          global_step)
                tic_train = time.time()

            if args.check_accuracy:
                # Accuracy-check mode skips evaluation and saving entirely.
                if global_step >= args.max_steps:
                    return
                else:
                    continue

            if global_step % args.eval_freq == 0:
                # TODO, check the input data of validation
                eval_fetch = []
                if topo.is_last:
                    eval_fetch = [loss]

                run_evaluate(valid_data_loader, exe, test_program,
                             args.eval_iters, log_writer, global_step, args,
                             epoch, topo.is_last, eval_fetch, "valid")
                tic_train = time.time()

            if global_step % args.save_steps == 0 or global_step >= args.max_steps:
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % global_step)
                logger.debug("saving models to {}".format(output_dir))
                save_persistables(exe,
                                  os.path.join(output_dir, "static_vars"),
                                  main_program)
                if global_step == args.save_steps:
                    # Drop "topo" from the stored init config once, before
                    # the first save_pretrained call.
                    model.init_config["init_args"][0].init_config.pop(
                        "topo", None)
                model.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                tic_train = time.time()

            if global_step >= args.max_steps:
                eval_fetch = []
                if topo.is_last:
                    eval_fetch = [loss]

                run_evaluate(test_data_loader, exe, test_program,
                             args.test_iters, log_writer, global_step, args,
                             epoch, topo.is_last, eval_fetch, "test")
                del train_data_loader
                return
        epoch += 1
def do_train(args):
    """Build the static-graph masked-LM pretraining program and run training.

    Steps, in order: initialise paddle static mode and fleet, seed the RNGs,
    validate and build the hybrid-parallel topology, restore the step
    counters from ``<output_dir>/model_last/config.yml`` when present, build
    data loaders / model / loss / optimizer inside the program guard, dump
    the program descriptions, initialise or restore parameters, then train
    until ``args.max_steps`` with periodic logging, evaluation, model
    saving and rolling ``model_last`` checkpoints.

    Args:
        args: Parsed run configuration. The attributes read here include
            ``seed``, ``device``, the ``*_degree`` parallelism settings,
            ``output_dir``, ``model_type``, ``model_name_or_path``,
            ``binary_head``, ``use_amp``, ``accumulate_steps``, optimizer
            settings, and the ``logging_freq``, ``eval_freq``,
            ``save_steps``, ``checkpoint_steps`` and ``max_steps``
            intervals. ``args.decay_steps`` is overwritten with
            ``args.max_steps`` when it is ``None``.

    Returns:
        None. The function returns once ``global_step`` reaches
        ``args.max_steps``.
    """
    # Initialize the paddle and paddle fleet execute environment
    paddle.enable_static()
    fleet.init(is_collective=True)

    # Create the random seed for the worker
    random.seed(args.seed)
    np.random.seed(args.seed)
    paddle.seed(args.seed)
    get_rng_state_tracker().add('global_seed', args.seed)
    get_rng_state_tracker().add('local_seed',
                                args.seed + fleet.worker_index() + 2021)

    assert args.device in [
        "cpu", "gpu", "xpu"
    ], "Invalid device! Available device should be cpu, gpu, or xpu."
    place = paddle.set_device(args.device)

    worker_num = fleet.worker_num()
    worker_index = fleet.worker_index()
    assert args.dp_degree * args.sharding_degree * args.mp_degree * args.pp_degree == worker_num, \
        "The product of degree num should be equal to worker_num."

    topo = Topology(device_rank=worker_index,
                    world_size=worker_num,
                    dp_degree=args.dp_degree,
                    pp_degree=args.pp_degree,
                    sharding_degree=args.sharding_degree,
                    mp_degree=args.mp_degree)

    logger.info("The topo of hybrid parallelism:\n{}".format(topo))

    dist_strategy = dist_optimizer(args, topo)

    # Create log write, train results show on last card of pipeline.
    # Bound on every rank: `log_writer` is passed to run_evaluate below
    # regardless of `topo.is_last`, which used to raise UnboundLocalError on
    # the other ranks.
    # NOTE(review): assumes run_evaluate only uses the writer when its
    # is_last argument is true — confirm.
    log_writer = None
    if topo.is_last:
        log_writer_path = os.path.join(
            args.output_dir, "train_log",
            "{}_globalbsz_{}_amp_{}_recompute_{}_card_{}".format(
                args.model_name_or_path, args.global_batch_size,
                args.use_amp, args.use_recompute, worker_index).lower())
        # if os.path.exists(log_writer_path):
        #     shutil.rmtree(log_writer_path)
        log_writer = LogWriter(log_writer_path)

    # Define the input data in the static mode
    base_class, model_class, criterion_class, tokenizer_class = MODEL_CLASSES[
        args.model_type]
    pretrained_models_list = list(
        model_class.pretrained_init_configuration.keys())

    # load config in checkpoint
    global_step = 0
    consumed_samples = 0
    checkpoint_dir = os.path.join(args.output_dir, "model_last")
    if os.path.exists(checkpoint_dir):
        if os.path.isfile(os.path.join(checkpoint_dir, "./config.yml")):
            with open(os.path.join(checkpoint_dir, "./config.yml"),
                      "r") as f:
                # NOTE(review): FullLoader can construct more than plain
                # data. The file written below contains only str/int
                # values, so yaml.safe_load looks sufficient — consider
                # switching if the checkpoint directory may be untrusted.
                step_config = yaml.load(f, Loader=yaml.FullLoader)
                assert step_config[
                    "global_batch_size"] == args.global_batch_size, "Please ensure checkpoint global batch size is the same. Folder: {}".format(
                        checkpoint_dir)
                consumed_samples = step_config["consumed_samples"]
                global_step = step_config["global_step"]

    data_file = get_train_data_file(args)
    main_program = paddle.static.default_main_program()
    startup_program = paddle.static.default_startup_program()
    with paddle.static.program_guard(main_program, startup_program):
        data_holders = create_data_holder(args)
        # 0. input_ids,
        # 1. segment_ids,
        # 2. input_mask,
        # 3. masked_lm_positions,
        # 4. masked_lm_labels,
        # 5. next_sentence_labels

        [
            input_ids, segment_ids, input_mask, masked_lm_positions,
            masked_lm_labels, next_sentence_labels
        ] = data_holders

        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)

        train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset(
            args,
            data_file,
            tokenizer,
            data_world_size=topo.data_info.size,
            data_world_rank=topo.data_info.rank,
            max_seq_len=args.max_seq_len,
            places=paddle.static.cuda_places(),
            data_holders=data_holders,
            current_step=global_step)
        # NOTE(review): second fleet.init call in this function (the first
        # is at the top); kept because its purpose is not visible here.
        fleet.init(is_collective=True)

        if args.model_name_or_path in pretrained_models_list:
            model_config = model_class.pretrained_init_configuration[
                args.model_name_or_path]
            # Round the vocabulary size up to a multiple of 8 (mutates the
            # class-level configuration dict in place).
            if model_config["vocab_size"] % 8 != 0:
                model_config["vocab_size"] += 8 - (
                    model_config["vocab_size"] % 8)
            model_config["hidden_dropout_prob"] = args.hidden_dropout_prob
            model_config[
                "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob
            model = model_class(base_class(**model_config))
        else:
            model, _ = model_class.from_pretrained(
                args.model_name_or_path,
                hidden_dropout_prob=args.hidden_dropout_prob,
                attention_probs_dropout_prob=args.
                attention_probs_dropout_prob,
            )

        # Create the model for the gpt pretrain
        prediction_scores, seq_relationship_score = model(
            input_ids=input_ids,
            token_type_ids=segment_ids,
            position_ids=None,
            attention_mask=input_mask,
            masked_positions=masked_lm_positions)

        criterion = criterion_class(with_nsp_loss=args.binary_head)
        if args.binary_head:
            lm_loss, sop_loss = criterion(prediction_scores,
                                          seq_relationship_score,
                                          masked_lm_labels,
                                          next_sentence_labels)
            loss = lm_loss + sop_loss
        else:
            loss = criterion(prediction_scores, seq_relationship_score,
                             masked_lm_labels)

        # Create the learning_rate sheduler and optimizer
        if args.decay_steps is None:
            args.decay_steps = args.max_steps

        # lr_scheduler = CosineAnnealingWithWarmupDecay(
        #     max_lr=args.max_lr,
        #     min_lr=args.min_lr,
        #     warmup_step=args.warmup_rate * args.max_steps,
        #     decay_step=args.decay_steps, last_epoch=global_step)

        lr_scheduler = LinearDecayWithWarmup(args.max_lr,
                                             args.max_steps,
                                             args.warmup_rate,
                                             last_epoch=global_step)

        clip = None
        if args.grad_clip > 0:
            clip = paddle.fluid.clip.GradientClipByGlobalNorm(
                clip_norm=args.grad_clip)

        # Weight decay is applied only to parameters whose name contains
        # neither "bias" nor "norm".
        decay_param = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        logger.info("Using paddle.optimizer.AdamW.")
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            beta1=args.adam_beta1,
            beta2=args.adam_beta2,
            epsilon=args.adam_epsilon,
            grad_clip=clip,
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_param)
        # alias
        optimizer.apply_optimize = optimizer._apply_optimize

        # if args.use_recompute:
        #     dist_strategy.recompute = True
        #     dist_strategy.recompute_configs = {
        #         "checkpoints": model.bert.checkpoints
        #     }

        # Use the fleet api to compile the distributed optimizer
        optimizer = fleet.distributed_optimizer(optimizer,
                                                strategy=dist_strategy)

        optimizer.minimize(loss)
        logger.info(f'final strategy: {fleet._final_strategy()}')
        logger.info("The training meta optimizer is/are %s" %
                    fleet._get_applied_meta_list())

    program_desc_dir = os.path.join(args.output_dir, "program_desc")
    # makedirs(exist_ok=True) also creates a missing output_dir and does not
    # fail when another worker creates the directory first, both of which
    # broke the former isdir()/mkdir() pair.
    os.makedirs(program_desc_dir, exist_ok=True)

    with open(program_desc_dir + "/main_program.txt.%d" % worker_index,
              'w') as f:
        f.write(str(main_program))

    with open(program_desc_dir + "/startup_program.txt.%d" % worker_index,
              'w') as f:
        f.write(str(startup_program))

    # Define the Executor for running the static model
    exe = paddle.static.Executor(place)
    exe.run(startup_program)
    test_program = main_program.clone(for_test=True)

    if args.model_name_or_path not in pretrained_models_list:
        logger.info("Try to load checkpoint from %s " %
                    args.model_name_or_path)
        dygrah_path = os.path.join(args.model_name_or_path,
                                   "model_state.pdparams")
        static_path = os.path.join(args.model_name_or_path, "static_vars")

        # Static vars are tried first; dygraph params are the fallback.
        flag_loaded = False
        if os.path.exists(static_path):
            if args.mp_degree > 1:
                logger.warning("MP should init with dygraph params")
            else:
                logger.info("Loading parameters from %s" % static_path)
                paddle.static.load(main_program, static_path, exe)
                flag_loaded = True

        if not flag_loaded and os.path.exists(dygrah_path):
            if args.sharding_degree > 1:
                logger.warning("Sharding should init with static vars")
            else:
                logger.info("Loading parameters from %s" % dygrah_path)
                init_static_with_params(
                    model, paddle.load(dygrah_path, return_numpy=True),
                    topo, main_program)
                flag_loaded = True

        if not flag_loaded:
            # Only reported; training continues with the parameters
            # produced by the startup program.
            logger.error("No checkpoint load.")

    # load checkpoint vars
    # Done after the initialisation above, so a resumable checkpoint
    # overrides whatever was loaded from model_name_or_path.
    if os.path.exists(checkpoint_dir):
        if os.path.isfile(os.path.join(checkpoint_dir, "./config.yml")):
            paddle.static.load(main_program,
                               os.path.join(checkpoint_dir, "static_vars"),
                               exe)

    # Variables fetched on the last pipeline stage, grouped so the log line
    # can format losses and auxiliary values differently.
    fetch_loss_vars = collections.OrderedDict()
    fetch_other_vars = collections.OrderedDict()
    fetch_loss_vars["loss"] = loss
    if args.binary_head:
        fetch_loss_vars["lm_loss"] = lm_loss
        fetch_loss_vars["sop_loss"] = sop_loss

    fetch_other_vars["learning_rate"] = main_program.global_block(
    ).vars["learning_rate_0"]

    additional_vars = collections.OrderedDict()
    if args.use_amp:
        for key in ["loss_scaling", "num_good_steps", "num_bad_steps"]:
            additional_vars[key] = main_program.global_block().vars[key +
                                                                    "_0"]

    tic_train = time.time()
    while True:
        fetchs = []
        fetchs_keys = []
        if topo.is_last:
            fetchs = list(fetch_loss_vars.values()) + list(
                fetch_other_vars.values()) + list(additional_vars.values())
            fetchs_keys = list(fetch_loss_vars.keys()) + list(
                fetch_other_vars.keys()) + list(additional_vars.keys())

        # Bug fix, if not call valid_data_loader, the enumerate will call valid_data_loader
        # many times. and start a new random dataloader.
        # NOTE(review): these names are rebound on every pass of the outer
        # loop, so a second pass calls the object returned by the previous
        # call — confirm the loaders support that.
        valid_data_loader = valid_data_loader()
        test_data_loader = test_data_loader()

        for step, batch in enumerate(train_data_loader()):
            ret = exe.run(main_program,
                          feed=batch,
                          fetch_list=fetchs,
                          use_program_cache=True)
            # Skip for accumulate_steps in global step
            if (step + 1) % args.accumulate_steps != 0:
                continue
            global_step += 1
            # In the new 2.0 api, must call this function to change the learning_rate
            lr_scheduler.step()

            if global_step % args.logging_freq == 0:
                if topo.is_last:
                    res = collections.defaultdict(float)
                    for k, v in zip(fetchs_keys, ret):
                        res[k] = v[0]

                    speed = args.logging_freq / (time.time() - tic_train)

                    # (A dead "loss: %.6f, ..." template assignment that
                    # was immediately overwritten has been removed.)
                    loss_info = ", ".join([
                        "{}: {:.6f}".format(k, res[k])
                        for k in fetch_loss_vars.keys()
                    ])

                    common_loginfo = "global step %d, %s, speed: %.2f steps/s, ips: %.2f seqs/s, learning rate: %.5e" % (
                        global_step, loss_info, speed,
                        speed * args.global_batch_size,
                        res["learning_rate"])
                    additional_loginfo = ", ".join([
                        "{}: {}".format(k, res[k])
                        for k in additional_vars.keys()
                    ])
                    if additional_loginfo:
                        common_loginfo += ", " + additional_loginfo
                    logger.info(common_loginfo)
                    for k, v in res.items():
                        log_writer.add_scalar(k, v, global_step)

                tic_train = time.time()

            #if args.check_accuracy:
            #    if global_step >= args.max_steps:
            #        return
            #    else:
            #        continue

            if global_step % args.eval_freq == 0:
                # TODO, check the input data of validation
                eval_fetch = collections.OrderedDict()
                if topo.is_last:
                    eval_fetch["loss"] = loss
                    if args.binary_head:
                        eval_fetch["lm_loss"] = lm_loss
                        eval_fetch["sop_loss"] = sop_loss

                run_evaluate(valid_data_loader, exe, test_program,
                             args.eval_iters, log_writer, global_step, args,
                             topo.is_last, eval_fetch, "valid")
                tic_train = time.time()

            if global_step % args.save_steps == 0 or global_step >= args.max_steps:
                output_dir = os.path.join(args.output_dir,
                                          "model_%d" % global_step)
                logger.debug("saving models to {}".format(output_dir))
                save_persistables(exe,
                                  os.path.join(output_dir, "static_vars"),
                                  main_program)
                if global_step == args.save_steps:
                    # Drop "topo" from the stored init config once, before
                    # the first save_pretrained call.
                    model.init_config["init_args"][0].init_config.pop(
                        "topo", None)
                model.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                tic_train = time.time()

            if global_step % args.checkpoint_steps == 0:
                # Rolling checkpoint: worker 0 rotates model_last ->
                # model_last_bak and writes the step counters; all workers
                # then synchronise before the variables are saved.
                output_dir = os.path.join(args.output_dir, "model_last")
                if worker_index == 0:
                    if not os.path.exists(output_dir):
                        os.mkdir(output_dir)
                    output_dir_bak = os.path.join(args.output_dir,
                                                  "model_last_bak")
                    if os.path.exists(output_dir):
                        if os.path.exists(output_dir_bak):
                            shutil.rmtree(output_dir_bak)
                        shutil.move(output_dir, output_dir_bak)
                        os.mkdir(output_dir)

                    step_config = {
                        "model_name": args.model_name_or_path,
                        "global_step": global_step,
                        "global_batch_size": args.global_batch_size,
                        "consumed_samples":
                        global_step * args.global_batch_size,
                    }

                    with open(os.path.join(output_dir, "config.yml"),
                              "w") as f:
                        yaml.dump(step_config,
                                  f,
                                  encoding='utf-8',
                                  allow_unicode=True)

                fleet.barrier_worker()

                logger.debug("saving models to {}".format(output_dir))
                if args.sharding_degree <= 1:
                    # Save on the first worker by default.
                    if worker_index == 0:
                        paddle.static.save(
                            main_program,
                            os.path.join(output_dir, "static_vars"))
                else:
                    # Use save_persistables in sharding, but more slower
                    save_persistables(
                        exe, os.path.join(output_dir, "static_vars"),
                        main_program)

            if global_step >= args.max_steps:
                eval_fetch = collections.OrderedDict()
                if topo.is_last:
                    eval_fetch["loss"] = loss
                    if args.binary_head:
                        eval_fetch["lm_loss"] = lm_loss
                        eval_fetch["sop_loss"] = sop_loss

                run_evaluate(test_data_loader, exe, test_program,
                             args.test_iters, log_writer, global_step, args,
                             topo.is_last, eval_fetch, "test")
                del train_data_loader
                return
def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
    """
    Creates an instance of `PretrainedModel`. Model weights are loaded
    by specifying name of a built-in pretrained model, or a community
    contributed model, or a local file directory path.

    Args:
        pretrained_model_name_or_path (str): Name of pretrained model or dir path
            to load from. The string can be:

            - Name of a built-in pretrained model
            - Name of a community-contributed pretrained model.
            - Local directory path which contains model weights file("model_state.pdparams")
              and model config file ("model_config.json").
        *args (tuple): Position arguments for model `__init__`. If provided,
            use these as position argument values for model initialization.
        **kwargs (dict): Keyword arguments for model `__init__`. If provided,
            use these to update pre-defined keyword argument values for model
            initialization. If the keyword is in `__init__` argument names of
            base model, update argument values of the base model; else update
            argument values of derived model.
        load_state_as_np (bool, optional): The weights read in can be chosen
            to be placed on CPU or GPU even though the model is on the default
            device. If `True`, load the model weights as `numpy.ndarray` on CPU.
            Otherwise, weights would be loaded as tensors on the default
            device. Note that if on GPU, the latter would create extra
            temporary tensors in addition to the model weights, which
            doubles the memory usage. Thus it is suggested to use `True`
            for big models on GPU. Default to `False`.

    Returns:
        PretrainedModel: An instance of `PretrainedModel` when paddle is in
        dynamic mode. Otherwise a tuple `(model, state_to_load)` is returned
        and the weights are not set on the model.

    Raises:
        RuntimeError: If a resource file can neither be found locally nor be
            downloaded. The original download error is chained as the cause.
        ValueError: If the loaded config is for a derived model but contains
            no nested base model config.

    Example:
        .. code-block::

            from paddlenlp.transformers import BertForSequenceClassification

            # Name of built-in pretrained model
            model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

            # Name of community-contributed pretrained model
            model = BertForSequenceClassification.from_pretrained('yingyibiao/bert-base-uncased-sst-2-finetuned')

            # Load from local directory path
            model = BertForSequenceClassification.from_pretrained('./my_bert/')
    """
    pretrained_models = list(cls.pretrained_init_configuration.keys())
    resource_files = {}
    init_configuration = {}
    load_state_as_np = kwargs.pop("load_state_as_np", False)

    # From built-in pretrained models
    if pretrained_model_name_or_path in pretrained_models:
        for file_id, map_list in cls.pretrained_resource_files_map.items():
            resource_files[file_id] = map_list[pretrained_model_name_or_path]
        # Deep copy so that the pops below never touch the class-level config.
        init_configuration = copy.deepcopy(
            cls.pretrained_init_configuration[pretrained_model_name_or_path])
    # From local dir path
    elif os.path.isdir(pretrained_model_name_or_path):
        for file_id, file_name in cls.resource_files_names.items():
            resource_files[file_id] = os.path.join(
                pretrained_model_name_or_path, file_name)
        resource_files["model_config_file"] = os.path.join(
            pretrained_model_name_or_path, cls.model_config_file)
    else:
        # Assuming from community-contributed pretrained models
        for file_id, file_name in cls.resource_files_names.items():
            resource_files[file_id] = os.path.join(
                COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path,
                file_name)
        resource_files["model_config_file"] = os.path.join(
            COMMUNITY_MODEL_PREFIX, pretrained_model_name_or_path,
            cls.model_config_file)

    default_root = os.path.join(MODEL_HOME, pretrained_model_name_or_path)
    resolved_resource_files = {}
    for file_id, file_path in resource_files.items():
        # Nothing to resolve: no resource, or it is already a local file.
        if file_path is None or os.path.isfile(file_path):
            resolved_resource_files[file_id] = file_path
            continue
        path = os.path.join(default_root, file_path.split('/')[-1])
        if os.path.exists(path):
            logger.info("Already cached %s" % path)
            resolved_resource_files[file_id] = path
            continue
        logger.info("Downloading %s and saved to %s" %
                    (file_path, default_root))
        try:
            resolved_resource_files[file_id] = get_path_from_url(
                file_path, default_root)
        except RuntimeError as err:
            logger.error(err)
            # Chain the cause so the underlying download failure stays
            # visible in the traceback.
            raise RuntimeError(
                f"Can't load weights for '{pretrained_model_name_or_path}'.\n"
                f"Please make sure that '{pretrained_model_name_or_path}' is:\n"
                "- a correct model-identifier of built-in pretrained models,\n"
                "- or a correct model-identifier of community-contributed pretrained models,\n"
                "- or the correct path to a directory containing relevant modeling files(model_weights and model_config).\n"
            ) from err

    # Prepare model initialization kwargs
    # Did we save some inputs and kwargs to reload?
    model_config_file = resolved_resource_files.pop("model_config_file", None)
    if model_config_file is not None:
        with io.open(model_config_file, encoding="utf-8") as f:
            init_kwargs = json.load(f)
    else:
        init_kwargs = init_configuration
    # position args are stored in kwargs, maybe better not include
    init_args = init_kwargs.pop("init_args", ())
    # class name corresponds to this configuration
    base_class_name = cls.base_model_class.__name__
    init_class = init_kwargs.pop("init_class", base_class_name)

    # Check if the loaded config matches the current model class's __init__
    # arguments. If not match, the loaded config is for the base model class.
    base_arg_index = None
    if init_class == base_class_name:
        base_args = init_args
        base_kwargs = init_kwargs
        derived_args = ()
        derived_kwargs = {}
    else:
        # extract config for base model
        derived_args = list(init_args)
        derived_kwargs = init_kwargs
        base_arg = None
        for i, arg in enumerate(init_args):
            if isinstance(arg, dict) and "init_class" in arg:
                # Pop outside of the assert so the marker is removed even
                # when asserts are stripped (python -O).
                nested_class_name = arg.pop("init_class")
                assert nested_class_name == base_class_name, (
                    "pretrained base model should be {}"
                ).format(base_class_name)
                base_arg_index = i
                base_arg = arg
                break
        for arg_name, arg in init_kwargs.items():
            if isinstance(arg, dict) and "init_class" in arg:
                nested_class_name = arg.pop("init_class")
                assert nested_class_name == base_class_name, (
                    "pretrained base model should be {}"
                ).format(base_class_name)
                base_arg_index = arg_name
                base_arg = arg
                break
        if base_arg is None:
            raise ValueError(
                "The loaded config is for '{}' but contains no config of the "
                "base model '{}'.".format(init_class, base_class_name))
        base_args = base_arg.pop("init_args", ())
        base_kwargs = base_arg

    if cls == cls.base_model_class:
        # Update with newly provided args and kwargs for base model
        base_args = base_args if not args else args
        base_kwargs.update(kwargs)
        model = cls(*base_args, **base_kwargs)
    else:
        # Update with newly provided args and kwargs for derived model
        base_parameters_dict = inspect.signature(
            cls.base_model_class.__init__).parameters
        for k, v in kwargs.items():
            if k in base_parameters_dict:
                base_kwargs[k] = v
        base_model = cls.base_model_class(*base_args, **base_kwargs)
        if base_arg_index is None:
            derived_args = (base_model, )  # assume at the first position
        elif isinstance(base_arg_index, int):
            derived_args[base_arg_index] = base_model
        else:
            # The base config was stored as a keyword argument: replace the
            # config dict by the instantiated base model under the same name.
            derived_kwargs[base_arg_index] = base_model
        derived_args = derived_args if not args else args
        derived_parameters_dict = inspect.signature(cls.__init__).parameters
        for k, v in kwargs.items():
            if k in derived_parameters_dict:
                derived_kwargs[k] = v
        model = cls(*derived_args, **derived_kwargs)

    # Maybe need more ways to load resources.
    weight_path = resolved_resource_files["model_state"]
    assert weight_path.endswith(
        ".pdparams"), "suffix of weight must be .pdparams"

    # NOTE: Allow to load partial model for model parallel.
    # TODO(guosheng): To make model loading for the model parallel automatic,
    # maybe we should make rank 0 worker load weights of the full model on
    # CPU, then split weights into multiple parts and pickle separately.
    # The other workers wait until pickle finish and then load the corresponding
    # partial weights. Also we can directly use separate weight files for
    # simplicity.
    state_dict = paddle.load(weight_path, return_numpy=load_state_as_np)

    # Make sure we are able to load base models as well as derived models
    # (with heads)
    model_to_load = model
    state_to_load = state_dict
    unexpected_keys = []
    missing_keys = []
    model_has_base_attr = hasattr(model, cls.base_model_prefix)
    state_has_base_prefix = any(
        s.startswith(cls.base_model_prefix) for s in state_dict.keys())
    if not model_has_base_attr and state_has_base_prefix:
        # base model: strip the "<prefix>." part from the checkpoint keys
        state_to_load = {}
        start_prefix = cls.base_model_prefix + "."
        for k, v in state_dict.items():
            if k.startswith(cls.base_model_prefix):
                state_to_load[k[len(start_prefix):]] = v
            else:
                unexpected_keys.append(k)
    if model_has_base_attr and not state_has_base_prefix:
        # derived model (base model with heads)
        model_to_load = getattr(model, cls.base_model_prefix)
        for k in model.state_dict().keys():
            if not k.startswith(cls.base_model_prefix):
                missing_keys.append(k)
    if len(missing_keys) > 0:
        logger.info(
            "Weights of {} not initialized from pretrained model: {}".format(
                model.__class__.__name__, missing_keys))
    if len(unexpected_keys) > 0:
        logger.info("Weights from pretrained model not used in {}: {}".format(
            model.__class__.__name__, unexpected_keys))

    # Allow the float16 model to load float32 weights, which decreases memory
    # usage in model loading stage and is useful to big models.
    dtype_prefix_len = len("paddle.")  # paddle.float16
    for k, v in model_to_load.state_dict().items():
        if not isinstance(v, np.ndarray):
            dtype = str(v.dtype)[dtype_prefix_len:]
        # TODO(guosheng): add warnings for unmatched dtypes
        if k in state_to_load:
            state_to_load[k] = state_to_load[k].astype(dtype)

    # Logging model download statistics
    download_check(pretrained_model_name_or_path, "from_pretrained")

    # For model parallel if FasterGeneration
    # To avoid recursive import temporarily.
    import paddlenlp.ops.faster_transformer.transformer.decoding as ft_decoding
    state_to_load = ft_decoding.get_ft_para_conf().fit_partial_model(
        model_to_load, state_to_load)
    if paddle.in_dynamic_mode():
        model_to_load.set_state_dict(state_to_load)
        return model
    return model, state_to_load
def from_pretrained(cls, pretrained_model_name_or_path, *model_args,
                    **kwargs):
    """
    Creates an instance of `AutoTokenizer`. Related resources are loaded by
    specifying name of a built-in pretrained model, or a community-contributed
    pretrained model, or a local file directory path.

    Args:
        pretrained_model_name_or_path (str): Name of pretrained model or dir path
            to load from. The string can be:

            - Name of built-in pretrained model
            - Name of a community-contributed pretrained model.
            - Local directory path which contains tokenizer related resources
              and tokenizer config file ("tokenizer_config.json").
        *model_args (tuple): position arguments for model `__init__`. If provided,
            use these as position argument values for tokenizer initialization.
        **kwargs (dict): keyword arguments for model `__init__`. If provided,
            use these to update pre-defined keyword argument values for tokenizer
            initialization.

    Returns:
        PretrainedTokenizer: An instance of `PretrainedTokenizer`. `None` is
        returned when the tokenizer config file does not exist, or when the
        config names no `init_class` and no known pattern occurs in
        `pretrained_model_name_or_path`.

    Raises:
        RuntimeError: If the community tokenizer config cannot be downloaded.
            The original download error is chained as the cause.

    Example:
        .. code-block::

            from paddlenlp.transformers import AutoTokenizer

            # Name of built-in pretrained model
            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
            print(type(tokenizer))
            # <class 'paddlenlp.transformers.bert.tokenizer.BertTokenizer'>

            # Name of community-contributed pretrained model
            tokenizer = AutoTokenizer.from_pretrained('yingyibiao/bert-base-uncased-sst-2-finetuned')
            print(type(tokenizer))
            # <class 'paddlenlp.transformers.bert.tokenizer.BertTokenizer'>

            # Load from local directory path
            tokenizer = AutoTokenizer.from_pretrained('./my_bert/')
            print(type(tokenizer))
            # <class 'paddlenlp.transformers.bert.tokenizer.BertTokenizer'>
    """

    def _import_tokenizer(init_class):
        # Import the tokenizer class `init_class` from its paddlenlp module.
        class_name = cls._name_mapping[init_class]
        import_class = importlib.import_module(
            f"paddlenlp.transformers.{class_name}.tokenizer")
        return getattr(import_class, init_class)

    def _load_from_config(config_path):
        # Shared by the local-dir and community branches: choose the
        # tokenizer class described by `config_path` and load with it.
        if not os.path.exists(config_path):
            return None
        with io.open(config_path, encoding="utf-8") as f:
            init_kwargs = json.load(f)
        # class name corresponds to this configuration
        init_class = init_kwargs.pop("init_class", None)
        if init_class:
            tokenizer_name = _import_tokenizer(init_class)
            return tokenizer_name.from_pretrained(
                pretrained_model_name_or_path, *model_args, **kwargs)
        # If no `init_class`, we use pattern recognition to recognize the
        # tokenizer class.
        print('We use pattern recognition to recognize the Tokenizer class.')
        lowered_name = pretrained_model_name_or_path.lower()
        for key, pattern in cls._name_mapping.items():
            if pattern in lowered_name:
                tokenizer_name = _import_tokenizer(key)
                print(
                    f"The 'pretrained_model_name_or_path' is {pretrained_model_name_or_path}, we import {tokenizer_name}."
                )
                return tokenizer_name.from_pretrained(
                    pretrained_model_name_or_path, *model_args, **kwargs)
        return None

    # From built-in pretrained models
    for names, tokenizer_class in cls._tokenizer_mapping.items():
        if any(name == pretrained_model_name_or_path for name in names):
            return tokenizer_class.from_pretrained(
                pretrained_model_name_or_path, *model_args, **kwargs)

    # From local dir path
    if os.path.isdir(pretrained_model_name_or_path):
        return _load_from_config(
            os.path.join(pretrained_model_name_or_path,
                         cls.tokenizer_config_file))

    # Assuming from community-contributed pretrained models
    community_config_path = os.path.join(COMMUNITY_MODEL_PREFIX,
                                         pretrained_model_name_or_path,
                                         cls.tokenizer_config_file)
    default_root = os.path.join(MODEL_HOME, pretrained_model_name_or_path)
    try:
        resolved_vocab_file = get_path_from_url(community_config_path,
                                                default_root)
    except RuntimeError as err:
        logger.error(err)
        # Chain the cause so the underlying download failure stays visible.
        raise RuntimeError(
            f"Can't load tokenizer for '{pretrained_model_name_or_path}'.\n"
            f"Please make sure that '{pretrained_model_name_or_path}' is:\n"
            "- a correct model-identifier of built-in pretrained models,\n"
            "- or a correct model-identifier of community-contributed pretrained models,\n"
            "- or the correct path to a directory containing relevant tokenizer files.\n"
        ) from err
    return _load_from_config(resolved_vocab_file)
def from_pretrained(cls, pretrained_model_name_or_path, *model_args,
                    **kwargs):
    """
    Creates an instance of `AutoTokenizer`. Related resources are loaded by
    specifying name of a built-in pretrained model, or a community-contributed
    pretrained model, or a local file directory path.

    Args:
        pretrained_model_name_or_path (str): Name of pretrained model or dir path
            to load from. The string can be:

            - Name of built-in pretrained model
            - Name of a community-contributed pretrained model.
            - Local directory path which contains tokenizer related resources
              and tokenizer config file ("tokenizer_config.json").
        *model_args (tuple): position arguments for model `__init__`. If provided,
            use these as position argument values for tokenizer initialization.
        **kwargs (dict): keyword arguments for model `__init__`. If provided,
            use these to update pre-defined keyword argument values for tokenizer
            initialization.
        use_faster (bool, optional): Popped from `kwargs` and not forwarded to
            the tokenizer. Only consulted for built-in pretrained names: when
            `True` and the faster_tokenizers package is available, the entry
            flagged as faster in the tokenizer mapping is preferred over the
            python one. Default to `False`.

    Returns:
        PretrainedTokenizer: An instance of `PretrainedTokenizer`. `None` is
        returned when the tokenizer config file does not exist, or when the
        config names no tokenizer class and no known pattern occurs in
        `pretrained_model_name_or_path`.

    Raises:
        RuntimeError: If the community tokenizer config cannot be downloaded.
            The original download error is chained as the cause.

    Example:
        .. code-block::

            from paddlenlp.transformers import AutoTokenizer

            # Name of built-in pretrained model
            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
            print(type(tokenizer))
            # <class 'paddlenlp.transformers.bert.tokenizer.BertTokenizer'>

            # Name of community-contributed pretrained model
            tokenizer = AutoTokenizer.from_pretrained('yingyibiao/bert-base-uncased-sst-2-finetuned')
            print(type(tokenizer))
            # <class 'paddlenlp.transformers.bert.tokenizer.BertTokenizer'>

            # Load from local directory path
            tokenizer = AutoTokenizer.from_pretrained('./my_bert/')
            print(type(tokenizer))
            # <class 'paddlenlp.transformers.bert.tokenizer.BertTokenizer'>
    """
    # default not to use faster tokenizer
    use_faster = kwargs.pop("use_faster", False)

    def _load_with(init_class):
        # Import tokenizer class `init_class` from its paddlenlp module and
        # delegate the actual loading to it.
        class_name = cls._name_mapping[init_class]
        import_class = importlib.import_module(
            f"paddlenlp.transformers.{class_name}.tokenizer")
        tokenizer_class = getattr(import_class, init_class)
        logger.info("We are using %s to load '%s'." %
                    (tokenizer_class, pretrained_model_name_or_path))
        return tokenizer_class.from_pretrained(
            pretrained_model_name_or_path, *model_args, **kwargs)

    def _resolve(init_class):
        # Use the class named by the config when there is one, otherwise
        # guess it from patterns found in the model name / path.
        if init_class:
            return _load_with(init_class)
        # If no `init_class`, we use pattern recognition to recognize the
        # tokenizer class.
        print('We use pattern recognition to recognize the Tokenizer class.')
        lowered_name = pretrained_model_name_or_path.lower()
        for key, pattern in cls._name_mapping.items():
            if pattern in lowered_name:
                return _load_with(key)
        return None

    # From built-in pretrained models
    for names, tokenizer_classes in cls._tokenizer_mapping.items():
        if not any(name == pretrained_model_name_or_path for name in names):
            continue
        # Each entry is indexed as (tokenizer class, is-faster flag).
        # Default setting the python tokenizer to actual_tokenizer_class
        actual_tokenizer_class = next(
            (entry[0] for entry in tokenizer_classes if not entry[1]), None)
        if use_faster:
            if is_faster_tokenizers_available():
                faster_tokenizer_class = next(
                    (entry for entry in tokenizer_classes if entry[1]), None)
                if faster_tokenizer_class is not None:
                    actual_tokenizer_class = faster_tokenizer_class[0]
                else:
                    logger.warning(
                        f"The tokenizer {actual_tokenizer_class} doesn't have the faster version."
                        " Please check the map `paddlenlp.transformers.auto.tokenizer.FASTER_TOKENIZER_MAPPING_NAMES`"
                        " to see which faster tokenizers are currently supported."
                    )
            else:
                logger.warning(
                    "Can't find the faster_tokenizers package, "
                    "please ensure install faster_tokenizers correctly. "
                    "You can install faster_tokenizers by `pip install faster_tokenizers`"
                    "(Currently only work for linux platform).")
        logger.info("We are using %s to load '%s'." %
                    (actual_tokenizer_class, pretrained_model_name_or_path))
        return actual_tokenizer_class.from_pretrained(
            pretrained_model_name_or_path, *model_args, **kwargs)

    # From local dir path
    if os.path.isdir(pretrained_model_name_or_path):
        config_file = os.path.join(pretrained_model_name_or_path,
                                   cls.tokenizer_config_file)
        if not os.path.exists(config_file):
            return None
        with io.open(config_file, encoding="utf-8") as f:
            init_kwargs = json.load(f)
        # class name corresponds to this configuration
        init_class = init_kwargs.pop("init_class", None)
        if init_class is None:
            # Only the local config falls back to the `tokenizer_class` key.
            init_class = init_kwargs.pop("tokenizer_class", None)
        return _resolve(init_class)

    # Assuming from community-contributed pretrained models
    community_config_path = os.path.join(COMMUNITY_MODEL_PREFIX,
                                         pretrained_model_name_or_path,
                                         cls.tokenizer_config_file)
    default_root = os.path.join(MODEL_HOME, pretrained_model_name_or_path)
    try:
        resolved_vocab_file = get_path_from_url(community_config_path,
                                                default_root)
    except RuntimeError as err:
        logger.error(err)
        # Chain the cause so the underlying download failure stays visible.
        raise RuntimeError(
            f"Can't load tokenizer for '{pretrained_model_name_or_path}'.\n"
            f"Please make sure that '{pretrained_model_name_or_path}' is:\n"
            "- a correct model-identifier of built-in pretrained models,\n"
            "- or a correct model-identifier of community-contributed pretrained models,\n"
            "- or the correct path to a directory containing relevant tokenizer files.\n"
        ) from err

    if not os.path.exists(resolved_vocab_file):
        return None
    with io.open(resolved_vocab_file, encoding="utf-8") as f:
        init_kwargs = json.load(f)
    # class name corresponds to this configuration
    return _resolve(init_kwargs.pop("init_class", None))
def do_generation(args):
    """
    Build a GPT generation program in static-graph mode with tensor
    parallelism, run it on two sample prompts and a few test batches, and
    optionally save the inference model.

    Args:
        args: Parsed options. The attributes read here include the parallel
            degrees (`dp_degree`, `sharding_degree`, `pp_degree`,
            `mp_degree`), `seed`, `device`, `model_type`,
            `model_name_or_path`, the decoding options (`max_dec_len`,
            `decoding_strategy`, `temperature`, `topk`, `topp`, `fuse`) and
            `save_inference_model_then_exist`.

    Raises:
        AssertionError: If the parallel degrees, the AMP settings or the
            device are not supported.
        ValueError: If `args.model_name_or_path` is not one of the built-in
            pretrained models of the selected model class, since no model
            can be built in that case.
    """
    # Initialize the paddle and paddle fleet execute environment
    paddle.enable_static()

    assert args.dp_degree == 1, "Data parallel is not supported in inference"
    assert args.sharding_degree == 1, "Sharding parallel is temporarily not supported in inference"
    assert args.pp_degree == 1, "Pipeline parallel will be supported later"

    # With the other degrees fixed to 1, every worker is used for tensor
    # (model) parallelism.
    if args.mp_degree == 1:
        args.mp_degree = paddle.distributed.get_world_size()
    else:
        assert args.mp_degree == paddle.distributed.get_world_size(), \
            "If mp_degree is specified, the size must be the same as world_size"

    strategy = fleet.DistributedStrategy()
    strategy.tensor_parallel = True
    strategy.tensor_parallel_configs = {
        "tensor_parallel_degree": args.mp_degree
    }

    fleet.init(is_collective=True, strategy=strategy)

    # temp use dynamic init, use HybridParallelInferenceHelper in future?
    paddle.distributed.init_parallel_env()

    # Create the random seed for the worker
    random.seed(args.seed)
    np.random.seed(args.seed)
    paddle.seed(args.seed)
    get_rng_state_tracker().add('global_seed', args.seed)
    get_rng_state_tracker().add('local_seed',
                                args.seed + fleet.worker_index() + 2021)

    if args.use_amp and args.amp_level == "O2":
        assert (args.mp_degree == 1 and args.pp_degree == 1
                ), "When amp level is O2, mp_degree and pp_degree should be 1."
        assert (args.use_sharding == False
                ), "When amp level is O2, use_sharding should be False."

    assert args.device in [
        "cpu", "gpu", "xpu"
    ], "Invalid device! Available device should be cpu, gpu, or xpu."
    place = paddle.set_device(args.device)

    worker_num = fleet.worker_num()
    worker_index = fleet.worker_index()
    local_rank = 0 if fleet.local_rank() is None else int(fleet.local_rank())

    topo = Topology(
        device_rank=worker_index,
        world_size=worker_num,
        dp_degree=args.dp_degree,
        pp_degree=args.pp_degree,
        sharding_degree=args.sharding_degree,
        mp_degree=args.mp_degree)

    logger.info("The topo of hybrid parallelism:\n{}".format(topo))

    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    pretrained_models_list = list(
        model_class.pretrained_init_configuration.keys())

    data_file = get_data_file(args)
    main_program = paddle.static.default_main_program()
    startup_program = paddle.static.default_startup_program()
    with paddle.static.program_guard(main_program, startup_program):
        with paddle.utils.unique_name.guard():
            with paddle.static.device_guard('gpu:0'):
                feeds = create_data_holder(args)
                tokenizer = tokenizer_class.from_pretrained(
                    args.model_name_or_path)
                eos_id = tokenizer.eos_token_id

                _, _, test_data_loader = create_pretrained_dataset(
                    args,
                    data_file,
                    local_rank=local_rank,
                    data_world_size=topo.data_info.size,
                    data_world_rank=topo.data_info.rank,
                    eos_id=eos_id,
                    max_seq_len=args.max_seq_len,
                    places=paddle.static.cuda_places(),
                    data_holders=feeds,
                    pipeline_mode=False)

                if args.model_name_or_path not in pretrained_models_list:
                    logger.error("No checkpoint load.")
                    # Without a built-in configuration no model can be
                    # built; fail here instead of on an unbound `model`.
                    raise ValueError(
                        "Unsupported model_name_or_path '{}': expected one of "
                        "the built-in pretrained models {}.".format(
                            args.model_name_or_path, pretrained_models_list))

                # Work on a copy so the class-level pretrained configuration
                # is not modified by the per-run settings below.
                model_config = dict(model_class.pretrained_init_configuration[
                    args.model_name_or_path])
                model_config["hidden_dropout_prob"] = args.hidden_dropout_prob
                model_config[
                    "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob
                model_config["topo"] = topo
                model_config["fuse"] = args.fuse
                model = GPTForGeneration(
                    GPTModel(**model_config),
                    max_length=args.max_dec_len,
                    decoding_strategy=args.decoding_strategy,
                    temperature=args.temperature,
                    top_k=args.topk,
                    top_p=args.topp,
                    eos_id=eos_id,
                    fuse=args.fuse)

                model.eval()
                ins = {v.name: v for v in feeds}
                preds = model(ins)

    # Define the Executor for running the static model
    exe = paddle.static.Executor(place)
    exe.run(startup_program)
    main_program = main_program.clone(for_test=True)

    model_urls = model.pretrained_resource_files_map['model_state']
    model_path = args.model_name_or_path
    # Bound up front so the check below is valid whichever branch runs.
    flag_loaded = False
    if model_path in pretrained_models_list and model_path in model_urls:
        from paddle.utils.download import get_weights_path_from_url
        dygraph_path = get_weights_path_from_url(model_urls[model_path])
        if os.path.exists(dygraph_path):
            if args.sharding_degree > 1:
                logger.warning("Sharding should init with static vars")
            else:
                logger.info("Loading parameters from %s" % dygraph_path)
                init_static_with_params(
                    model,
                    paddle.load(
                        dygraph_path, return_numpy=True),
                    topo,
                    main_program)
                flag_loaded = True
    if not flag_loaded:
        logger.error("No checkpoint load.")

    fetchs = [preds]

    # Check results on two sample prompts
    text = [
        "Question: Where is the capital of China? Answer:",
        "Question:Who is the CEO of Apple? Answer:"
    ]
    inputs = tokenizer(
        text,
        padding=True,
        return_attention_mask=True,
        return_position_ids=True)
    ids = np.array(inputs["input_ids"]).reshape(len(text), -1).astype('int64')
    position_ids = np.array(inputs["position_ids"]).reshape(
        len(text), -1).astype('int64')
    attention_mask = np.array(inputs["attention_mask"]).reshape(
        len(text), -1).astype('float32')

    t_ids = paddle.fluid.core.Tensor()
    t_ids.set(ids, place)
    t_mask = paddle.fluid.core.Tensor()
    t_mask.set(attention_mask, place)
    t_pos = paddle.fluid.core.Tensor()
    t_pos.set(position_ids, place)
    feed_data = {'src_ids': t_ids, 'pos_ids': t_pos, 'input_mask': t_mask}
    ret = exe.run(main_program, feed=feed_data, fetch_list=fetchs)
    ret = np.array(ret[0])
    for i, generated in enumerate(ret):
        o = [int(x) for x in generated]
        ret_str = tokenizer.convert_ids_to_string(o)
        ret_str = text[i] + ret_str
        logger.info(ret_str)

    # Run the first six batches of the test loader; the fetched results are
    # not used afterwards.
    for step, batch in enumerate(test_data_loader()):
        ret = exe.run(main_program, feed=batch, fetch_list=fetchs)
        if step == 5:
            break

    if args.save_inference_model_then_exist:
        save_inference_model_dir = 'inference_model_pp{pp_degree}mp{mp_degree}'.format(
            pp_degree=args.pp_degree, mp_degree=args.mp_degree)
        inference_save_path = os.path.join(save_inference_model_dir,
                                           'rank_' + str(fleet.worker_index()),
                                           'step_' + str(0))
        print("saving inference models to {}".format(inference_save_path))
        feed_names = [v.name for v in feeds]
        fetchs_names = [v.name for v in fetchs]
        print('feeds: ', feed_names, 'fetches: ', fetchs_names)
        paddle.static.save_inference_model(
            inference_save_path, feeds, fetchs, exe, program=main_program)