import json
import os
import unittest

from transformers import is_torch_available

# NOTE: `is_deepspeed_available` has moved between modules across transformers
# releases; `transformers.integrations` is one known location.
from transformers.integrations import is_deepspeed_available
from transformers.testing_utils import (
    ExtendSysPath,
    require_torch_gpu,
    require_torch_multi_gpu,
    slow,
)
from transformers.trainer_utils import set_seed


bindir = os.path.abspath(os.path.dirname(__file__))
with ExtendSysPath(f"{bindir}/.."):
    from test_trainer import TrainerIntegrationCommon  # noqa

    if is_torch_available():
        from test_trainer import get_regression_trainer  # noqa


set_seed(42)

MBART_TINY = "sshleifer/tiny-mbart"
T5_SMALL = "t5-small"


def load_json(path):
    with open(path) as f:
        return json.load(f)


# a candidate for testing_utils
def require_deepspeed(test_case):
    """
    Decorator marking a test that requires deepspeed
    """
    if not is_deepspeed_available():
        # Skip rather than fail when deepspeed is not installed.
        return unittest.skip("test requires deepspeed")(test_case)
    return test_case
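
# --- Usage sketch (illustrative only; `TestDeepSpeedStub` is a hypothetical
# name, not part of the original suite): `require_deepspeed` stacks with the
# other markers imported above and skips the whole class when deepspeed is
# missing.
@require_deepspeed
@require_torch_gpu
class TestDeepSpeedStub(unittest.TestCase):
    def test_deepspeed_importable(self):
        import deepspeed  # noqa: F401  # only reached when deepspeed is installed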
def __init__(
    self,
    args: TFTrainingArguments,
    model,
    num_train_examples,
    total_train_batch_size: Optional[int] = 8,
    train_dataset: Optional[tf.data.Dataset] = None,
    eval_dataset: Optional[tf.data.Dataset] = None,
    compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
    tb_writer: Optional[tf.summary.SummaryWriter] = None,
    optimizers: Tuple[
        tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule
    ] = (None, None),
    **kwargs,
):
    assert parse(tf.__version__).release >= (2, 2, 0), (
        "You need to run the TensorFlow trainer with at least version 2.2.0; your version is %r" % tf.__version__
    )

    self.num_train_examples = num_train_examples
    self.total_train_batch_size = total_train_batch_size
    self.model = model
    self.args = args
    self.train_dataset = train_dataset
    self.eval_dataset = eval_dataset
    self.compute_metrics = compute_metrics
    self.optimizer, self.lr_scheduler = optimizers
    self.gradient_accumulator = GradientAccumulator()
    self.global_step = 0
    self.epoch_logging = 0

    if "prediction_loss_only" in kwargs:
        warnings.warn(
            "Passing `prediction_loss_only` as a keyword argument is deprecated and won't be possible in a "
            "future version. Use `args.prediction_loss_only` instead.",
            FutureWarning,
        )
        self.args.prediction_loss_only = kwargs.pop("prediction_loss_only")
    assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."

    # if tb_writer is not None:
    #     self.tb_writer = tb_writer
    # else:
    #     self.tb_writer = tf.summary.create_file_writer(self.args.logging_dir)

    # if is_wandb_available():
    #     self.setup_wandb()
    # elif os.environ.get("WANDB_DISABLED") != "true":
    #     logger.info(
    #         "You are instantiating a Trainer but W&B is not installed. To use wandb logging, "
    #         "run `pip install wandb; wandb login` see https://docs.wandb.com/huggingface."
    #     )

    # if is_comet_available():
    #     self.setup_comet()
    # elif os.environ.get("COMET_MODE") != "DISABLED":
    #     logger.info(
    #         "To use comet_ml logging, run `pip/conda install comet_ml` "
    #         "see https://www.comet.ml/docs/python-sdk/huggingface/"
    #     )

    set_seed(self.args.seed)
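
# --- Standalone sketch (an aside, not from the original file): the TF-version
# guard above compares `parse(...).release`, a tuple of ints, which avoids the
# classic string-comparison trap where "2.10" sorts before "2.2".
from packaging.version import parse as parse_version

assert parse_version("2.10.0").release >= (2, 2, 0)  # tuple comparison: correct
assert "2.10.0" < "2.2.0"  # lexicographic string comparison: misleading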
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log the small summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity of the Transformers logger to info (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column. You can easily tweak this
    # behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
            )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
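    # --- Illustrative aside (comments only; "wikitext" is just an example
    # dataset, not one this script mandates): the percentage slices above use
    # the `datasets` split syntax. With validation_split_percentage=5:
    #
    #   val   = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[:5%]")
    #   train = load_dataset("wikitext", "wikitext-2-raw-v1", split="train[5%:]")
    #
    # i.e. one deterministic 5%/95% partition carved out of the single "train" split.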
    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    # if model_args.config_name:
    #     config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    # elif model_args.model_name_or_path:
    #     config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    # else:
    #     config = CONFIG_MAPPING[model_args.model_type]()
    #     logger.warning("You are instantiating a new config instance from scratch.")
    pretrained_name = "bert-base-uncased"
    config = BertConfig.from_pretrained(pretrained_name)
    tokenizer = BertTokenizer.from_pretrained(pretrained_name)

    # if model_args.model_name_or_path:
    #     model = AutoModelForMaskedLM.from_pretrained(
    #         model_args.model_name_or_path,
    #         from_tf=bool(".ckpt" in model_args.model_name_or_path),
    #         config=config,
    #         cache_dir=model_args.cache_dir,
    #     )
    # else:
    #     logger.info("Training new model from scratch")
    #     model = AutoModelForMaskedLM.from_config(config)
    model = LitOutputBertModel(
        name="test", config=config, tokenizer=tokenizer, pretrained_name=pretrained_name
    )
    model.bert.train()

    print("Requiring grads for bert")
    for param in model.bert.parameters(recurse=True):
        param.requires_grad = True
    for param in model.training_bert.parameters(recurse=True):
        param.requires_grad = True

    model.bert.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if data_args.pad_to_max_length else False

        def tokenize_function(examples):
            # Remove empty lines
            examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
            return tokenizer(
                examples["text"],
                padding=padding,
                truncation=True,
                max_length=data_args.max_seq_length,
                # We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
                # receives the `special_tokens_mask`.
                return_special_tokens_mask=True,
            )

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=[text_column_name],
            load_from_cache_file=not data_args.overwrite_cache,
        )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
        # efficient when it receives the `special_tokens_mask`.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name], return_special_tokens_mask=True)

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if data_args.max_seq_length is None:
        max_seq_length = tokenizer.model_max_length
    else:
        if data_args.max_seq_length > tokenizer.model_max_length:
            logger.warning(
                f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
                f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
            )
        max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    # Main data processing function that will concatenate all texts from our dataset and generate chunks of
    # max_seq_length.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder; we could pad instead if the model supported it. You can customize this
        # part to your needs.
        total_length = (total_length // max_seq_length) * max_seq_length
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
            for k, t in concatenated_examples.items()
        }
        return result

    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
    # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
    # might be slower to preprocess.
    #
    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
    tokenized_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Data collator
    # This one will take care of randomly masking the tokens.
    # `_output_all_columns` is a private `datasets` flag; setting it keeps every column
    # (including `special_tokens_mask`) visible to the custom collator.
    tokenized_datasets["train"]._output_all_columns = True
    data_collator = CustomDataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm_probability=data_args.mlm_probability
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if (model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path))
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_mlm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f" {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
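
# --- Standalone sketch (illustrative; mirrors the logic of `group_texts`
# above on toy data): with max_seq_length=4, two tokenized examples are
# concatenated, the tail that cannot fill a whole chunk is dropped, and the
# remainder is re-chunked into fixed-length blocks.
def _group_texts_demo():
    examples = {"input_ids": [[1, 2, 3], [4, 5, 6, 7, 8, 9]]}
    max_seq_length = 4
    concatenated = {k: sum(examples[k], []) for k in examples}  # [1, ..., 9]
    total_length = (len(concatenated["input_ids"]) // max_seq_length) * max_seq_length  # 8
    chunks = {
        k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
        for k, t in concatenated.items()
    }
    assert chunks == {"input_ids": [[1, 2, 3, 4], [5, 6, 7, 8]]}  # token 9 dropped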
import typing as t
from typing import TYPE_CHECKING

import pytest
import requests

import transformers.pipelines
from transformers.trainer_utils import set_seed

import bentoml

if TYPE_CHECKING:
    from bentoml._internal.external_typing import transformers as ext


set_seed(123)

MODEL_ID = "julien-c/dummy-unknown"
REVISION_ID_INVALID = "e10"

model_name = "gpt2"
test_sentence = {"text": "A Bento box is a "}
batched_sentence = [
    "I love you and I want to spend my whole life with you",
    "I hate you, Lyon, you broke my heart.",
]
result = (
    "A Bento box is a urn that is used to store the contents of a Bento box. "
    "It is usually used to store the contents of a Bento box in a storage container."
    "\n\nThe Bento box is a small, rectangular"
)
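
# --- Usage sketch (an assumption about how these fixtures are exercised, not
# code from the original file): with set_seed(123) applied above, a gpt2
# text-generation pipeline produces repeatable output that can be compared
# against `result`; the exact text still varies across transformers versions.
generator = transformers.pipelines.pipeline("text-generation", model=model_name)
output = generator(test_sentence["text"], max_length=50)
print(output[0]["generated_text"])  # expected to begin like `result`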