def test_encoder_decoder_from_pretrained(self): load_weight_prefix = TFEncoderDecoderModel.load_weight_prefix config = self.get_encoder_decoder_config() encoder_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") decoder_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") input_ids = encoder_tokenizer("who sings does he love me with reba", return_tensors="tf").input_ids decoder_input_ids = decoder_tokenizer("Linda Davis", return_tensors="tf").input_ids with tempfile.TemporaryDirectory() as tmp_dirname: # Since most of HF's models don't have pretrained cross-attention layers, they are randomly # initialized even if we create models using `from_pretrained` method. # For the tests, the decoder need to be a model with pretrained cross-attention layers. # So we create pretrained models (without `load_weight_prefix`), save them, and later, # we load them using `from_pretrained`. # (we don't need to do this for encoder, but let's make the code more similar between encoder/decoder) encoder = TFAutoModel.from_pretrained("bert-base-uncased", name="encoder") # It's necessary to specify `add_cross_attention=True` here. decoder = TFAutoModelForCausalLM.from_pretrained( "bert-base-uncased", is_decoder=True, add_cross_attention=True, name="decoder" ) pretrained_encoder_dir = os.path.join(tmp_dirname, "pretrained_encoder") pretrained_decoder_dir = os.path.join(tmp_dirname, "pretrained_decoder") encoder.save_pretrained(pretrained_encoder_dir) decoder.save_pretrained(pretrained_decoder_dir) del encoder del decoder enc_dec_model = TFEncoderDecoderModel.from_encoder_decoder_pretrained( pretrained_encoder_dir, pretrained_decoder_dir, ) # check that the from pretrained methods work enc_dec_model.save_pretrained(tmp_dirname) enc_dec_model = TFEncoderDecoderModel.from_pretrained(tmp_dirname) output = enc_dec_model(input_ids, decoder_input_ids=decoder_input_ids, labels=decoder_input_ids) loss_pretrained = output.loss del enc_dec_model # Create the model using `__init__` with loaded ``pretrained`` encoder / decoder encoder = TFAutoModel.from_pretrained( pretrained_encoder_dir, load_weight_prefix=load_weight_prefix, name="encoder" ) decoder = TFAutoModelForCausalLM.from_pretrained( pretrained_decoder_dir, load_weight_prefix=load_weight_prefix, name="decoder" ) enc_dec_model = TFEncoderDecoderModel(config=config, encoder=encoder, decoder=decoder) output = enc_dec_model(input_ids, decoder_input_ids=decoder_input_ids, labels=decoder_input_ids) loss_init = output.loss max_diff = np.max(np.abs(loss_pretrained - loss_init)) expected_diff = 0.0 self.assertAlmostEqual(max_diff, expected_diff, places=4)
def test_model_for_causal_lm(self): for model_name in TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST[:1]: config = AutoConfig.from_pretrained(model_name) self.assertIsNotNone(config) self.assertIsInstance(config, GPT2Config) model = TFAutoModelForCausalLM.from_pretrained(model_name) model, loading_info = TFAutoModelForCausalLM.from_pretrained(model_name, output_loading_info=True) self.assertIsNotNone(model) self.assertIsInstance(model, TFGPT2LMHeadModel)
def test_small_model_tf(self): tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small") model = TFAutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small") conversation_agent = ConversationalPipeline(model=model, tokenizer=tokenizer) conversation = Conversation("hello") output = conversation_agent(conversation) self.assertEqual(output, Conversation(past_user_inputs=["hello"], generated_responses=["Hi"]))
def __init__( self, tokenizer_name: str, model_name: str, utterance_window: int = 4, ): self.tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_name) self.model = TFAutoModelForCausalLM.from_pretrained(model_name, from_pt=True)
def main(): # region Argument Parsing parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TFTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sanity checks if data_args.dataset_name is None and data_args.train_file is None and data_args.validation_file is None: raise ValueError( "Need either a dataset name or a training/validation file.") else: if data_args.train_file is not None: extension = data_args.train_file.split(".")[-1] assert extension in [ "csv", "json", "txt" ], "`train_file` should be a csv, json or txt file." if data_args.validation_file is not None: extension = data_args.validation_file.split(".")[-1] assert extension in [ "csv", "json", "txt" ], "`validation_file` should be a csv, json or txt file." if training_args.output_dir is not None: training_args.output_dir = Path(training_args.output_dir) os.makedirs(training_args.output_dir, exist_ok=True) if isinstance( training_args.strategy, tf.distribute.TPUStrategy) and not data_args.pad_to_max_length: logger.warning("We are training on TPU - forcing pad_to_max_length") data_args.pad_to_max_length = True # endregion # region Checkpoints # Detecting last checkpoint. checkpoint = None if len(os.listdir(training_args.output_dir) ) > 0 and not training_args.overwrite_output_dir: config_path = training_args.output_dir / CONFIG_NAME weights_path = training_args.output_dir / TF2_WEIGHTS_NAME if config_path.is_file() and weights_path.is_file(): checkpoint = training_args.output_dir logger.info( f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this" " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) else: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to continue regardless.") # endregion # region Setup logging # accelerator.is_local_main_process is only True for one process per machine. logger.setLevel(logging.INFO) datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() # endregion # If passed along, set the training seed now. if training_args.seed is not None: set_seed(training_args.seed) # region Load datasets # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", ) raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", ) else: data_files = {} dataset_args = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.train_file.split(".")[-1] if extension == "txt": extension = "text" dataset_args["keep_linebreaks"] = data_args.keep_linebreaks raw_datasets = load_dataset(extension, data_files=data_files, **dataset_args) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # endregion # region Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained(model_args.model_name_or_path) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning( "You are instantiating a new config instance from scratch.") if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained( model_args.model_name_or_path) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) # endregion # region Dataset preprocessing # First we tokenize all the texts. column_names = raw_datasets["train"].column_names text_column_name = "text" if "text" in column_names else column_names[0] if data_args.max_seq_length is None: max_seq_length = tokenizer.model_max_length if max_seq_length > 1024: logger.warning( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " "Picking 1024 instead. You can change that default value by passing --max_seq_length xxx." ) max_seq_length = 1024 else: if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) # First we tokenize all the texts. column_names = raw_datasets["train"].column_names text_column_name = "text" if "text" in column_names else column_names[0] def tokenize_function(examples): return tokenizer(examples[text_column_name]) tokenized_datasets = raw_datasets.map( tokenize_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on dataset", ) block_size = tokenizer.model_max_length if block_size > 1024: logger.warning( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " "Picking 1024 instead. You can reduce that value by passing --block_size xxx." ) block_size = 1024 # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. def group_texts(examples): # Concatenate all texts. concatenated_examples = { k: sum(examples[k], []) for k in examples.keys() } total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. if total_length >= block_size: total_length = (total_length // block_size) * block_size # Split by chunks of max_len. result = { k: [t[i:i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items() } result["labels"] = result["input_ids"].copy() return result # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower # to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map lm_datasets = tokenized_datasets.map( group_texts, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, desc=f"Grouping texts in chunks of {block_size}", ) train_dataset = lm_datasets["train"] if data_args.validation_file is not None: eval_dataset = lm_datasets["validation"] else: logger.info( f"Validation file not found: using {data_args.validation_split_percentage}% of the dataset as validation as provided in data_args" ) train_indices, val_indices = train_test_split( list(range(len(train_dataset))), test_size=data_args.validation_split_percentage / 100) eval_dataset = train_dataset.select(val_indices) train_dataset = train_dataset.select(train_indices) if data_args.max_train_samples is not None: train_dataset = train_dataset.select(range( data_args.max_train_samples)) if data_args.max_eval_samples is not None: eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), 3): logger.info( f"Sample {index} of the training set: {train_dataset[index]}.") # endregion with training_args.strategy.scope(): # region Prepare model if checkpoint is not None: model = TFAutoModelForCausalLM.from_pretrained(checkpoint, config=config) elif model_args.model_name_or_path: model = TFAutoModelForCausalLM.from_pretrained( model_args.model_name_or_path, config=config) else: logger.info("Training new model from scratch") model = TFAutoModelForCausalLM.from_config(config) model.resize_token_embeddings(len(tokenizer)) # endregion # region TF Dataset preparation num_replicas = training_args.strategy.num_replicas_in_sync train_generator = partial(sample_generator, train_dataset, tokenizer) train_signature = { feature: tf.TensorSpec(shape=(None, ), dtype=tf.int64) for feature in train_dataset.features if feature != "special_tokens_mask" } train_sig = (train_signature, train_signature["labels"]) options = tf.data.Options() options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF tf_train_dataset = (tf.data.Dataset.from_generator( train_generator, output_signature=train_sig).with_options(options).batch( batch_size=num_replicas * training_args.per_device_train_batch_size, drop_remainder=True).repeat(int( training_args.num_train_epochs))) eval_generator = partial(sample_generator, eval_dataset, tokenizer) eval_signature = { feature: tf.TensorSpec(shape=(None, ), dtype=tf.int64) for feature in eval_dataset.features if feature != "special_tokens_mask" } eval_sig = (eval_signature, eval_signature["labels"]) tf_eval_dataset = (tf.data.Dataset.from_generator( eval_generator, output_signature=eval_sig).with_options(options).batch( batch_size=num_replicas * training_args.per_device_eval_batch_size, drop_remainder=True).repeat(int( training_args.num_train_epochs))) # endregion # region Optimizer and loss batches_per_epoch = len(train_dataset) // ( num_replicas * training_args.per_device_train_batch_size) # Bias and layernorm weights are automatically excluded from the decay optimizer, lr_schedule = create_optimizer( init_lr=training_args.learning_rate, num_train_steps=int(training_args.num_train_epochs * batches_per_epoch), num_warmup_steps=training_args.warmup_steps, adam_beta1=training_args.adam_beta1, adam_beta2=training_args.adam_beta2, adam_epsilon=training_args.adam_epsilon, weight_decay_rate=training_args.weight_decay, ) def dummy_loss(y_true, y_pred): return tf.reduce_mean(y_pred) model.compile(optimizer=optimizer, loss={"loss": dummy_loss}) # endregion # region Training and validation logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {training_args.num_train_epochs}") logger.info( f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}" ) logger.info( f" Total train batch size = {training_args.per_device_train_batch_size * num_replicas}" ) history = model.fit( tf_train_dataset, validation_data=tf_eval_dataset, epochs=int(training_args.num_train_epochs), steps_per_epoch=len(train_dataset) // (training_args.per_device_train_batch_size * num_replicas), callbacks=[ SavePretrainedCallback(output_dir=training_args.output_dir) ], ) try: train_perplexity = math.exp(history.history["loss"][-1]) except OverflowError: train_perplexity = math.inf try: validation_perplexity = math.exp(history.history["val_loss"][-1]) except OverflowError: validation_perplexity = math.inf logger.info(f" Final train loss: {history.history['loss'][-1]:.3f}") logger.info(f" Final train perplexity: {train_perplexity:.3f}") logger.info( f" Final validation loss: {history.history['val_loss'][-1]:.3f}") logger.info( f" Final validation perplexity: {validation_perplexity:.3f}") # endregion if training_args.output_dir is not None: model.save_pretrained(training_args.output_dir) if training_args.push_to_hub: # You'll probably want to include some of your own metadata here! model.push_to_hub()
def main(): # region Argument Parsing parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses() # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_clm", model_args, data_args, framework="tensorflow") # Sanity checks if data_args.dataset_name is None and data_args.train_file is None and data_args.validation_file is None: raise ValueError("Need either a dataset name or a training/validation file.") else: if data_args.train_file is not None: extension = data_args.train_file.split(".")[-1] assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, json or txt file." if data_args.validation_file is not None: extension = data_args.validation_file.split(".")[-1] assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, json or txt file." if training_args.output_dir is not None: training_args.output_dir = Path(training_args.output_dir) os.makedirs(training_args.output_dir, exist_ok=True) # endregion # region Checkpoints # Detecting last checkpoint. checkpoint = None if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir: config_path = training_args.output_dir / CONFIG_NAME weights_path = training_args.output_dir / TF2_WEIGHTS_NAME if config_path.is_file() and weights_path.is_file(): checkpoint = training_args.output_dir logger.info( f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this" " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) else: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to continue regardless." ) # endregion # region Setup logging # accelerator.is_local_main_process is only True for one process per machine. logger.setLevel(logging.INFO) datasets.utils.logging.set_verbosity_warning() transformers.utils.logging.set_verbosity_info() # endregion # If passed along, set the training seed now. if training_args.seed is not None: set_seed(training_args.seed) # region Load datasets # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ # (the dataset will be downloaded automatically from the datasets Hub). # # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called # 'text' is found. You can easily tweak this behavior (see below). # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. if data_args.dataset_name is not None: # Downloading and loading a dataset from the hub. raw_datasets = load_dataset( data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) raw_datasets["train"] = load_dataset( data_args.dataset_name, data_args.dataset_config_name, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: data_files = {} dataset_args = {} if data_args.train_file is not None: data_files["train"] = data_args.train_file if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = ( data_args.train_file.split(".")[-1] if data_args.train_file is not None else data_args.validation_file.split(".")[-1] ) if extension == "txt": extension = "text" dataset_args["keep_linebreaks"] = data_args.keep_linebreaks raw_datasets = load_dataset( extension, data_files=data_files, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, **dataset_args, ) # If no validation data is there, validation_split_percentage will be used to divide the dataset. if "validation" not in raw_datasets.keys(): raw_datasets["validation"] = load_dataset( extension, data_files=data_files, split=f"train[:{data_args.validation_split_percentage}%]", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, **dataset_args, ) raw_datasets["train"] = load_dataset( extension, data_files=data_files, split=f"train[{data_args.validation_split_percentage}%:]", cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, **dataset_args, ) # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. # endregion # region Load pretrained model and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if model_args.config_name: config = AutoConfig.from_pretrained(model_args.config_name) elif model_args.model_name_or_path: config = AutoConfig.from_pretrained(model_args.model_name_or_path) else: config = CONFIG_MAPPING[model_args.model_type]() logger.warning("You are instantiating a new config instance from scratch.") if model_args.tokenizer_name: tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name) elif model_args.model_name_or_path: tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) else: raise ValueError( "You are instantiating a new tokenizer from scratch. This is not supported by this script." "You can do it from another script, save it, and load it from here, using --tokenizer_name." ) # endregion # region Dataset preprocessing # First we tokenize all the texts. column_names = raw_datasets["train"].column_names text_column_name = "text" if "text" in column_names else column_names[0] def tokenize_function(examples): return tokenizer(examples[text_column_name]) tokenized_datasets = raw_datasets.map( tokenize_function, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, desc="Running tokenizer on dataset", ) if data_args.block_size is None: block_size = tokenizer.model_max_length if block_size > 1024: logger.warning( f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " "Picking 1024 instead. You can change that default value by passing --block_size xxx." ) block_size = 1024 else: if data_args.block_size > tokenizer.model_max_length: logger.warning( f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." ) block_size = min(data_args.block_size, tokenizer.model_max_length) # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. def group_texts(examples): # Concatenate all texts. concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. if total_length >= block_size: total_length = (total_length // block_size) * block_size # Split by chunks of max_len. result = { k: [t[i : i + block_size] for i in range(0, total_length, block_size)] for k, t in concatenated_examples.items() } result["labels"] = result["input_ids"].copy() return result # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower # to preprocess. # # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map lm_datasets = tokenized_datasets.map( group_texts, batched=True, num_proc=data_args.preprocessing_num_workers, load_from_cache_file=not data_args.overwrite_cache, desc=f"Grouping texts in chunks of {block_size}", ) train_dataset = lm_datasets["train"] if data_args.validation_file is not None: eval_dataset = lm_datasets["validation"] else: logger.info( f"Validation file not found: using {data_args.validation_split_percentage}% of the dataset as validation" " as provided in data_args" ) train_indices, val_indices = train_test_split( list(range(len(train_dataset))), test_size=data_args.validation_split_percentage / 100 ) eval_dataset = train_dataset.select(val_indices) train_dataset = train_dataset.select(train_indices) if data_args.max_train_samples is not None: max_train_samples = min(len(train_dataset), data_args.max_train_samples) train_dataset = train_dataset.select(range(max_train_samples)) if data_args.max_eval_samples is not None: max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples) eval_dataset = eval_dataset.select(range(max_eval_samples)) # Log a few random samples from the training set: for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))): logger.info(f"Sample {index} of the training set: {train_dataset[index]}.") # endregion with training_args.strategy.scope(): # region Prepare model if checkpoint is not None: model = TFAutoModelForCausalLM.from_pretrained(checkpoint, config=config) elif model_args.model_name_or_path: model = TFAutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, config=config) else: logger.info("Training new model from scratch") model = TFAutoModelForCausalLM.from_config(config) model.resize_token_embeddings(len(tokenizer)) # endregion # region TF Dataset preparation num_replicas = training_args.strategy.num_replicas_in_sync options = tf.data.Options() options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names # yourself if you use this method, whereas they are automatically inferred from the model input names when # using model.prepare_tf_dataset() # For more info see the docs: # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset tf_train_dataset = model.prepare_tf_dataset( train_dataset, shuffle=True, batch_size=num_replicas * training_args.per_device_train_batch_size, ).with_options(options) tf_eval_dataset = model.prepare_tf_dataset( eval_dataset, shuffle=False, batch_size=num_replicas * training_args.per_device_eval_batch_size, drop_remainder=True, ).with_options(options) # endregion # region Optimizer and loss num_train_steps = len(tf_train_dataset) * int(training_args.num_train_epochs) if training_args.warmup_steps > 0: num_warmup_steps = training_args.warmup_steps elif training_args.warmup_ratio > 0: num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) else: num_warmup_steps = 0 # Bias and layernorm weights are automatically excluded from the decay optimizer, lr_schedule = create_optimizer( init_lr=training_args.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, adam_beta1=training_args.adam_beta1, adam_beta2=training_args.adam_beta2, adam_epsilon=training_args.adam_epsilon, weight_decay_rate=training_args.weight_decay, adam_global_clipnorm=training_args.max_grad_norm, ) # no user-specified loss = will use the model internal loss model.compile(optimizer=optimizer, jit_compile=training_args.xla) # endregion # region Preparing push_to_hub and model card push_to_hub_model_id = training_args.push_to_hub_model_id model_name = model_args.model_name_or_path.split("/")[-1] if not push_to_hub_model_id: if data_args.dataset_name is not None: push_to_hub_model_id = f"{model_name}-finetuned-{data_args.dataset_name}" else: push_to_hub_model_id = f"{model_name}-finetuned-clm" model_card_kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "text-generation"} if data_args.dataset_name is not None: model_card_kwargs["dataset_tags"] = data_args.dataset_name if data_args.dataset_config_name is not None: model_card_kwargs["dataset_args"] = data_args.dataset_config_name model_card_kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}" else: model_card_kwargs["dataset"] = data_args.dataset_name if training_args.push_to_hub: callbacks = [ PushToHubCallback( output_dir=training_args.output_dir, model_id=push_to_hub_model_id, organization=training_args.push_to_hub_organization, token=training_args.push_to_hub_token, tokenizer=tokenizer, **model_card_kwargs, ) ] else: callbacks = [] # endregion # region Training and validation logger.info("***** Running training *****") logger.info(f" Num examples = {len(train_dataset)}") logger.info(f" Num Epochs = {training_args.num_train_epochs}") logger.info(f" Instantaneous batch size per device = {training_args.per_device_train_batch_size}") logger.info(f" Total train batch size = {training_args.per_device_train_batch_size * num_replicas}") # For long training runs, you may wish to use the PushToHub() callback here to save intermediate checkpoints # to the Hugging Face Hub rather than just pushing the finished model. # See https://huggingface.co/docs/transformers/main_classes/keras_callbacks#transformers.PushToHubCallback history = model.fit( tf_train_dataset, validation_data=tf_eval_dataset, epochs=int(training_args.num_train_epochs), callbacks=callbacks, ) train_loss = history.history["loss"][-1] try: train_perplexity = math.exp(train_loss) except OverflowError: train_perplexity = math.inf logger.info(f" Final train loss: {train_loss:.3f}") logger.info(f" Final train perplexity: {train_perplexity:.3f}") validation_loss = history.history["val_loss"][-1] try: validation_perplexity = math.exp(validation_loss) except OverflowError: validation_perplexity = math.inf logger.info(f" Final validation loss: {validation_loss:.3f}") logger.info(f" Final validation perplexity: {validation_perplexity:.3f}") if training_args.output_dir is not None: output_eval_file = os.path.join(training_args.output_dir, "all_results.json") results_dict = dict() results_dict["train_loss"] = train_loss results_dict["train_perplexity"] = train_perplexity results_dict["eval_loss"] = validation_loss results_dict["eval_perplexity"] = validation_perplexity with open(output_eval_file, "w") as writer: writer.write(json.dumps(results_dict)) # endregion if training_args.output_dir is not None and not training_args.push_to_hub: # If we're not pushing to hub, at least save a local copy when we're done model.save_pretrained(training_args.output_dir)
import pickle, random import tensorflow as tf from itertools import chain from transformers import AutoTokenizer, TFAutoModelForCausalLM datafile = 'personachat_self_personalities.pickle' model_path = 'personachat-distilgpt2' N = 20 # Maximum number of words to output k = 10 # Top-k items to select from p = 0.8 # Top-p cumulative probability to select from # Load the trained model and tokenizer tokenizer = AutoTokenizer.from_pretrained(model_path) model = TFAutoModelForCausalLM.from_pretrained(model_path, use_cache=True, return_dict=False) bos = tokenizer.bos_token_id bot, person = tokenizer.encode('<bot>', '<person>') # Load personalities dataset with open(datafile, 'rb') as fp: personalities = pickle.load(fp) def feed(input_ids, speaker, past=None): return model({ 'input_ids': tf.constant(input_ids), 'token_type_ids': tf.fill(len(input_ids), int(speaker == person)) }, past) def new_chat():
from transformers import T5Tokenizer if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--model_dir", type=str, default="../data/model/huggingface_model") args = parser.parse_args() random.seed(42) tokenizer = T5Tokenizer.from_pretrained(args.model_dir, extra_ids=0) print(len(tokenizer)) pt_model = AutoModelForCausalLM.from_pretrained(args.model_dir) tf_model = TFAutoModelForCausalLM.from_pretrained(args.model_dir) prompt = "誰も到達していないArtificial Intelligenceの高みへ、ともに" with amp.autocast(): pt_tensor = tokenizer(prompt, return_tensors="pt")["input_ids"] output_sequences = pt_model.generate( input_ids=pt_tensor, max_length=50 + pt_tensor.size(1), top_p=0.95, top_k=50, do_sample=True, early_stopping=True, bos_token_id=tokenizer.bos_token_id, eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id, num_return_sequences=1)