# overwrite batch size until we have tf_glue.py args.per_device_train_batch_size = 16 args.per_device_eval_batch_size = 16 # Set up logging logger = logging.getLogger(__name__) logging.basicConfig( level=logging.getLevelName("INFO"), handlers=[logging.StreamHandler(sys.stdout)], format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", ) # Load model and tokenizer model = TFAutoModelForSequenceClassification.from_pretrained(args.model_name_or_path) tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path) # Load dataset train_dataset, test_dataset = load_dataset("imdb", split=["train", "test"]) train_dataset = train_dataset.shuffle().select( range(5000) ) # smaller the size for train dataset to 5k test_dataset = test_dataset.shuffle().select( range(500) ) # smaller the size for test dataset to 500 # Preprocess train dataset train_dataset = train_dataset.map( lambda e: tokenizer(e["text"], truncation=True, padding="max_length"), batched=True )
def main(): # region Argument parsing # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TFTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) output_dir = Path(training_args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) # endregion # region Checkpoints # Detecting last checkpoint. checkpoint = None if len(os.listdir(training_args.output_dir) ) > 0 and not training_args.overwrite_output_dir: if (output_dir / CONFIG_NAME).is_file() and ( output_dir / TF2_WEIGHTS_NAME).is_file(): checkpoint = output_dir logger.info( f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this" " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) else: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to continue regardless.") # endregion # region Logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO) logger.info(f"Training/evaluation parameters {training_args}") # endregion # region Loading data # For CSV/JSON files, this script will use the 'label' field as the label and the 'sentence1' and optionally # 'sentence2' fields as inputs if they exist. If not, the first two fields not named label are used if at least two # columns are provided. Note that the term 'sentence' can be slightly misleading, as they often contain more than # a single grammatical sentence, when the task requires it. # # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this # single column. You can easily tweak this behavior (see below) # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. data_files = { "train": data_args.train_file, "validation": data_args.validation_file, "test": data_args.test_file } data_files = { key: file for key, file in data_files.items() if file is not None } for key in data_files.keys(): logger.info(f"Loading a local file for {key}: {data_files[key]}") if data_args.input_file_extension == "csv": # Loading a dataset from local csv files datasets = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir) else: # Loading a dataset from local json files datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. # endregion # region Label preprocessing # If you've passed us a training set, we try to infer your labels from it if "train" in datasets: # By default we assume that if your label column looks like a float then you're doing regression, # and if not then you're doing classification. This is something you may want to change! is_regression = datasets["train"].features["label"].dtype in [ "float32", "float64" ] if is_regression: num_labels = 1 else: # A useful fast method: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique label_list = datasets["train"].unique("label") label_list.sort() # Let's sort it for determinism num_labels = len(label_list) # If you haven't passed a training set, we read label info from the saved model (this happens later) else: num_labels = None label_list = None is_regression = None # endregion # region Load model config and tokenizer if checkpoint is not None: config_path = training_args.output_dir elif model_args.config_name: config_path = model_args.config_name else: config_path = model_args.model_name_or_path if num_labels is not None: config = AutoConfig.from_pretrained( config_path, num_labels=num_labels, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) else: config = AutoConfig.from_pretrained( config_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # endregion # region Dataset preprocessing # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. column_names = { col for cols in datasets.column_names.values() for col in cols } non_label_column_names = [name for name in column_names if name != "label"] if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: sentence1_key, sentence2_key = "sentence1", "sentence2" elif "sentence1" in non_label_column_names: sentence1_key, sentence2_key = "sentence1", None else: if len(non_label_column_names) >= 2: sentence1_key, sentence2_key = non_label_column_names[:2] else: sentence1_key, sentence2_key = non_label_column_names[0], None if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) # Ensure that our labels match the model's, if it has some pre-specified if "train" in datasets: if not is_regression and config.label2id != PretrainedConfig( num_labels=num_labels).label2id: label_name_to_id = config.label2id if list(sorted(label_name_to_id.keys())) == list( sorted(label_list)): label_to_id = label_name_to_id # Use the model's labels else: logger.warning( "Your model seems to have been trained with labels, but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." "\nIgnoring the model labels as a result.", ) label_to_id = {v: i for i, v in enumerate(label_list)} elif not is_regression: label_to_id = {v: i for i, v in enumerate(label_list)} else: label_to_id = None # Now we've established our label2id, let's overwrite the model config with it. config.label2id = label_to_id if config.label2id is not None: config.id2label = {id: label for label, id in label_to_id.items()} else: config.id2label = None else: label_to_id = config.label2id # Just load the data from the model if "validation" in datasets and config.label2id is not None: validation_label_list = datasets["validation"].unique("label") for val_label in validation_label_list: assert val_label in label_to_id, f"Label {val_label} is in the validation set but not the training set!" def preprocess_function(examples): # Tokenize the texts args = ((examples[sentence1_key], ) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])) result = tokenizer(*args, max_length=max_seq_length, truncation=True) # Map labels to IDs if config.label2id is not None and "label" in examples: result["label"] = [(config.label2id[l] if l != -1 else -1) for l in examples["label"]] return result datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) # endregion with training_args.strategy.scope(): # region Load pretrained model # Set seed before initializing model set_seed(training_args.seed) # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if checkpoint is None: model_path = model_args.model_name_or_path else: model_path = checkpoint model = TFAutoModelForSequenceClassification.from_pretrained( model_path, config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # endregion # region Optimizer, loss and compilation optimizer = tf.keras.optimizers.Adam( learning_rate=training_args.learning_rate, beta_1=training_args.adam_beta1, beta_2=training_args.adam_beta2, epsilon=training_args.adam_epsilon, clipnorm=training_args.max_grad_norm, ) if is_regression: loss_fn = tf.keras.losses.MeanSquaredError() metrics = [] else: loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( from_logits=True) metrics = ["accuracy"] model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics) # endregion # region Convert data to a tf.data.Dataset tf_data = dict() max_samples = { "train": data_args.max_train_samples, "validation": data_args.max_val_samples, "test": data_args.max_test_samples, } for key in ("train", "validation", "test"): if key not in datasets: tf_data[key] = None continue if key in ("train", "validation"): assert "label" in datasets[ key].features, f"Missing labels from {key} data!" if key == "train": shuffle = True batch_size = training_args.per_device_train_batch_size drop_remainder = True # Saves us worrying about scaling gradients for the last batch else: shuffle = False batch_size = training_args.per_device_eval_batch_size drop_remainder = False samples_limit = max_samples[key] dataset = datasets[key] if samples_limit is not None: dataset = dataset.select(range(samples_limit)) if isinstance( training_args.strategy, tf.distribute.TPUStrategy) or data_args.pad_to_max_length: logger.info( "Padding all batches to max length because argument was set or we're on TPU." ) dataset_mode = "constant_batch" else: dataset_mode = "variable_batch" data = convert_dataset_for_tensorflow( dataset, non_label_column_names, batch_size=batch_size, dataset_mode=dataset_mode, drop_remainder=drop_remainder, shuffle=shuffle, ) tf_data[key] = data # endregion # region Training and validation if tf_data["train"] is not None: callbacks = [ SavePretrainedCallback(output_dir=training_args.output_dir) ] model.fit( tf_data["train"], validation_data=tf_data["validation"], epochs=int(training_args.num_train_epochs), callbacks=callbacks, ) elif tf_data["validation"] is not None: # If there's a validation dataset but no training set, just evaluate the metrics logger.info("Computing metrics on validation data...") if is_regression: loss = model.evaluate(tf_data["validation"]) logger.info(f"Loss: {loss:.5f}") else: loss, accuracy = model.evaluate(tf_data["validation"]) logger.info( f"Loss: {loss:.5f}, Accuracy: {accuracy * 100:.4f}%") # endregion # region Prediction if tf_data["test"] is not None: logger.info("Doing predictions on test dataset...") predictions = model.predict(tf_data["test"])["logits"] predicted_class = np.squeeze( predictions) if is_regression else np.argmax(predictions, axis=1) output_test_file = os.path.join(training_args.output_dir, "test_results.txt") with open(output_test_file, "w") as writer: writer.write("index\tprediction\n") for index, item in enumerate(predicted_class): if is_regression: writer.write(f"{index}\t{item:3.3f}\n") else: item = config.id2label[item] writer.write(f"{index}\t{item}\n") logger.info(f"Wrote predictions to {output_test_file}!") # endregion # region Prediction losses # This section is outside the scope() because it's very quick to compute, but behaves badly inside it if "test" in datasets and "label" in datasets["test"].features: print("Computing prediction loss on test labels...") labels = datasets["test"]["label"] loss = float(loss_fn(labels, predictions).numpy()) print(f"Test loss: {loss:.4f}")
def train(self): import tensorflow_addons as tfa import tensorflow as tf self.scheduler = None # Encoding train_texts and val_texts # x-digit code -> unique number self.classes = utils.Bidict( {c: i for i, c in enumerate(self.df_train['cat1'].unique())}) # Convert datasets to transformer format train_dataset = self._create_transformer_dataset(self.df_train) val_data = train_dataset if settings.VALID_PER_CLASS: val_data = self._create_transformer_dataset(self.df_valid) val_data = val_data.batch(settings.MINIBATCH) # Fine-tune the pre-trained model if self.train_with_tensorflow_directly: from transformers import TFAutoModelForSequenceClassification model = TFAutoModelForSequenceClassification.from_pretrained( self.pretrained_model_name, num_labels=len(self.classes), from_pt=True) # In AdamW weight decay value had no effect in our tests. # hard to find stable tuning, harder to beat vanilla Adam # optimizer = AdamW( # learning_rate=0.00005, # # 0.1 (default but might be too large), 0.01 # weight_decay=0.002 # ) if 1: learning_rate = settings.LEARNING_RATE if learning_rate is None: if settings.FULL_TEXT: # 5e-5 can fail on L1 e.g. Seed 7 # 1e-5 looks OK (TBC) on L1 # but not on L3 (no convergence) # 5e-6 very slow # Candidates: 5E-06 (40%), 7E-06 (49%) # FOR L3 # learning_rate = 7E-06 # FOR L1 # learning_rate = 5e-6 # legal-small ## learning_rate = 7e-6 # legal-base learning_rate = 1e-5 else: # learning_rate = 5e-5 learning_rate = 3e-5 print(f'\n LR = {learning_rate:0.0E}') from transformers import AdamWeightDecay optimizer = AdamWeightDecay( learning_rate=learning_rate, # 0.1 (default but might be too large), 0.01 # weight_decay_rate=0.0 ) else: if settings.FULL_TEXT: learning_rate = 1e-3 else: learning_rate = 7e-3 print(f'\n LR = {learning_rate:0.0E}') optimizer = tfa.optimizers.SGDW( learning_rate=learning_rate, momentum=0.0, weight_decay=0.00 # 0.1 (default but might be too large), 0.01 # weight_decay_rate=0.5 ) # optimizer = tf.keras.optimizers.MomentumOptimizer( # learning_rate=self.learning_rate # ) model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy']) callbacks = None self.scheduler = None if self.scheduler_class: self.scheduler = self.scheduler_class() callbacks = [self.scheduler] self.scheduler.pretrain(model, train_dataset) class_weight = None if settings.CLASS_WEIGHT: class_sizes = self.df_train.cat1.value_counts() class_sizes_max = class_sizes.max() class_weight = { i: class_sizes_max / class_sizes[c] for c, i in self.classes.items() } self.history = model.fit( train_dataset.batch(settings.MINIBATCH), epochs=self.max_epochs, batch_size=settings.MINIBATCH, validation_data=val_data, callbacks=callbacks, class_weight=class_weight, ) self.model = model self.render_training() else: # TODO: fix this. Almost immediate but random results # compared to TF method above. from transformers import (TFTrainer, TFTrainingArguments, TFAutoModelForSequenceClassification) # from transformers import TFAutoModel training_args = TFTrainingArguments( output_dir=self.results_path, # output directory num_train_epochs=self. max_epochs, # total number of training epochs per_device_train_batch_size=settings.MINIBATCH, # batch size per device during training per_device_eval_batch_size=64, # batch size for evaluation warmup_steps=500, # number of warmup steps for learning rate scheduler weight_decay=0.01, # strength of weight decay logging_dir=self.logs_path, # directory for storing logs ) print('h2') with training_args.strategy.scope(): # trainer_model = TFSequenceClassification.from_pretrained( # Transformers.pretrained_model, num_labels=len(self.classes) # ) trainer_model = TFAutoModelForSequenceClassification.from_pretrained( Transformers.pretrained_model_name, num_labels=len(self.classes)) print('h3') trainer = TFTrainer( model=trainer_model, args=training_args, train_dataset=train_dataset, eval_dataset=train_dataset, ) print('h4') trainer.train() # print(trainer.evaluate()) self.model = trainer_model
from transformers import (TFDistilBertForSequenceClassification, TFAutoModelForSequenceClassification, DistilBertTokenizerFast) # # Configurations # In[2]: model_name = 'distilbert-base-uncased' max_sequence_length = 256 # In[3]: # model = TFDistilBertForSequenceClassification.from_pretrained(model_name) model = TFAutoModelForSequenceClassification.from_pretrained(model_name) tokenizer = DistilBertTokenizerFast.from_pretrained( model_name, truncation=True, padding=True, max_length=max_sequence_length, return_tensors="tf") # # Tokenize # # Note that you need to convert the result of the ```tokenizer``` which is ```transformers.tokenization_utils_base.BatchEncoding``` instance into dictionary to feed into the model. # In[4]: tokenized = tokenizer(
def copy_model_files(self, force=False): modified = False src_path = self.checkpoint_path d = None try: if force or not (self.git_path / "tf_model.h5").exists() or not ( self.git_path / "pytorch_model.bin").exists(): d = TemporaryDirectory() if self.task in self.QA_TASKS: model = QASparseXP.compile_model(src_path, dest_path=d.name) elif self.task in self.GLUE_TASKS: model = GlueSparseXP.compile_model(src_path, dest_path=d.name) elif self.task in self.SUMMARIZATION_TASKS: model = SummarizationSparseXP.compile_model( src_path, dest_path=d.name) else: raise Exception(f"Unknown task {self.task}") model = optimize_model(model, "heads") model.save_pretrained(d.name) src_path = d.name if force or not (self.git_path / "tf_model.h5").exists(): with TemporaryDirectory() as d2: if self.task in self.QA_TASKS: QASparseXP.final_fine_tune_bertarize( src_path, d2, remove_head_pruning=True) tf_model = TFAutoModelForQuestionAnswering.from_pretrained( d2, from_pt=True) elif self.task in self.GLUE_TASKS: GlueSparseXP.final_fine_tune_bertarize( src_path, d2, remove_head_pruning=True) tf_model = TFAutoModelForSequenceClassification.from_pretrained( d2, from_pt=True) elif self.task in self.SUMMARIZATION_TASKS: SummarizationSparseXP.final_fine_tune_bertarize( src_path, d2, remove_head_pruning=True) tf_model = TFAutoModelForSeq2SeqLM.from_pretrained( d2, from_pt=True) else: raise Exception(f"Unknown task {self.task}") tf_model.save_pretrained(self.git_path) modified = True if force or not (self.git_path / "pytorch_model.bin").exists(): if self.task in self.QA_TASKS: model = AutoModelForQuestionAnswering.from_pretrained( src_path) elif self.task in self.GLUE_TASKS: model = AutoModelForSequenceClassification.from_pretrained( src_path) elif self.task in self.SUMMARIZATION_TASKS: model = AutoModelForSeq2SeqLM.from_pretrained(src_path) else: raise Exception(f"Unknown task {self.task}") model.save_pretrained(self.git_path) modified = True src_path = Path(src_path) to_copy = self.get_copy_list() for files, dest in to_copy: dest.mkdir(exist_ok=True) for file in files: if force or not (dest / file).exists(): shutil.copyfile(str(src_path / file), str(dest / file)) modified = True finally: if d is not None: d.cleanup() # Reload the config, this may have been changed by compilation / optimization (pruned_heads, gelu_patch, layer_norm_patch) with (self.git_path / "config.json").open() as f: self.checkpoint_info["config"] = json.load(f) return modified
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True) small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select( range(1000)) small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select( range(1000)) full_train_dataset = tokenized_datasets["train"] full_eval_dataset = tokenized_datasets["test"] import tensorflow as tf from transformers import TFAutoModelForSequenceClassification from datetime import datetime model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2) tf_train_dataset = small_train_dataset.remove_columns( ["text"]).with_format("tensorflow") tf_eval_dataset = small_eval_dataset.remove_columns( ["text"]).with_format("tensorflow") train_features = { x: tf_train_dataset[x].to_tensor() for x in tokenizer.model_input_names } train_tf_dataset = tf.data.Dataset.from_tensor_slices( (train_features, tf_train_dataset["label"])) train_tf_dataset = train_tf_dataset.shuffle(len(tf_train_dataset)).batch(8) eval_features = {
app = Flask(__name__) CORS(app) load_dotenv() """ Load all the models and such """ default_sentiment = pipeline("sentiment-analysis") bart_summarization = pipeline("summarization") t5_summarization = pipeline("summarization", model="t5-small", tokenizer="t5-small", framework="tf") distilbert_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', use_fast=True) distilbert_regression_model = TFAutoModelForSequenceClassification.from_pretrained( "models/distilbert/regression") # make a directory to download lightfm models to if not os.path.exists('lightfm'): os.makedirs('lightfm') # get lightFM model and dataset objects lightFM_model = get_lightfm_model() lightFM_dataset = get_lightfm_dataset() """ End of loading models and such """ """ Define prediction and helper function """ class Summarization: def __init__(self, model): self.model = model
def __init__(self): self.config = AutoConfig.from_pretrained(CZERT_MODEL_PATH, num_labels=1) self.tokenizer = BertTokenizerFast(os.path.join(CZERT_MODEL_PATH, "vocab.txt"), strip_accents=False, do_lower_case=False) self.model = TFAutoModelForSequenceClassification.from_pretrained(CZERT_MODEL_PATH, config=self.config)
def __call__(self, text_inputs): raw_outputs = self.model(text_inputs) outputs = [] for output in raw_outputs: score = output["score"] if output["label"] == "POSITIVE": outputs.append([1 - score, score]) else: outputs.append([score, 1 - score]) return np.array(outputs) # Create the model: a French sentiment analysis model. # see https://github.com/TheophileBlard/french-sentiment-analysis-with-bert model = TFAutoModelForSequenceClassification.from_pretrained( "tblard/tf-allocine") tokenizer = AutoTokenizer.from_pretrained("tblard/tf-allocine") pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer) model_wrapper = HuggingFaceSentimentAnalysisPipelineWrapper(pipeline) # Create the recipe: PWWS uses a WordNet transformation. recipe = PWWSRen2019.build(model_wrapper) # WordNet defaults to english. Set the default language to French ('fra') # # See # "Building a free French wordnet from multilingual resources", # E. L. R. A. (ELRA) (ed.), # Proceedings of the Sixth International Language Resources and Evaluation (LREC’08). recipe.transformation.language = "fra"
def _load_local_model(self, path): self._model = TFAutoModelForSequenceClassification.from_pretrained( path) self._tokenizer = AutoTokenizer.from_pretrained(path)
def main(): # region Argument parsing # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TFTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_text_classification", model_args, data_args, framework="tensorflow") output_dir = Path(training_args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) # endregion # region Checkpoints # Detecting last checkpoint. checkpoint = None if len(os.listdir(training_args.output_dir) ) > 0 and not training_args.overwrite_output_dir: if (output_dir / CONFIG_NAME).is_file() and ( output_dir / TF2_WEIGHTS_NAME).is_file(): checkpoint = output_dir logger.info( f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this" " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) else: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to continue regardless.") # endregion # region Logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO) logger.info(f"Training/evaluation parameters {training_args}") # endregion # region Loading data # For CSV/JSON files, this script will use the 'label' field as the label and the 'sentence1' and optionally # 'sentence2' fields as inputs if they exist. If not, the first two fields not named label are used if at least two # columns are provided. Note that the term 'sentence' can be slightly misleading, as they often contain more than # a single grammatical sentence, when the task requires it. # # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this # single column. You can easily tweak this behavior (see below) # # In distributed training, the load_dataset function guarantee that only one local process can concurrently # download the dataset. data_files = { "train": data_args.train_file, "validation": data_args.validation_file, "test": data_args.test_file } data_files = { key: file for key, file in data_files.items() if file is not None } for key in data_files.keys(): logger.info(f"Loading a local file for {key}: {data_files[key]}") if data_args.input_file_extension == "csv": # Loading a dataset from local csv files datasets = load_dataset( "csv", data_files=data_files, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) else: # Loading a dataset from local json files datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. # endregion # region Label preprocessing # If you've passed us a training set, we try to infer your labels from it if "train" in datasets: # By default we assume that if your label column looks like a float then you're doing regression, # and if not then you're doing classification. This is something you may want to change! is_regression = datasets["train"].features["label"].dtype in [ "float32", "float64" ] if is_regression: num_labels = 1 else: # A useful fast method: # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique label_list = datasets["train"].unique("label") label_list.sort() # Let's sort it for determinism num_labels = len(label_list) # If you haven't passed a training set, we read label info from the saved model (this happens later) else: num_labels = None label_list = None is_regression = None # endregion # region Load model config and tokenizer if checkpoint is not None: config_path = training_args.output_dir elif model_args.config_name: config_path = model_args.config_name else: config_path = model_args.model_name_or_path if num_labels is not None: config = AutoConfig.from_pretrained( config_path, num_labels=num_labels, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) else: config = AutoConfig.from_pretrained( config_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # endregion # region Dataset preprocessing # Again, we try to have some nice defaults but don't hesitate to tweak to your use case. column_names = { col for cols in datasets.column_names.values() for col in cols } non_label_column_names = [name for name in column_names if name != "label"] if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names: sentence1_key, sentence2_key = "sentence1", "sentence2" elif "sentence1" in non_label_column_names: sentence1_key, sentence2_key = "sentence1", None else: if len(non_label_column_names) >= 2: sentence1_key, sentence2_key = non_label_column_names[:2] else: sentence1_key, sentence2_key = non_label_column_names[0], None if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) # Ensure that our labels match the model's, if it has some pre-specified if "train" in datasets: if not is_regression and config.label2id != PretrainedConfig( num_labels=num_labels).label2id: label_name_to_id = config.label2id if list(sorted(label_name_to_id.keys())) == list( sorted(label_list)): label_to_id = label_name_to_id # Use the model's labels else: logger.warning( "Your model seems to have been trained with labels, but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels:" f" {list(sorted(label_list))}.\nIgnoring the model labels as a result.", ) label_to_id = {v: i for i, v in enumerate(label_list)} elif not is_regression: label_to_id = {v: i for i, v in enumerate(label_list)} else: label_to_id = None # Now we've established our label2id, let's overwrite the model config with it. config.label2id = label_to_id if config.label2id is not None: config.id2label = {id: label for label, id in label_to_id.items()} else: config.id2label = None else: label_to_id = config.label2id # Just load the data from the model if "validation" in datasets and config.label2id is not None: validation_label_list = datasets["validation"].unique("label") for val_label in validation_label_list: assert val_label in label_to_id, f"Label {val_label} is in the validation set but not the training set!" def preprocess_function(examples): # Tokenize the texts args = ((examples[sentence1_key], ) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])) result = tokenizer(*args, max_length=max_seq_length, truncation=True) # Map labels to IDs if config.label2id is not None and "label" in examples: result["label"] = [(config.label2id[l] if l != -1 else -1) for l in examples["label"]] return result datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) # endregion with training_args.strategy.scope(): # region Load pretrained model # Set seed before initializing model set_seed(training_args.seed) # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. if checkpoint is None: model_path = model_args.model_name_or_path else: model_path = checkpoint model = TFAutoModelForSequenceClassification.from_pretrained( model_path, config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # endregion # region Convert data to a tf.data.Dataset dataset_options = tf.data.Options() dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF num_replicas = training_args.strategy.num_replicas_in_sync tf_data = dict() max_samples = { "train": data_args.max_train_samples, "validation": data_args.max_val_samples, "test": data_args.max_test_samples, } for key in ("train", "validation", "test"): if key not in datasets: tf_data[key] = None continue if ((key == "train" and not training_args.do_train) or (key == "validation" and not training_args.do_eval) or (key == "test" and not training_args.do_predict)): tf_data[key] = None continue if key in ("train", "validation"): assert "label" in datasets[ key].features, f"Missing labels from {key} data!" if key == "train": shuffle = True batch_size = training_args.per_device_train_batch_size * num_replicas else: shuffle = False batch_size = training_args.per_device_eval_batch_size * num_replicas samples_limit = max_samples[key] dataset = datasets[key] if samples_limit is not None: dataset = dataset.select(range(samples_limit)) # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names # yourself if you use this method, whereas they are automatically inferred from the model input names when # using model.prepare_tf_dataset() # For more info see the docs: # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset data = model.prepare_tf_dataset( dataset, shuffle=shuffle, batch_size=batch_size, tokenizer=tokenizer, ) data = data.with_options(dataset_options) tf_data[key] = data # endregion # region Optimizer, loss and compilation if training_args.do_train: num_train_steps = len( tf_data["train"]) * training_args.num_train_epochs if training_args.warmup_steps > 0: num_warmup_steps = training_args.warmup_steps elif training_args.warmup_ratio > 0: num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) else: num_warmup_steps = 0 optimizer, schedule = create_optimizer( init_lr=training_args.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, adam_beta1=training_args.adam_beta1, adam_beta2=training_args.adam_beta2, adam_epsilon=training_args.adam_epsilon, weight_decay_rate=training_args.weight_decay, adam_global_clipnorm=training_args.max_grad_norm, ) else: optimizer = None if is_regression: metrics = [] else: metrics = ["accuracy"] model.compile(optimizer=optimizer, metrics=metrics) # endregion # region Preparing push_to_hub and model card push_to_hub_model_id = training_args.push_to_hub_model_id model_name = model_args.model_name_or_path.split("/")[-1] if not push_to_hub_model_id: push_to_hub_model_id = f"{model_name}-finetuned-text-classification" model_card_kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "text-classification" } if training_args.push_to_hub: callbacks = [ PushToHubCallback( output_dir=training_args.output_dir, model_id=push_to_hub_model_id, organization=training_args.push_to_hub_organization, token=training_args.push_to_hub_token, tokenizer=tokenizer, **model_card_kwargs, ) ] else: callbacks = [] # endregion # region Training and validation if tf_data["train"] is not None: model.fit( tf_data["train"], validation_data=tf_data["validation"], epochs=int(training_args.num_train_epochs), callbacks=callbacks, ) if tf_data["validation"] is not None: logger.info("Computing metrics on validation data...") if is_regression: loss = model.evaluate(tf_data["validation"]) logger.info(f"Eval loss: {loss:.5f}") else: loss, accuracy = model.evaluate(tf_data["validation"]) logger.info( f"Eval loss: {loss:.5f}, Eval accuracy: {accuracy * 100:.4f}%" ) if training_args.output_dir is not None: output_eval_file = os.path.join(training_args.output_dir, "all_results.json") eval_dict = {"eval_loss": loss} if not is_regression: eval_dict["eval_accuracy"] = accuracy with open(output_eval_file, "w") as writer: writer.write(json.dumps(eval_dict)) # endregion # region Prediction if tf_data["test"] is not None: logger.info("Doing predictions on test dataset...") predictions = model.predict(tf_data["test"])["logits"] predicted_class = np.squeeze( predictions) if is_regression else np.argmax(predictions, axis=1) output_test_file = os.path.join(training_args.output_dir, "test_results.txt") with open(output_test_file, "w") as writer: writer.write("index\tprediction\n") for index, item in enumerate(predicted_class): if is_regression: writer.write(f"{index}\t{item:3.3f}\n") else: item = config.id2label[item] writer.write(f"{index}\t{item}\n") logger.info(f"Wrote predictions to {output_test_file}!") # endregion if training_args.output_dir is not None and not training_args.push_to_hub: # If we're not pushing to hub, at least save a local copy when we're done model.save_pretrained(training_args.output_dir)
def main(): # region Argument parsing # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TFTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The # information sent is the one passed as arguments along with your Python/PyTorch versions. send_example_telemetry("run_glue", model_args, data_args, framework="tensorflow") if not (training_args.do_train or training_args.do_eval or training_args.do_predict): exit( "Must specify at least one of --do_train, --do_eval or --do_predict!" ) # endregion # region Checkpoints checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: checkpoint = get_last_checkpoint(training_args.output_dir) if checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # endregion # region Logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info(f"Training/evaluation parameters {training_args}") # endregion # region Dataset and labels # Set seed before initializing model. set_seed(training_args.seed) # Downloading and loading a dataset from the hub. In distributed training, the load_dataset function guarantee # that only one local process can concurrently download the dataset. datasets = load_dataset( "glue", data_args.task_name, cache_dir=model_args.cache_dir, use_auth_token=True if model_args.use_auth_token else None, ) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. is_regression = data_args.task_name == "stsb" if not is_regression: label_list = datasets["train"].features["label"].names num_labels = len(label_list) else: num_labels = 1 if data_args.predict_file is not None: logger.info("Preparing user-supplied file for predictions...") data_files = {"data": data_args.predict_file} for key in data_files.keys(): logger.info(f"Loading a local file for {key}: {data_files[key]}") if data_args.predict_file.endswith(".csv"): # Loading a dataset from local csv files user_dataset = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir) else: # Loading a dataset from local json files user_dataset = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) needed_keys = task_to_keys[data_args.task_name] for key in needed_keys: assert key in user_dataset[ "data"].features, f"Your supplied predict_file is missing the {key} key!" datasets["user_data"] = user_dataset["data"] # endregion # region Load model config and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # endregion # region Dataset preprocessing sentence1_key, sentence2_key = task_to_keys[data_args.task_name] # Padding strategy if data_args.pad_to_max_length: padding = "max_length" else: # We will pad later, dynamically at batch creation, to the max sequence length in each batch padding = False # Some models have set the order of the labels to use, so let's make sure we do use it. label_to_id = None if config.label2id != PretrainedConfig( num_labels=num_labels).label2id and not is_regression: # Some have all caps in their config, some don't. label_name_to_id = {k.lower(): v for k, v in config.label2id.items()} if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): label_to_id = { i: int(label_name_to_id[label_list[i]]) for i in range(num_labels) } else: logger.warning( "Your model seems to have been trained with labels, but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." "\nIgnoring the model labels as a result.", ) label_to_id = {label: i for i, label in enumerate(label_list)} if label_to_id is not None: config.label2id = label_to_id config.id2label = {id: label for label, id in config.label2id.items()} elif data_args.task_name is not None and not is_regression: config.label2id = {l: i for i, l in enumerate(label_list)} config.id2label = {id: label for label, id in config.label2id.items()} if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) def preprocess_function(examples): # Tokenize the texts args = ((examples[sentence1_key], ) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])) result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) return result datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) if data_args.pad_to_max_length: data_collator = DefaultDataCollator(return_tensors="tf") else: data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf") # endregion # region Metric function metric = evaluate.load("glue", data_args.task_name) def compute_metrics(preds, label_ids): preds = preds["logits"] preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) result = metric.compute(predictions=preds, references=label_ids) if len(result) > 1: result["combined_score"] = np.mean(list(result.values())).item() return result # endregion with training_args.strategy.scope(): # region Load pretrained model if checkpoint is None: model_path = model_args.model_name_or_path else: model_path = checkpoint model = TFAutoModelForSequenceClassification.from_pretrained( model_path, config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # endregion # region Convert data to a tf.data.Dataset dataset_options = tf.data.Options() dataset_options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF num_replicas = training_args.strategy.num_replicas_in_sync tf_data = dict() max_samples = { "train": data_args.max_train_samples, "validation": data_args.max_eval_samples, "validation_matched": data_args.max_eval_samples, "validation_mismatched": data_args.max_eval_samples, "test": data_args.max_predict_samples, "test_matched": data_args.max_predict_samples, "test_mismatched": data_args.max_predict_samples, "user_data": None, } for key in datasets.keys(): if key == "train" or key.startswith("validation"): assert "label" in datasets[ key].features, f"Missing labels from {key} data!" if key == "train": shuffle = True batch_size = training_args.per_device_train_batch_size * num_replicas else: shuffle = False batch_size = training_args.per_device_eval_batch_size * num_replicas samples_limit = max_samples[key] dataset = datasets[key] if samples_limit is not None: dataset = dataset.select(range(samples_limit)) # model.prepare_tf_dataset() wraps a Hugging Face dataset in a tf.data.Dataset which is ready to use in # training. This is the recommended way to use a Hugging Face dataset when training with Keras. You can also # use the lower-level dataset.to_tf_dataset() method, but you will have to specify things like column names # yourself if you use this method, whereas they are automatically inferred from the model input names when # using model.prepare_tf_dataset() # For more info see the docs: # https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.TFPreTrainedModel.prepare_tf_dataset # https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.to_tf_dataset data = model.prepare_tf_dataset( dataset, shuffle=shuffle, batch_size=batch_size, collate_fn=data_collator, tokenizer=tokenizer, ) data = data.with_options(dataset_options) tf_data[key] = data # endregion # region Optimizer, loss and compilation if training_args.do_train: num_train_steps = len( tf_data["train"]) * training_args.num_train_epochs if training_args.warmup_steps > 0: num_warmup_steps = training_args.warmup_steps elif training_args.warmup_ratio > 0: num_warmup_steps = int(num_train_steps * training_args.warmup_ratio) else: num_warmup_steps = 0 optimizer, schedule = create_optimizer( init_lr=training_args.learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, adam_beta1=training_args.adam_beta1, adam_beta2=training_args.adam_beta2, adam_epsilon=training_args.adam_epsilon, weight_decay_rate=training_args.weight_decay, adam_global_clipnorm=training_args.max_grad_norm, ) else: optimizer = "adam" # Just write anything because we won't be using it if is_regression: metrics = [] else: metrics = ["accuracy"] model.compile(optimizer=optimizer, metrics=metrics, jit_compile=training_args.xla) # endregion # region Preparing push_to_hub and model card push_to_hub_model_id = training_args.push_to_hub_model_id model_name = model_args.model_name_or_path.split("/")[-1] if not push_to_hub_model_id: push_to_hub_model_id = f"{model_name}-finetuned-glue" model_card_kwargs = { "finetuned_from": model_args.model_name_or_path, "tasks": "text-classification" } model_card_kwargs["task_name"] = data_args.task_name if training_args.push_to_hub: callbacks = [ PushToHubCallback( output_dir=training_args.output_dir, model_id=push_to_hub_model_id, organization=training_args.push_to_hub_organization, token=training_args.push_to_hub_token, tokenizer=tokenizer, **model_card_kwargs, ) ] else: callbacks = [] # endregion # region Training and validation if training_args.do_train: if training_args.do_eval and not data_args.task_name == "mnli": # Do both evaluation and training in the Keras fit loop, unless the task is MNLI # because MNLI has two validation sets validation_data = tf_data["validation"] else: validation_data = None model.fit( tf_data["train"], validation_data=validation_data, epochs=int(training_args.num_train_epochs), callbacks=callbacks, ) # endregion # region Evaluation if training_args.do_eval: # We normally do validation as part of the Keras fit loop, but we run it independently # if there was no fit() step (because we didn't train the model) or if the task is MNLI, # because MNLI has a separate validation-mismatched validation set # In this example, we compute advanced metrics only at the end of training, and only compute # loss and accuracy on the validation set each epoch, but # if you'd like to compute metrics every epoch that are too complex to be written as # standard Keras metrics, you can use our KerasMetricCallback. See # https://huggingface.co/docs/transformers/main/en/main_classes/keras_callbacks logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) if data_args.task_name == "mnli": tasks = ["mnli", "mnli-mm"] tf_datasets = [ tf_data["validation_matched"], tf_data["validation_mismatched"] ] raw_datasets = [ datasets["validation_matched"], datasets["validation_mismatched"] ] else: tasks = [data_args.task_name] tf_datasets = [tf_data["validation"]] raw_datasets = [datasets["validation"]] for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets, tasks): eval_predictions = model.predict(tf_dataset) eval_metrics = compute_metrics(eval_predictions, raw_dataset["label"]) print(f"Evaluation metrics ({task}):") print(eval_metrics) if training_args.output_dir is not None: output_eval_file = os.path.join(training_args.output_dir, "all_results.json") with open(output_eval_file, "w") as writer: writer.write(json.dumps(eval_metrics)) # endregion # region Prediction if training_args.do_predict or data_args.predict_file: logger.info("*** Predict ***") # Loop to handle MNLI double evaluation (matched, mis-matched) tasks = [] tf_datasets = [] raw_datasets = [] if training_args.do_predict: if data_args.task_name == "mnli": tasks.extend(["mnli", "mnli-mm"]) tf_datasets.extend( [tf_data["test_matched"], tf_data["test_mismatched"]]) raw_datasets.extend([ datasets["test_matched"], datasets["test_mismatched"] ]) else: tasks.append(data_args.task_name) tf_datasets.append(tf_data["test"]) raw_datasets.append(datasets["test"]) if data_args.predict_file: tasks.append("user_data") tf_datasets.append(tf_data["user_data"]) raw_datasets.append(datasets["user_data"]) for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets, tasks): test_predictions = model.predict(tf_dataset) if "label" in raw_dataset: test_metrics = compute_metrics(test_predictions, raw_dataset["label"]) print(f"Test metrics ({task}):") print(test_metrics) if is_regression: predictions_to_write = np.squeeze( test_predictions["logits"]) else: predictions_to_write = np.argmax( test_predictions["logits"], axis=1) output_predict_file = os.path.join( training_args.output_dir, f"predict_results_{task}.txt") with open(output_predict_file, "w") as writer: logger.info( f"***** Writing prediction results for {task} *****") writer.write("index\tprediction\n") for index, item in enumerate(predictions_to_write): if is_regression: writer.write(f"{index}\t{item:3.3f}\n") else: item = model.config.id2label[item] writer.write(f"{index}\t{item}\n") # endregion if training_args.output_dir is not None and not training_args.push_to_hub: # If we're not pushing to hub, at least save a local copy when we're done model.save_pretrained(training_args.output_dir)
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification from transformers import pipeline import os tokenizer = AutoTokenizer.from_pretrained("tblard/tf-allocine") path = '/model/tf_model.h5' if os.path.exists(path): model = TFAutoModelForSequenceClassification.from_pretrained( './model/', local_files_only=True) else: model = TFAutoModelForSequenceClassification.from_pretrained( "tblard/tf-allocine") model.save_pretrained('./model/') nlp = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
x = Dropout(0.5)(x) x = Dense(32, activation=gelu)(cls_token) x = BatchNormalization()(x) x = Dropout(0.3)(x) out = Dense(1, activation='sigmoid')(x) model = Model(inputs=input_word_ids, outputs=out) model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy',AUC(name='auc')]) return model %%time from transformers import TFAutoModel from transformers import TFAutoModelForSequenceClassification with strategy.scope(): transformer_layer = TFAutoModel.from_pretrained('jplu/tf-xlm-roberta-base') transformer_layer = TFAutoModelForSequenceClassification.from_pretrained('jplu/tf-xlm-roberta-base') model = build_model(transformer_layer, max_len=MAX_LEN) model.summary() # def build_model_working(transformer, max_len): # """ # from transformers import TFAutoModelForSequenceClassification # transformer = TFAutoModelForSequenceClassification.from_pretrained('jplu/tf-xlm-roberta-base') # """ # input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids") # sequence_output = transformer(input_word_ids)[0] # out = Dense(1, activation='sigmoid')(sequence_output) # model = Model(inputs=input_word_ids, outputs=out) # model.compile(Adam(lr=1e-3), loss='binary_crossentropy', metrics=['accuracy',AUC(name='auc')])
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TFTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info( "n_replicas: %s, distributed training: %s, 16-bits training: %s", training_args.n_replicas, bool(training_args.n_replicas > 1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) train_dataset, eval_dataset, test_ds, label2id = get_tfds( train_file=data_args.train_file, eval_file=data_args.dev_file, test_file=data_args.test_file, tokenizer=tokenizer, label_column_id=data_args.label_column_id, max_seq_length=data_args.max_seq_length, ) config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=len(label2id), label2id=label2id, id2label={id: label for label, id in label2id.items()}, finetuning_task="text-classification", cache_dir=model_args.cache_dir, ) with training_args.strategy.scope(): model = TFAutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_pt=bool(".bin" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) def compute_metrics(p: EvalPrediction) -> Dict: preds = np.argmax(p.predictions, axis=1) return {"acc": (preds == p.label_ids).mean()} # Initialize our Trainer trainer = TFTrainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics, ) # Training if training_args.do_train: trainer.train() trainer.save_model() tokenizer.save_pretrained(training_args.output_dir) # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") result = trainer.evaluate() output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key, value in result.items(): logger.info(" %s = %s", key, value) writer.write("%s = %s\n" % (key, value)) results.update(result) return results
def main(): # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, GlueDataTrainingArguments, TFTrainingArguments)) model_args, data_args, training_args = parser.parse_args_into_dataclasses() if (os.path.exists(training_args.output_dir) and os.listdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir): raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome." ) # Setup logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, ) logger.info( "n_gpu: %s, distributed training: %s, 16-bits training: %s", training_args.n_gpu, bool(training_args.n_gpu > 1), training_args.fp16, ) logger.info("Training/evaluation parameters %s", training_args) try: num_labels = glue_tasks_num_labels["mnli" if data_args.task_name == "mnli-mm" else data_args.task_name] output_mode = glue_output_modes[data_args.task_name] except KeyError: raise ValueError("Task not found: %s" % (data_args.task_name)) # Load pretrained model and tokenizer # # Distributed training: # The .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, ) with training_args.strategy.scope(): model = TFAutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, from_pt=bool(".bin" in model_args.model_name_or_path), config=config, cache_dir=model_args.cache_dir, ) # Get datasets train_dataset = (get_tfds(task_name=data_args.task_name, tokenizer=tokenizer, max_seq_length=data_args.max_seq_length) if training_args.do_train else None) eval_dataset = (get_tfds(task_name=data_args.task_name, tokenizer=tokenizer, max_seq_length=data_args.max_seq_length, mode=Split.dev) if training_args.do_eval else None) def compute_metrics(p: EvalPrediction) -> Dict: if output_mode == "classification": preds = np.argmax(p.predictions, axis=1) elif output_mode == "regression": preds = np.squeeze(p.predictions) return glue_compute_metrics(data_args.task_name, preds, p.label_ids) # Initialize our Trainer trainer = TFTrainer( model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset, compute_metrics=compute_metrics, ) # Training if training_args.do_train: trainer.train() trainer.save_model() tokenizer.save_pretrained(training_args.output_dir) # Evaluation results = {} if training_args.do_eval: logger.info("*** Evaluate ***") result = trainer.evaluate() output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key, value in result.items(): logger.info(" %s = %s", key, value) writer.write("%s = %s\n" % (key, value)) results.update(result) return results
"""helper functions for the flask API.""" import numpy as np from transformers import AutoTokenizer, TFAutoModelForSequenceClassification tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased', use_fast=True) model = TFAutoModelForSequenceClassification.from_pretrained("spentaur/yelp") def infer(review: str) -> float: """returns raw model sentiment prediction for a review.""" return model(tokenizer.encode(review, return_tensors='tf'))[0].numpy()[0][0] def squish(x: float) -> float: """converts model inferences to value between zero and one.""" return np.clip(0.25 * x + 0.5, 0, 1)
def _load_remote_model(self, model_name, tokenizer_kwargs, model_kwargs): do_lower_case = False if 'uncased' in model_name.lower(): do_lower_case = True tokenizer_kwargs.update({'do_lower_case': do_lower_case}) self._tokenizer = None self._model = None self._config = None self._tokenizer = AutoTokenizer.from_pretrained( model_name, **tokenizer_kwargs) self._config = AutoConfig.from_pretrained(model_name) temporary_path = self._get_temporary_path() make_dir(temporary_path) # TensorFlow model try: self._model = TFAutoModelForSequenceClassification.from_pretrained( model_name, from_pt=False) # PyTorch model except TypeError: try: self._model = TFAutoModelForSequenceClassification.from_pretrained( model_name, from_pt=True) # Loading a TF model from a PyTorch checkpoint is not supported when using a model identifier name except OSError: model = AutoModel.from_pretrained(model_name) model.save_pretrained(temporary_path) self._model = TFAutoModelForSequenceClassification.from_pretrained( temporary_path, from_pt=True) # Clean the model's last layer if the provided properties are different clean_last_layer = False for key, value in model_kwargs.items(): if not hasattr(self._model.config, key): clean_last_layer = True break if getattr(self._model.config, key) != value: clean_last_layer = True break if clean_last_layer: model_family = self._get_model_family() try: getattr( self._model, self._get_model_family()).save_pretrained(temporary_path) self._model = self._model.__class__.from_pretrained( temporary_path, from_pt=False, **model_kwargs) # The model is itself the main layer except AttributeError: # TensorFlow model try: self._model = self._model.__class__.from_pretrained( model_name, from_pt=False, **model_kwargs) # PyTorch Model except (OSError, TypeError): model = AutoModel.from_pretrained(model_name) model.save_pretrained(temporary_path) self._model = self._model.__class__.from_pretrained( temporary_path, from_pt=True, **model_kwargs) remove_dir(temporary_path) assert self._tokenizer and self._model
def main(): # region Argument parsing # See all possible arguments in src/transformers/training_args.py # or by passing the --help flag to this script. # We now keep distinct sets of args, for a cleaner separation of concerns. parser = HfArgumentParser( (ModelArguments, DataTrainingArguments, TFTrainingArguments)) if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): # If we pass only one argument to the script and it's the path to a json file, # let's parse it to get our arguments. model_args, data_args, training_args = parser.parse_json_file( json_file=os.path.abspath(sys.argv[1])) else: model_args, data_args, training_args = parser.parse_args_into_dataclasses( ) if not (training_args.do_train or training_args.do_eval or training_args.do_predict): exit( "Must specify at least one of --do_train, --do_eval or --do_predict!" ) # endregion # region Checkpoints checkpoint = None if os.path.isdir( training_args.output_dir ) and training_args.do_train and not training_args.overwrite_output_dir: checkpoint = get_last_checkpoint(training_args.output_dir) if checkpoint is None and len(os.listdir( training_args.output_dir)) > 0: raise ValueError( f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome.") elif checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." ) # endregion # region Logging logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", handlers=[logging.StreamHandler(sys.stdout)], ) logger.setLevel(logging.INFO if is_main_process(training_args.local_rank ) else logging.WARN) # Set the verbosity to info of the Transformers logger (on main process only): if is_main_process(training_args.local_rank): transformers.utils.logging.set_verbosity_info() transformers.utils.logging.enable_default_handler() transformers.utils.logging.enable_explicit_format() logger.info(f"Training/evaluation parameters {training_args}") # endregion # region Dataset and labels # Set seed before initializing model. set_seed(training_args.seed) # Downloading and loading a dataset from the hub. In distributed training, the load_dataset function guarantee # that only one local process can concurrently download the dataset. datasets = load_dataset("glue", data_args.task_name, cache_dir=model_args.cache_dir) # See more about loading any type of standard or custom dataset at # https://huggingface.co/docs/datasets/loading_datasets.html. is_regression = data_args.task_name == "stsb" if not is_regression: label_list = datasets["train"].features["label"].names num_labels = len(label_list) else: num_labels = 1 if data_args.predict_file is not None: logger.info("Preparing user-supplied file for predictions...") data_files = {"data": data_args.predict_file} for key in data_files.keys(): logger.info(f"Loading a local file for {key}: {data_files[key]}") if data_args.predict_file.endswith(".csv"): # Loading a dataset from local csv files user_dataset = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir) else: # Loading a dataset from local json files user_dataset = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir) needed_keys = task_to_keys[data_args.task_name] for key in needed_keys: assert key in user_dataset[ "data"].features, f"Your supplied predict_file is missing the {key} key!" datasets["user_data"] = user_dataset["data"] # endregion # region Load model config and tokenizer # # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently # download model & vocab. config = AutoConfig.from_pretrained( model_args.config_name if model_args.config_name else model_args.model_name_or_path, num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # endregion # region Dataset preprocessing sentence1_key, sentence2_key = task_to_keys[data_args.task_name] non_label_column_names = [ name for name in datasets["train"].column_names if name != "label" ] # Padding strategy if data_args.pad_to_max_length: padding = "max_length" else: # We will pad later, dynamically at batch creation, to the max sequence length in each batch padding = False # Some models have set the order of the labels to use, so let's make sure we do use it. label_to_id = None if config.label2id != PretrainedConfig( num_labels=num_labels).label2id and not is_regression: # Some have all caps in their config, some don't. label_name_to_id = {k.lower(): v for k, v in config.label2id.items()} if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)): label_to_id = { i: int(label_name_to_id[label_list[i]]) for i in range(num_labels) } else: logger.warning( "Your model seems to have been trained with labels, but they don't match the dataset: ", f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}." "\nIgnoring the model labels as a result.", ) label_to_id = {label: i for i, label in enumerate(label_list)} if label_to_id is not None: config.label2id = label_to_id config.id2label = {id: label for label, id in config.label2id.items()} elif data_args.task_name is not None and not is_regression: config.label2id = {l: i for i, l in enumerate(label_list)} config.id2label = {id: label for label, id in config.label2id.items()} if data_args.max_seq_length > tokenizer.model_max_length: logger.warning( f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the" f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}." ) max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length) def preprocess_function(examples): # Tokenize the texts args = ((examples[sentence1_key], ) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])) result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True) return result datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache) # endregion # region Metric function metric = load_metric("glue", data_args.task_name) def compute_metrics(preds, label_ids): preds = preds["logits"] preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1) result = metric.compute(predictions=preds, references=label_ids) if len(result) > 1: result["combined_score"] = np.mean(list(result.values())).item() return result # endregion with training_args.strategy.scope(): # region Load pretrained model if checkpoint is None: model_path = model_args.model_name_or_path else: model_path = checkpoint model = TFAutoModelForSequenceClassification.from_pretrained( model_path, config=config, cache_dir=model_args.cache_dir, revision=model_args.model_revision, use_auth_token=True if model_args.use_auth_token else None, ) # endregion # region Optimizer, loss and compilation optimizer = tf.keras.optimizers.Adam( learning_rate=training_args.learning_rate, beta_1=training_args.adam_beta1, beta_2=training_args.adam_beta2, epsilon=training_args.adam_epsilon, clipnorm=training_args.max_grad_norm, ) if is_regression: loss_fn = tf.keras.losses.MeanSquaredError() metrics = [] else: loss_fn = tf.keras.losses.SparseCategoricalCrossentropy( from_logits=True) metrics = ["accuracy"] model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics) # endregion # region Convert data to a tf.data.Dataset tf_data = dict() if isinstance( training_args.strategy, tf.distribute.TPUStrategy) or data_args.pad_to_max_length: logger.info( "Padding all batches to max length because argument was set or we're on TPU." ) dataset_mode = "constant_batch" else: dataset_mode = "variable_batch" max_samples = { "train": data_args.max_train_samples, "validation": data_args.max_eval_samples, "validation_matched": data_args.max_eval_samples, "validation_mismatched": data_args.max_eval_samples, "test": data_args.max_predict_samples, "test_matched": data_args.max_predict_samples, "test_mismatched": data_args.max_predict_samples, "user_data": None, } for key in datasets.keys(): if key == "train" or key.startswith("validation"): assert "label" in datasets[ key].features, f"Missing labels from {key} data!" if key == "train": shuffle = True batch_size = training_args.per_device_train_batch_size drop_remainder = True # Saves us worrying about scaling gradients for the last batch else: shuffle = False batch_size = training_args.per_device_eval_batch_size drop_remainder = False samples_limit = max_samples[key] dataset = datasets[key] if samples_limit is not None: dataset = dataset.select(range(samples_limit)) data = convert_dataset_for_tensorflow( dataset, non_label_column_names, batch_size=batch_size, dataset_mode=dataset_mode, drop_remainder=drop_remainder, shuffle=shuffle, ) tf_data[key] = data # endregion # region Training and validation if training_args.do_train: callbacks = [ SavePretrainedCallback(output_dir=training_args.output_dir) ] if training_args.do_eval and not data_args.task_name == "mnli": # Do both evaluation and training in the Keras fit loop, unless the task is MNLI # because MNLI has two validation sets validation_data = tf_data["validation"] else: validation_data = None model.fit( tf_data["train"], validation_data=validation_data, epochs=int(training_args.num_train_epochs), callbacks=callbacks, ) # endregion # region Evaluation if training_args.do_eval: # We normally do validation as part of the Keras fit loop, but we run it independently # if there was no fit() step (because we didn't train the model) or if the task is MNLI, # because MNLI has a separate validation-mismatched validation set logger.info("*** Evaluate ***") # Loop to handle MNLI double evaluation (matched, mis-matched) if data_args.task_name == "mnli": tasks = ["mnli", "mnli-mm"] tf_datasets = [ tf_data["validation_matched"], tf_data["validation_mismatched"] ] raw_datasets = [ datasets["validation_matched"], datasets["validation_mismatched"] ] else: tasks = [data_args.task_name] tf_datasets = [tf_data["validation"]] raw_datasets = [datasets["validation"]] for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets, tasks): eval_predictions = model.predict(tf_dataset) eval_metrics = compute_metrics(eval_predictions, raw_dataset["label"]) print(f"Evaluation metrics ({task}):") print(eval_metrics) # endregion # region Prediction if training_args.do_predict or data_args.predict_file: logger.info("*** Predict ***") # Loop to handle MNLI double evaluation (matched, mis-matched) tasks = [] tf_datasets = [] raw_datasets = [] if training_args.do_predict: if data_args.task_name == "mnli": tasks.extend(["mnli", "mnli-mm"]) tf_datasets.extend( [tf_data["test_matched"], tf_data["test_mismatched"]]) raw_datasets.extend([ datasets["test_matched"], datasets["test_mismatched"] ]) else: tasks.append(data_args.task_name) tf_datasets.append(tf_data["test"]) raw_datasets.append(datasets["test"]) if data_args.predict_file: tasks.append("user_data") tf_datasets.append(tf_data["user_data"]) raw_datasets.append(datasets["user_data"]) for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets, tasks): test_predictions = model.predict(tf_dataset) if "label" in raw_dataset: test_metrics = compute_metrics(test_predictions, raw_dataset["label"]) print(f"Test metrics ({task}):") print(test_metrics) if is_regression: predictions_to_write = np.squeeze( test_predictions["logits"]) else: predictions_to_write = np.argmax( test_predictions["logits"], axis=1) output_predict_file = os.path.join( training_args.output_dir, f"predict_results_{task}.txt") with open(output_predict_file, "w") as writer: logger.info( f"***** Writing prediction results for {task} *****") writer.write("index\tprediction\n") for index, item in enumerate(predictions_to_write): if is_regression: writer.write(f"{index}\t{item:3.3f}\n") else: item = model.config.id2label[item] writer.write(f"{index}\t{item}\n")
# -*- coding: utf-8 -*- """ Created on Mon Feb 15 03:40:33 2021 @author: nashe """ import os os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # or any {'0', '1', '2'} from transformers import TextClassificationPipeline, TFAutoModelForSequenceClassification, AutoTokenizer MODEL_DIR = r"D:\Fine-tuned Models\NLP\bert\tf-distilbert-base-uncased\epoch_1_loss_0.39" # Feature extraction pipeline model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_DIR) tokenizer = AutoTokenizer.from_pretrained(r"D:\Models\NLP\bert\tf-distilbert-base-uncased") pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer, framework='tf', device=0) result = pipeline("It was a good watch. But a little boring.")[0]