def get_model(
    base_path: str,
    num_classes: int = 2,
    class_map: Dict[str, str] = {},
    freeze_base: bool = True,
    show_summary: bool = False,
) -> TFAutoModelForSequenceClassification:
    if len(class_map) > 0:
        rev_class_map: Dict[str, str] = {v: k for k, v in class_map.items()}
        model = TFAutoModelForSequenceClassification.from_pretrained(
            base_path,
            num_labels=num_classes,
            id2label=class_map,
            label2id=rev_class_map,
        )
    else:
        model = TFAutoModelForSequenceClassification.from_pretrained(
            base_path, num_labels=num_classes)

    if freeze_base:
        model.layers[0].trainable = False

    optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
    # Set from_logits=True as there is no Softmax/Sigmoid layer added in Huggingface models
    model.compile(
        optimizer,
        loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        if num_classes > 2
        else tf.keras.losses.BinaryCrossentropy(from_logits=True),
    )

    if show_summary:
        model.summary()
    return model
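# A minimal, hypothetical usage sketch for get_model() above. The checkpoint name
# ("distilbert-base-uncased") and the label map are illustrative assumptions, not values
# from the original snippet; tf and Dict are assumed to be imported as in the function.
class_map = {"0": "negative", "1": "positive"}  # id -> label; reversed internally for label2id
clf = get_model(
    "distilbert-base-uncased",
    num_classes=2,
    class_map=class_map,
    freeze_base=True,
    show_summary=False,
)
# clf.fit(train_dataset, validation_data=val_dataset, epochs=3)  # datasets assumed to exist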
async def train(self, sources: Sources):
    self.tokenizer = AutoTokenizer.from_pretrained(
        self.parent.config.tokenizer_name
        if self.parent.config.tokenizer_name
        else self.parent.config.model_name_or_path,
        cache_dir=self.parent.config.cache_dir,
    )
    with self.parent.config.strategy.scope():
        self.model = TFAutoModelForSequenceClassification.from_pretrained(
            self.parent.config.model_name_or_path,
            from_pt=self.parent.config.from_pt,
            config=self.config,
            cache_dir=self.parent.config.cache_dir,
        )
    train_features = await self._preprocess_data(sources)
    train_dataset = await self.example_features_to_dataset(train_features)
    trainer = TFTrainer(
        model=self.model,
        args=self.parent.config,
        train_dataset=train_dataset,
    )
    trainer.train()
    self.logger.info("Saving model to %s", self.parent.config.output_dir)
    trainer.save_model()
    self.tokenizer.save_pretrained(self.parent.config.output_dir)
async def accuracy(self, sources: Sources):
    if not os.path.isfile(
        os.path.join(self.parent.config.output_dir, "tf_model.h5")
    ):
        raise ModelNotTrained("Train model before assessing for accuracy.")
    config = self.parent.config._asdict()
    self.tokenizer = AutoTokenizer.from_pretrained(self.parent.config.output_dir)
    eval_features = await self._preprocess_data(sources)
    eval_dataset = await self.example_features_to_dataset(eval_features)

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds = self.np.argmax(p.predictions, axis=1)
        return classification_compute_metrics(preds, p.label_ids)

    with self.parent.config.strategy.scope():
        self.model = TFAutoModelForSequenceClassification.from_pretrained(
            config["directory"])
    trainer = TFTrainer(
        model=self.model,
        args=self.parent.config,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )
    result = trainer.evaluate()
    return Accuracy(result["eval_acc"])
def __init__(self):
    self.model = TFAutoModelForSequenceClassification.from_pretrained(
        'C:/Users/TwitterCovid-19/ANACONDA_WORKBENCH/SENTIMENT ANALYSIS/CT-BERT_FineTuned'
    )
    self.tokenizer = AutoTokenizer.from_pretrained(
        'digitalepidemiologylab/covid-twitter-bert-v2', add_special_tokens=True)
    print("CT-BERT model and tokenizer initialized")
def __init__(self, model_path="./pretrained_models/sentiment_analysis/", threshold=0.5):
    # If the score of a comment is below this threshold, it is considered neutral
    self.threshold = threshold
    self.tokenizer = AutoTokenizer.from_pretrained(model_path)
    self.model = TFAutoModelForSequenceClassification.from_pretrained(model_path)
    self.nlp = pipeline('sentiment-analysis', model=self.model, tokenizer=self.tokenizer)
def __init__(self, extractor, config, *args, **kwargs):
    super(TFBERTMaxP_Class, self).__init__(*args, **kwargs)
    self.extractor = extractor
    # TODO hidden prob missing below?
    if config["pretrained"] == "electra-base-msmarco":
        self.bert = TFAutoModelForSequenceClassification.from_pretrained(
            "Capreolus/electra-base-msmarco")
        dropout, fc = self.bert.classifier.dropout, self.bert.classifier.out_proj
        self.bert.classifier = TFElectraRelevanceHead(dropout, fc)
    elif config["pretrained"] == "bert-base-msmarco":
        self.bert = TFAutoModelForSequenceClassification.from_pretrained(
            "Capreolus/bert-base-msmarco")
    else:
        self.bert = TFAutoModelForSequenceClassification.from_pretrained(
            config["pretrained"],
            hidden_dropout_prob=config["hidden_dropout_prob"],
        )
    self.config = config
def _load_local_model(self, model_path):
    try:
        self._tokenizer = AutoTokenizer.from_pretrained(model_path + '/tokenizer')
    # Old models didn't use to have a tokenizer folder
    except OSError:
        self._tokenizer = AutoTokenizer.from_pretrained(model_path)

    self._model = TFAutoModelForSequenceClassification.from_pretrained(
        model_path, from_pt=False)
def __init__(self):
    self.tokenizer = AutoTokenizer.from_pretrained("tblard/tf-allocine", use_fast=True)
    self.classifier = TFAutoModelForSequenceClassification.from_pretrained(
        "tblard/tf-allocine")
    self.sentiment_analyzer = pipeline(
        'sentiment-analysis', model=self.classifier, tokenizer=self.tokenizer)
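# A short, hedged usage sketch for the class defined above; "FrenchSentimentClassifier"
# stands in for whatever class owns this __init__, and the example sentence is an assumption.
analyzer = FrenchSentimentClassifier()
result = analyzer.sentiment_analyzer("Ce film est excellent, je le recommande !")
print(result)  # e.g. [{'label': 'POSITIVE', 'score': 0.99}] -- exact score will vary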
def _load_remote_model(self, model_name, tokenizer_kwargs, model_kwargs):
    do_lower_case = False
    if 'uncased' in model_name.lower():
        do_lower_case = True
    tokenizer_kwargs.update({'do_lower_case': do_lower_case})

    self._tokenizer = None
    self._model = None

    self._tokenizer = AutoTokenizer.from_pretrained(model_name, **tokenizer_kwargs)

    temporary_path = self._get_temporary_path()
    make_dir(temporary_path)

    # TensorFlow model
    try:
        self._model = TFAutoModelForSequenceClassification.from_pretrained(
            model_name, from_pt=False)
    # PyTorch model
    except TypeError:
        try:
            self._model = TFAutoModelForSequenceClassification.from_pretrained(
                model_name, from_pt=True)
        # Loading a TF model from a PyTorch checkpoint is not supported when
        # using a model identifier name
        except OSError:
            model = AutoModel.from_pretrained(model_name)
            model.save_pretrained(temporary_path)
            self._model = TFAutoModelForSequenceClassification.from_pretrained(
                temporary_path, from_pt=True)

    # Reset base model if the number of labels does not match
    if self._model.config.num_labels != model_kwargs['num_labels']:
        self._model.config.__dict__.update(model_kwargs)
        getattr(self._model, self._get_model_family()).save_pretrained(temporary_path)
        self._model = TFAutoModelForSequenceClassification.from_pretrained(
            temporary_path, from_pt=False)

    remove_dir(temporary_path)
    assert self._tokenizer and self._model
def test_sequence_classification_model_from_pretrained(self):
    # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
    for model_name in ["bert-base-uncased"]:
        config = AutoConfig.from_pretrained(model_name)
        self.assertIsNotNone(config)
        self.assertIsInstance(config, BertConfig)

        model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
        self.assertIsNotNone(model)
        self.assertIsInstance(model, TFBertForSequenceClassification)
def initialize(self):
    """ Initializes everything, takes a bit of time """
    cache_path = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "storage", "hfcache")

    # Load pretrained model and tokenizer
    model = TFAutoModelForSequenceClassification.from_pretrained(
        "tblard/tf-allocine", cache_dir=cache_path)
    tokenizer = AutoTokenizer.from_pretrained("tblard/tf-allocine", cache_dir=cache_path)

    self.nlp = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
def test_sequence_classification_model_from_pretrained(self):
    logging.basicConfig(level=logging.INFO)
    # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
    for model_name in ["bert-base-uncased"]:
        config = AutoConfig.from_pretrained(model_name)
        self.assertIsNotNone(config)
        self.assertIsInstance(config, BertConfig)

        model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
        self.assertIsNotNone(model)
        self.assertIsInstance(model, TFBertForSequenceClassification)
def get_sentiment_model(
        model_name="distilbert-base-uncased-finetuned-sst-2-english",
        cache_dir="transformers_models/"):
    if not (isinstance(model_name, str) and isinstance(cache_dir, str)):
        raise SentimentModelInputTypeError()
    tokenizer = AutoTokenizer.from_pretrained(
        model_name, cache_dir=cache_dir, local_files_only=False)
    model = TFAutoModelForSequenceClassification.from_pretrained(
        model_name, cache_dir=cache_dir, local_files_only=False)
    return model, tokenizer
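# A possible way to run inference with the pair returned by get_sentiment_model().
# The input sentence is an assumption; older transformers versions return a tuple
# instead of an output object with a .logits attribute, hence the hasattr check.
import tensorflow as tf

model, tokenizer = get_sentiment_model()
inputs = tokenizer("I really enjoyed this movie.", return_tensors="tf")
outputs = model(inputs)
logits = outputs.logits if hasattr(outputs, "logits") else outputs[0]
probs = tf.nn.softmax(logits, axis=-1).numpy()[0]  # index 0 = NEGATIVE, 1 = POSITIVE for this SST-2 head
print({"NEGATIVE": float(probs[0]), "POSITIVE": float(probs[1])})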
async def predict(
    self, sources: SourcesContext
) -> AsyncIterator[Tuple[Record, Any, float]]:
    if not os.path.isfile(
        os.path.join(self.parent.config.output_dir, "tf_model.h5")
    ):
        raise ModelNotTrained("Train model before prediction.")
    self.tokenizer = AutoTokenizer.from_pretrained(self.parent.config.output_dir)
    with self.parent.config.strategy.scope():
        self.model = TFAutoModelForSequenceClassification.from_pretrained(
            self.parent.config.output_dir)
    trainer = TFTrainer(model=self.model, args=self.parent.config,)
    async for record in sources.with_features(self.features):
        to_predict = record.features(self.features)
        eval_example = [
            InputExample(
                0,
                to_predict[self.features[0]],
                None,
                self.parent.config.label_list[0],
            )
        ]
        eval_features = glue_convert_examples_to_features(
            eval_example,
            self.tokenizer,
            self.parent.config.max_seq_length,
            self.parent.config.task_name,
            self.parent.config.label_list,
        )
        eval_dataset = await self.example_features_to_dataset(eval_features)

        all_prob = trainer.predict(eval_dataset).predictions
        max_prob_idx = all_prob.argmax(axis=-1)
        self.logger.debug(
            "Predicted probability of {} for {}: {}".format(
                self.parent.config.predict.name, to_predict, all_prob[0],
            )
        )
        record.predicted(
            self.parent.config.predict.name,
            self.parent.config.label_list[max_prob_idx[0]],
            all_prob[0][max_prob_idx[0]],
        )
        yield record
def Sentiment(text):
    from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
    from transformers import pipeline

    model_name = "distilbert-base-uncased-finetuned-sst-2-english"
    tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    classifier = pipeline('sentiment-analysis', model=tf_model, tokenizer=tokenizer)

    results = classifier(text)
    for result in results:
        print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
    return results
def compile(self, lr, freeze_pretrained):
    loss = 'binary_crossentropy'

    # Wrap up model + Compile with optimizer and loss function
    # self.model = Model(inputs=word_inputs, outputs=[outputs])
    '''self.model = TFBertForSequenceClassification.from_pretrained(Configuration['model']['uri'], num_labels=4271)'''

    # Import the needed model (Bert, Roberta or DistilBert) with output_hidden_states=True
    if "longformer" in Configuration["model"]["uri"]:
        transformer_model = TFLongformerForSequenceClassification.from_pretrained(
            Configuration['model']['uri'], num_labels=4271)
    else:
        transformer_model = TFAutoModelForSequenceClassification.from_pretrained(
            Configuration['model']['uri'], num_labels=4271)

    if freeze_pretrained:
        """[<transformers.modeling_tf_bert.TFBertMainLayer at 0x169a23e10>,
            <tensorflow.python.keras.layers.core.Dropout at 0x169abde90>,
            <tensorflow.python.keras.layers.core.Dense at 0x169ac31d0>]"""
        transformer_model.layers[0].trainable = False

    input_ids = tf.keras.Input(
        shape=(Configuration['sampling']['max_sequence_size'],), dtype='int32')
    attention_mask = tf.keras.Input(
        shape=(Configuration['sampling']['max_sequence_size'],), dtype='int32')

    transformer = transformer_model([input_ids, attention_mask], training=True)
    hidden_states = transformer[0]  # get output_hidden_states

    output = tf.keras.activations.sigmoid(hidden_states)

    self.model = tf.keras.models.Model(
        inputs=[input_ids, attention_mask], outputs=output)
    self.model.compile(optimizer=Adam(lr=lr), loss=loss)
def __init__(self): """Possible states are 1. "await" (awaiting response) 2. "proceed" (proceed with the conversation)- used to give the bot control over the converation""" self._state="await" """Possible Flags are 1. "Exec" (task Executed) 2. "notExec" (proceed with the conversation)- used to give the bot control over the converation""" self._FLAG=None self._bert_base_case_mrpc_tokenizer=AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc") self._bert_base_case_mrpc_model=TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc") self._gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2") self._gpt2_model = TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=self._gpt2_tokenizer.eos_token_id) self.bert_large_uncased_whole_word_masking_finetuned_squad_tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad") self.bert_large_uncased_whole_word_masking_finetuned_squad_model = TFAutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad") self._DialoGP_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium") self._DialoGP_model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-medium") self._conversation_started=False self._conversation_ended=True
def paraphrase(sequence_0, sequence_1):
    from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
    import tensorflow as tf

    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
    model = TFAutoModelForSequenceClassification.from_pretrained(
        "bert-base-cased-finetuned-mrpc")

    classes = ["not paraphrase", "is paraphrase"]

    # sequence_0 = "The company HuggingFace is based in New York City"
    # sequence_1 = "Apples are especially bad for your health"
    sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

    paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="tf")
    not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="tf")

    paraphrase_classification_logits = model(paraphrase)[0]
    not_paraphrase_classification_logits = model(not_paraphrase)[0]

    paraphrase_results = tf.nn.softmax(paraphrase_classification_logits, axis=1).numpy()[0]
    # not_paraphrase_results = tf.nn.softmax(not_paraphrase_classification_logits, axis=1).numpy()[0]

    # Should be paraphrase
    for i in range(len(classes)):
        print(f"{classes[i]}: {int(round(paraphrase_results[i] * 100))}%")

    # Should not be paraphrase
    # for i in range(len(classes)):
    #     print(f"{classes[i]}: {int(round(not_paraphrase_results[i] * 100))}%")
    return
args, _ = parser.parse_known_args()

# Set up logging
logger = logging.getLogger(__name__)

logging.basicConfig(
    level=logging.getLevelName("INFO"),
    handlers=[logging.StreamHandler(sys.stdout)],
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)

strategy = tf.distribute.MirroredStrategy()

with strategy.scope():
    # Load model and tokenizer
    model = TFAutoModelForSequenceClassification.from_pretrained(args.model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    # Load dataset
    train_dataset, test_dataset = load_dataset("imdb", split=["train", "test"])

    # Preprocess train dataset
    train_dataset = train_dataset.map(
        lambda e: tokenizer(e["text"], truncation=True, padding="max_length"),
        batched=True)
    train_dataset.set_format(
        type="tensorflow", columns=["input_ids", "attention_mask", "label"])
    train_features = {
from nltk.corpus import stopwords
from data_utils import data_utils
import os
from transformers import TFBertForQuestionAnswering, BertTokenizer
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer
import tensorflow as tf
from rake_nltk import Rake
import column_types
import json
from clauses import Clause
from conditionmaps import conditions
from nltk.stem import WordNetLemmatizer

qa_model = TFBertForQuestionAnswering.from_pretrained(
    'bert-large-uncased-whole-word-masking-finetuned-squad')
qa_tokenizer = BertTokenizer.from_pretrained(
    'bert-large-uncased-whole-word-masking-finetuned-squad')

rerank_model = TFAutoModelForSequenceClassification.from_pretrained(
    'bert-large-cased-whole-word-masking')
rerank_tokenizer = AutoTokenizer.from_pretrained('bert-large-cased-whole-word-masking')

lemmatizer = WordNetLemmatizer()
lem = lemmatizer.lemmatize

stop_words = set(stopwords.words('english'))


def extract_keywords_from_doc(doc, phrases=True, return_scores=False):
    if phrases:
        r = Rake()
        if isinstance(doc, (list, tuple)):
            r.extract_keywords_from_sentences(doc)
        else:
            r.extract_keywords_from_text(doc)
        if return_scores:
            return [(b, a) for a, b in r.get_ranked_phrases_with_scores()]
def _load_remote_model(self, model_name, tokenizer_kwargs, model_kwargs):
    do_lower_case = False
    if 'uncased' in model_name.lower():
        do_lower_case = True
    tokenizer_kwargs.update({'do_lower_case': do_lower_case})

    self._tokenizer = AutoTokenizer.from_pretrained(model_name, **tokenizer_kwargs)
    self._config = AutoConfig.from_pretrained(model_name)

    temporary_path = self._get_temporary_path()
    make_dir(temporary_path)

    # TensorFlow model
    try:
        self._model = TFAutoModelForSequenceClassification.from_pretrained(
            model_name, from_pt=False)
    # PyTorch model
    except TypeError:
        try:
            self._model = \
                TFAutoModelForSequenceClassification.from_pretrained(
                    model_name, from_pt=True
                )
        # Loading a TF model from a PyTorch checkpoint is not supported
        # when using a model identifier name
        except OSError:
            model = AutoModel.from_pretrained(model_name)
            model.save_pretrained(temporary_path)
            self._model = \
                TFAutoModelForSequenceClassification.from_pretrained(
                    temporary_path, from_pt=True
                )

    # Clean the model's last layer if the provided properties are different
    clean_last_layer = False
    for key, value in model_kwargs.items():
        if not hasattr(self._model.config, key):
            clean_last_layer = True
            break
        if getattr(self._model.config, key) != value:
            clean_last_layer = True
            break

    if clean_last_layer:
        try:
            getattr(self._model,
                    self._get_model_family()).save_pretrained(temporary_path)
            self._model = self._model.__class__.from_pretrained(
                temporary_path, from_pt=False, **model_kwargs)
        # The model is itself the main layer
        except AttributeError:
            # TensorFlow model
            try:
                self._model = self._model.__class__.from_pretrained(
                    model_name, from_pt=False, **model_kwargs)
            # PyTorch Model
            except (OSError, TypeError):
                model = AutoModel.from_pretrained(model_name)
                model.save_pretrained(temporary_path)
                self._model = self._model.__class__.from_pretrained(
                    temporary_path, from_pt=True, **model_kwargs)

    remove_dir(temporary_path)
    assert self._tokenizer and self._model
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, GlueDataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(
        "n_replicas: %s, distributed training: %s, 16-bits training: %s",
        training_args.n_replicas,
        bool(training_args.n_replicas > 1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    try:
        num_labels = glue_tasks_num_labels["mnli" if data_args.task_name == "mnli-mm" else data_args.task_name]
        output_mode = glue_output_modes[data_args.task_name]
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    with training_args.strategy.scope():
        model = TFAutoModelForSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            from_pt=bool(".bin" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

    # Get datasets
    train_dataset = (
        get_tfds(
            task_name=data_args.task_name,
            tokenizer=tokenizer,
            max_seq_length=data_args.max_seq_length,
            data_dir=data_args.data_dir,
        )
        if training_args.do_train
        else None
    )
    eval_dataset = (
        get_tfds(
            task_name=data_args.task_name,
            tokenizer=tokenizer,
            max_seq_length=data_args.max_seq_length,
            mode=Split.dev,
            data_dir=data_args.data_dir,
        )
        if training_args.do_eval
        else None
    )

    def compute_metrics(p: EvalPrediction) -> Dict:
        if output_mode == "classification":
            preds = np.argmax(p.predictions, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(p.predictions)
        return glue_compute_metrics(data_args.task_name, preds, p.label_ids)

    # Initialize our Trainer
    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()
        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")

        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key, value in result.items():
                logger.info("  %s = %s", key, value)
                writer.write("%s = %s\n" % (key, value))
            results.update(result)

    return results
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(
        "n_replicas: %s, distributed training: %s, 16-bits training: %s",
        training_args.n_replicas,
        bool(training_args.n_replicas > 1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    train_dataset, eval_dataset, test_ds, label2id = get_tfds(
        train_file=data_args.train_file,
        eval_file=data_args.dev_file,
        test_file=data_args.test_file,
        tokenizer=tokenizer,
        label_column_id=data_args.label_column_id,
        max_seq_length=data_args.max_seq_length,
    )

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=len(label2id),
        label2id=label2id,
        id2label={id: label for label, id in label2id.items()},
        finetuning_task="text-classification",
        cache_dir=model_args.cache_dir,
    )

    with training_args.strategy.scope():
        model = TFAutoModelForSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            from_pt=bool(".bin" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": (preds == p.label_ids).mean()}

    # Initialize our Trainer
    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        result = trainer.evaluate()
        trainer.log_metrics("eval", result)
        trainer.save_metrics("eval", result)
        results.update(result)

    return results
def main():
    parser = HfArgumentParser((GeneralArguments, ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        op_args, model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        op_args, model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Global setup
    labels = op_args._labels()
    output_mode = op_args.output_mode
    if output_mode == 'classification':
        label2id = {label: i for i, label in enumerate(labels)}
        id2label = {v: k for k, v in label2id.items()}
    else:
        num_labels = 1

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if output_mode == 'classification':
        config = AutoConfig.from_pretrained(
            model_args.config_name if model_args.config_name else model_args.model_name_or_path,
            label2id=label2id,
            id2label=id2label,
            finetuning_task=data_args.task_name,
            cache_dir=model_args.cache_dir,
        )
    else:
        config = AutoConfig.from_pretrained(
            model_args.config_name if model_args.config_name else model_args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=data_args.task_name,
            cache_dir=model_args.cache_dir,
        )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_dataset = (
        YourDataset(data_args, op_args=op_args, tokenizer=tokenizer, mode=Split.train) if training_args.do_train else None
    )
    eval_dataset = (
        YourDataset(data_args, op_args=op_args, tokenizer=tokenizer, mode=Split.dev) if training_args.do_eval else None
    )
    test_dataset = (
        YourDataset(data_args, op_args=op_args, tokenizer=tokenizer, mode=Split.test) if training_args.do_predict else None
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=mode_compute_metrics(op_args.output_mode),
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
        )
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Convert PT to TF
    if op_args.convert_to_tf:
        logger.info("***** Convert PT to TF {} *****".format(training_args.output_dir + 'pytorch_model.bin'))
        tf_model = TFAutoModelForSequenceClassification.from_pretrained(training_args.output_dir, from_pt=True)
        tf_model.save_pretrained(training_args.output_dir)

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle double evaluation (matched, mis-matched)
        eval_datasets = [eval_dataset]

        for eval_dataset in eval_datasets:
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(
                training_args.output_dir, f"eval_results_{eval_dataset.args.task_name}.txt"
            )
            if trainer.is_world_master():
                with open(output_eval_file, "w") as writer:
                    logger.info("***** Eval results {} *****".format(eval_dataset.args.task_name))
                    for key, value in eval_result.items():
                        logger.info("  %s = %s", key, value)
                        writer.write("%s = %s\n" % (key, value))

            eval_results.update(eval_result)

    if training_args.do_predict:
        logging.info("*** Test ***")
        test_datasets = [test_dataset]

        for test_dataset in test_datasets:
            predictions = trainer.predict(test_dataset=test_dataset).predictions
            if output_mode == "classification":
                predictions = np.argmax(predictions, axis=1)

            output_test_file = os.path.join(
                training_args.output_dir, f"test_results_{test_dataset.args.task_name}.txt"
            )
            if trainer.is_world_master():
                with open(output_test_file, "w") as writer:
                    logger.info("***** Test results {} *****".format(test_dataset.args.task_name))
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        if output_mode == "regression":
                            writer.write("%d\t%3.3f\n" % (index, item))
                        else:
                            item = test_dataset.get_labels()[item]
                            writer.write("%d\t%s\n" % (index, item))

    return eval_results
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)

small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(104 * 3))
full_eval_dataset = tokenized_datasets["test"]

import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification
from datetime import datetime

gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

tf_eval_dataset = small_eval_dataset.remove_columns(["text"]).with_format("tensorflow")

eval_features = {
    x: tf_eval_dataset[x].to_tensor() for x in tokenizer.model_input_names
}

eval_tf_dataset = tf.data.Dataset.from_tensor_slices(
    (eval_features, tf_eval_dataset["label"]))
eval_tf_dataset = eval_tf_dataset.batch(8)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
def __call__(self, text_inputs):
    raw_outputs = self.pipeline(text_inputs)
    outputs = []
    for output in raw_outputs:
        score = output["score"]
        if output["label"] == "POSITIVE":
            outputs.append([1 - score, score])
        else:
            outputs.append([score, 1 - score])
    return np.array(outputs)


# Create the model: a French sentiment analysis model.
# see https://github.com/TheophileBlard/french-sentiment-analysis-with-bert
model = TFAutoModelForSequenceClassification.from_pretrained("tblard/tf-allocine")
tokenizer = AutoTokenizer.from_pretrained("tblard/tf-allocine")
pipeline = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

model_wrapper = HuggingFaceSentimentAnalysisPipelineWrapper(pipeline)

# Create the recipe: PWWS uses a WordNet transformation.
recipe = PWWSRen2019.build(model_wrapper)

# WordNet defaults to english. Set the default language to French ('fra')
#
# See
# "Building a free French wordnet from multilingual resources",
# E. L. R. A. (ELRA) (ed.),
# Proceedings of the Sixth International Language Resources and Evaluation (LREC'08).
recipe.transformation.language = "fra"
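# A hedged sketch of calling the wrapper once the pipeline above is built; the French
# input sentences are assumptions, not part of the original example.
scores = model_wrapper(["Ce film est magnifique.", "Quel navet, je me suis ennuyé."])
print(scores)  # one [negative, positive] probability pair per input sentence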
def main():
    # region Argument parsing
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if not (training_args.do_train or training_args.do_eval or training_args.do_predict):
        exit("Must specify at least one of --do_train, --do_eval or --do_predict!")
    # endregion

    # region Checkpoints
    checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        checkpoint = get_last_checkpoint(training_args.output_dir)
        if checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
    # endregion

    # region Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")
    # endregion

    # region Dataset and labels
    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Downloading and loading a dataset from the hub. In distributed training, the load_dataset function guarantee
    # that only one local process can concurrently download the dataset.
    datasets = load_dataset("glue", data_args.task_name, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    is_regression = data_args.task_name == "stsb"
    if not is_regression:
        label_list = datasets["train"].features["label"].names
        num_labels = len(label_list)
    else:
        num_labels = 1

    if data_args.predict_file is not None:
        logger.info("Preparing user-supplied file for predictions...")

        data_files = {"data": data_args.predict_file}

        for key in data_files.keys():
            logger.info(f"Loading a local file for {key}: {data_files[key]}")

        if data_args.predict_file.endswith(".csv"):
            # Loading a dataset from local csv files
            user_dataset = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir)
        else:
            # Loading a dataset from local json files
            user_dataset = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir)
        needed_keys = task_to_keys[data_args.task_name]
        for key in needed_keys:
            assert key in user_dataset["data"].features, f"Your supplied predict_file is missing the {key} key!"
        datasets["user_data"] = user_dataset["data"]
    # endregion

    # region Load model config and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # endregion

    # region Dataset preprocessing
    sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
    non_label_column_names = [name for name in datasets["train"].column_names if name != "label"]

    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
        padding = False

    # Some models have set the order of the labels to use, so let's make sure we do use it.
    label_to_id = None
    if config.label2id != PretrainedConfig(num_labels=num_labels).label2id and not is_regression:
        # Some have all caps in their config, some don't.
        label_name_to_id = {k.lower(): v for k, v in config.label2id.items()}
        if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
            label_to_id = {i: int(label_name_to_id[label_list[i]]) for i in range(num_labels)}
        else:
            logger.warning(
                "Your model seems to have been trained with labels, but they don't match the dataset: ",
                f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
                "\nIgnoring the model labels as a result.",
            )
            label_to_id = {label: i for i, label in enumerate(label_list)}
    if label_to_id is not None:
        config.label2id = label_to_id
        config.id2label = {id: label for label, id in config.label2id.items()}
    elif data_args.task_name is not None and not is_regression:
        config.label2id = {l: i for i, l in enumerate(label_list)}
        config.id2label = {id: label for label, id in config.label2id.items()}

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def preprocess_function(examples):
        # Tokenize the texts
        args = (
            (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
        )
        result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)

        return result

    datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache)
    # endregion

    # region Metric function
    metric = load_metric("glue", data_args.task_name)

    def compute_metrics(preds, label_ids):
        preds = preds["logits"]
        preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
        result = metric.compute(predictions=preds, references=label_ids)
        if len(result) > 1:
            result["combined_score"] = np.mean(list(result.values())).item()
        return result
    # endregion

    with training_args.strategy.scope():
        # region Load pretrained model
        if checkpoint is None:
            model_path = model_args.model_name_or_path
        else:
            model_path = checkpoint
        model = TFAutoModelForSequenceClassification.from_pretrained(
            model_path,
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
        # endregion

        # region Optimizer, loss and compilation
        optimizer = tf.keras.optimizers.Adam(
            learning_rate=training_args.learning_rate,
            beta_1=training_args.adam_beta1,
            beta_2=training_args.adam_beta2,
            epsilon=training_args.adam_epsilon,
            clipnorm=training_args.max_grad_norm,
        )
        if is_regression:
            loss_fn = tf.keras.losses.MeanSquaredError()
            metrics = []
        else:
            loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            metrics = ["accuracy"]
        model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)
        # endregion

        # region Convert data to a tf.data.Dataset
        tf_data = dict()
        if isinstance(training_args.strategy, tf.distribute.TPUStrategy) or data_args.pad_to_max_length:
            logger.info("Padding all batches to max length because argument was set or we're on TPU.")
            dataset_mode = "constant_batch"
        else:
            dataset_mode = "variable_batch"
        max_samples = {
            "train": data_args.max_train_samples,
            "validation": data_args.max_eval_samples,
            "validation_matched": data_args.max_eval_samples,
            "validation_mismatched": data_args.max_eval_samples,
            "test": data_args.max_predict_samples,
            "test_matched": data_args.max_predict_samples,
            "test_mismatched": data_args.max_predict_samples,
            "user_data": None,
        }
        for key in datasets.keys():
            if key == "train" or key.startswith("validation"):
                assert "label" in datasets[key].features, f"Missing labels from {key} data!"
            if key == "train":
                shuffle = True
                batch_size = training_args.per_device_train_batch_size
                drop_remainder = True  # Saves us worrying about scaling gradients for the last batch
            else:
                shuffle = False
                batch_size = training_args.per_device_eval_batch_size
                drop_remainder = False
            samples_limit = max_samples[key]
            dataset = datasets[key]
            if samples_limit is not None:
                dataset = dataset.select(range(samples_limit))
            data = convert_dataset_for_tensorflow(
                dataset,
                non_label_column_names,
                batch_size=batch_size,
                dataset_mode=dataset_mode,
                drop_remainder=drop_remainder,
                shuffle=shuffle,
            )
            tf_data[key] = data
        # endregion

        # region Training and validation
        if training_args.do_train:
            callbacks = [SavePretrainedCallback(output_dir=training_args.output_dir)]
            if training_args.do_eval and not data_args.task_name == "mnli":
                # Do both evaluation and training in the Keras fit loop, unless the task is MNLI
                # because MNLI has two validation sets
                validation_data = tf_data["validation"]
            else:
                validation_data = None
            model.fit(
                tf_data["train"],
                validation_data=validation_data,
                epochs=int(training_args.num_train_epochs),
                callbacks=callbacks,
            )
        # endregion

        # region Evaluation
        if training_args.do_eval:
            # We normally do validation as part of the Keras fit loop, but we run it independently
            # if there was no fit() step (because we didn't train the model) or if the task is MNLI,
            # because MNLI has a separate validation-mismatched validation set
            logger.info("*** Evaluate ***")

            # Loop to handle MNLI double evaluation (matched, mis-matched)
            if data_args.task_name == "mnli":
                tasks = ["mnli", "mnli-mm"]
                tf_datasets = [tf_data["validation_matched"], tf_data["validation_mismatched"]]
                raw_datasets = [datasets["validation_matched"], datasets["validation_mismatched"]]
            else:
                tasks = [data_args.task_name]
                tf_datasets = [tf_data["validation"]]
                raw_datasets = [datasets["validation"]]

            for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets, tasks):
                eval_predictions = model.predict(tf_dataset)
                eval_metrics = compute_metrics(eval_predictions, raw_dataset["label"])
                print(f"Evaluation metrics ({task}):")
                print(eval_metrics)
        # endregion

        # region Prediction
        if training_args.do_predict or data_args.predict_file:
            logger.info("*** Predict ***")

            # Loop to handle MNLI double evaluation (matched, mis-matched)
            tasks = []
            tf_datasets = []
            raw_datasets = []
            if training_args.do_predict:
                if data_args.task_name == "mnli":
                    tasks.extend(["mnli", "mnli-mm"])
                    tf_datasets.extend([tf_data["test_matched"], tf_data["test_mismatched"]])
                    raw_datasets.extend([datasets["test_matched"], datasets["test_mismatched"]])
                else:
                    tasks.append(data_args.task_name)
                    tf_datasets.append(tf_data["test"])
                    raw_datasets.append(datasets["test"])
            if data_args.predict_file:
                tasks.append("user_data")
                tf_datasets.append(tf_data["user_data"])
                raw_datasets.append(datasets["user_data"])

            for raw_dataset, tf_dataset, task in zip(raw_datasets, tf_datasets, tasks):
                test_predictions = model.predict(tf_dataset)
                if "label" in raw_dataset:
                    test_metrics = compute_metrics(test_predictions, raw_dataset["label"])
                    print(f"Test metrics ({task}):")
                    print(test_metrics)

                if is_regression:
                    predictions_to_write = np.squeeze(test_predictions["logits"])
                else:
                    predictions_to_write = np.argmax(test_predictions["logits"], axis=1)

                output_predict_file = os.path.join(training_args.output_dir, f"predict_results_{task}.txt")
                with open(output_predict_file, "w") as writer:
                    logger.info(f"***** Writing prediction results for {task} *****")
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions_to_write):
                        if is_regression:
                            writer.write(f"{index}\t{item:3.3f}\n")
                        else:
                            item = model.config.id2label[item]
                            writer.write(f"{index}\t{item}\n")
        # endregion
def save_tf_model_from_transformers():
    model = TFAutoModelForSequenceClassification.from_pretrained(
        "distilbert-base-uncased-finetuned-sst-2-english")

    callable = tf.function(model.call)
    concrete_function = callable.get_concrete_function([
        tf.TensorSpec([None, MAX_SEQ_LEN], tf.int32, name="input_ids"),
        tf.TensorSpec([None, MAX_SEQ_LEN], tf.int32, name="attention_mask"),
    ])

    model.save('saved_model/distilbert/1', signatures=concrete_function)
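# A hedged sketch of sanity-checking the exported SavedModel before serving it.
# Assumptions: MAX_SEQ_LEN matches the value used at export time, the tokenizer matches the
# exported checkpoint, and the single exported signature is available as "serving_default".
import tensorflow as tf
from transformers import AutoTokenizer

MAX_SEQ_LEN = 128  # assumed; must equal the value used in get_concrete_function above
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")

loaded = tf.saved_model.load("saved_model/distilbert/1")
serving_fn = loaded.signatures["serving_default"]

enc = tokenizer("A quick smoke test.", padding="max_length", truncation=True,
                max_length=MAX_SEQ_LEN, return_tensors="tf")
outputs = serving_fn(input_ids=enc["input_ids"], attention_mask=enc["attention_mask"])
print(outputs)  # dict of output tensors (classification logits)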
def main():
    # region Argument parsing
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_text_classification", model_args, data_args, framework="tensorflow")

    output_dir = Path(training_args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # endregion

    # region Checkpoints
    # Detecting last checkpoint.
    checkpoint = None
    if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir:
        if (output_dir / CONFIG_NAME).is_file() and (output_dir / TF2_WEIGHTS_NAME).is_file():
            checkpoint = output_dir
            logger.info(
                f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this"
                " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
        else:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to continue regardless."
            )
    # endregion

    # region Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO)

    logger.info(f"Training/evaluation parameters {training_args}")
    # endregion

    # region Loading data
    # For CSV/JSON files, this script will use the 'label' field as the label and the 'sentence1' and optionally
    # 'sentence2' fields as inputs if they exist. If not, the first two fields not named label are used if at least two
    # columns are provided. Note that the term 'sentence' can be slightly misleading, as they often contain more than
    # a single grammatical sentence, when the task requires it.
    #
    # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this
    # single column. You can easily tweak this behavior (see below)
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    data_files = {"train": data_args.train_file, "validation": data_args.validation_file, "test": data_args.test_file}
    data_files = {key: file for key, file in data_files.items() if file is not None}

    for key in data_files.keys():
        logger.info(f"Loading a local file for {key}: {data_files[key]}")

    if data_args.input_file_extension == "csv":
        # Loading a dataset from local csv files
        datasets = load_dataset(
            "csv",
            data_files=data_files,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        # Loading a dataset from local json files
        datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    # endregion

    # region Label preprocessing
    # If you've passed us a training set, we try to infer your labels from it
    if "train" in datasets:
        # By default we assume that if your label column looks like a float then you're doing regression,
        # and if not then you're doing classification. This is something you may want to change!
        is_regression = datasets["train"].features["label"].dtype in ["float32", "float64"]
        if is_regression:
            num_labels = 1
        else:
            # A useful fast method:
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
            label_list = datasets["train"].unique("label")
            label_list.sort()  # Let's sort it for determinism
            num_labels = len(label_list)
    # If you haven't passed a training set, we read label info from the saved model (this happens later)
    else:
        num_labels = None
        label_list = None
        is_regression = None
    # endregion

    # region Load model config and tokenizer
    if checkpoint is not None:
        config_path = training_args.output_dir
    elif model_args.config_name:
        config_path = model_args.config_name
    else:
        config_path = model_args.model_name_or_path
    if num_labels is not None:
        config = AutoConfig.from_pretrained(
            config_path,
            num_labels=num_labels,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        config = AutoConfig.from_pretrained(
            config_path,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # endregion

    # region Dataset preprocessing
    # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
    column_names = {col for cols in datasets.column_names.values() for col in cols}
    non_label_column_names = [name for name in column_names if name != "label"]
    if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
        sentence1_key, sentence2_key = "sentence1", "sentence2"
    elif "sentence1" in non_label_column_names:
        sentence1_key, sentence2_key = "sentence1", None
    else:
        if len(non_label_column_names) >= 2:
            sentence1_key, sentence2_key = non_label_column_names[:2]
        else:
            sentence1_key, sentence2_key = non_label_column_names[0], None

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    # Ensure that our labels match the model's, if it has some pre-specified
    if "train" in datasets:
        if not is_regression and config.label2id != PretrainedConfig(num_labels=num_labels).label2id:
            label_name_to_id = config.label2id
            if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
                label_to_id = label_name_to_id  # Use the model's labels
            else:
                logger.warning(
                    "Your model seems to have been trained with labels, but they don't match the dataset: ",
                    f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels:"
                    f" {list(sorted(label_list))}.\nIgnoring the model labels as a result.",
                )
                label_to_id = {v: i for i, v in enumerate(label_list)}
        elif not is_regression:
            label_to_id = {v: i for i, v in enumerate(label_list)}
        else:
            label_to_id = None
        # Now we've established our label2id, let's overwrite the model config with it.
        config.label2id = label_to_id
        if config.label2id is not None:
            config.id2label = {id: label for label, id in label_to_id.items()}
        else:
            config.id2label = None
    else:
        label_to_id = config.label2id  # Just load the data from the model

    if "validation" in datasets and config.label2id is not None:
        validation_label_list = datasets["validation"].unique("label")
        for val_label in validation_label_list:
            assert val_label in label_to_id, f"Label {val_label} is in the validation set but not the training set!"

    def preprocess_function(examples):
        # Tokenize the texts
        args = (
            (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
        )
        result = tokenizer(*args, max_length=max_seq_length, truncation=True)

        # Map labels to IDs
        if config.label2id is not None and "label" in examples:
            result["label"] = [(config.label2id[l] if l != -1 else -1) for l in examples["label"]]
        return result

    datasets = datasets.map(preprocess_function, batched=True, load_from_cache_file=not data_args.overwrite_cache)

    if data_args.pad_to_max_length:
        data_collator = DefaultDataCollator(return_tensors="tf")
    else:
        data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
    # endregion

    with training_args.strategy.scope():
        # region Load pretrained model
        # Set seed before initializing model
        set_seed(training_args.seed)
        #
        # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
        # download model & vocab.
        if checkpoint is None:
            model_path = model_args.model_name_or_path
        else:
            model_path = checkpoint
        model = TFAutoModelForSequenceClassification.from_pretrained(
            model_path,
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
        # endregion

        # region Optimizer, loss and compilation
        optimizer = tf.keras.optimizers.Adam(
            learning_rate=training_args.learning_rate,
            beta_1=training_args.adam_beta1,
            beta_2=training_args.adam_beta2,
            epsilon=training_args.adam_epsilon,
            clipnorm=training_args.max_grad_norm,
        )
        if is_regression:
            loss_fn = tf.keras.losses.MeanSquaredError()
            metrics = []
        else:
            loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            metrics = ["accuracy"]
        model.compile(optimizer=optimizer, loss=loss_fn, metrics=metrics)
        # endregion

        # region Convert data to a tf.data.Dataset
        tf_data = dict()
        max_samples = {
            "train": data_args.max_train_samples,
            "validation": data_args.max_val_samples,
            "test": data_args.max_test_samples,
        }
        for key in ("train", "validation", "test"):
            if key not in datasets:
                tf_data[key] = None
                continue
            if key in ("train", "validation"):
                assert "label" in datasets[key].features, f"Missing labels from {key} data!"
            if key == "train":
                shuffle = True
                batch_size = training_args.per_device_train_batch_size
                drop_remainder = True  # Saves us worrying about scaling gradients for the last batch
            else:
                shuffle = False
                batch_size = training_args.per_device_eval_batch_size
                drop_remainder = False
            samples_limit = max_samples[key]
            dataset = datasets[key]
            if samples_limit is not None:
                dataset = dataset.select(range(samples_limit))
            data = dataset.to_tf_dataset(
                columns=[col for col in dataset.column_names if col not in set(non_label_column_names + ["label"])],
                shuffle=shuffle,
                batch_size=batch_size,
                collate_fn=data_collator,
                drop_remainder=drop_remainder,
                # `label_cols` is needed for user-defined losses, such as in this example
                label_cols="label" if "label" in dataset.column_names else None,
            )
            tf_data[key] = data
        # endregion

        # region Training and validation
        if tf_data["train"] is not None:
            callbacks = [SavePretrainedCallback(output_dir=training_args.output_dir)]
            model.fit(
                tf_data["train"],
                validation_data=tf_data["validation"],
                epochs=int(training_args.num_train_epochs),
                callbacks=callbacks,
            )
        elif tf_data["validation"] is not None:
            # If there's a validation dataset but no training set, just evaluate the metrics
            logger.info("Computing metrics on validation data...")
            if is_regression:
                loss = model.evaluate(tf_data["validation"])
                logger.info(f"Loss: {loss:.5f}")
            else:
                loss, accuracy = model.evaluate(tf_data["validation"])
                logger.info(f"Loss: {loss:.5f}, Accuracy: {accuracy * 100:.4f}%")
        # endregion

        # region Prediction
        if tf_data["test"] is not None:
            logger.info("Doing predictions on test dataset...")
            predictions = model.predict(tf_data["test"])["logits"]
            predicted_class = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
            output_test_file = os.path.join(training_args.output_dir, "test_results.txt")
            with open(output_test_file, "w") as writer:
                writer.write("index\tprediction\n")
                for index, item in enumerate(predicted_class):
                    if is_regression:
                        writer.write(f"{index}\t{item:3.3f}\n")
                    else:
                        item = config.id2label[item]
                        writer.write(f"{index}\t{item}\n")
            logger.info(f"Wrote predictions to {output_test_file}!")
        # endregion

    # region Prediction losses
    # This section is outside the scope() because it's very quick to compute, but behaves badly inside it
    if "test" in datasets and "label" in datasets["test"].features:
        print("Computing prediction loss on test labels...")
        labels = datasets["test"]["label"]
        loss = float(loss_fn(labels, predictions).numpy())
        print(f"Test loss: {loss:.4f}")
def main():
    parser = argparse.ArgumentParser()

    # Hyperparameters sent by the client are passed as command-line arguments to the script.
    parser.add_argument("--epochs", type=int, default=3)
    parser.add_argument("--train_batch_size", type=int, default=8)
    parser.add_argument("--eval_batch_size", type=int, default=4)
    parser.add_argument("--model_name_or_path", type=str)
    parser.add_argument("--learning_rate", type=float, default=5e-5)
    parser.add_argument("--do_train", type=bool, default=True)
    parser.add_argument("--do_eval", type=bool, default=True)

    # Data, model, and output directories
    parser.add_argument("--output_data_dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
    parser.add_argument("--model_dir", type=str, default=os.environ["SM_MODEL_DIR"])
    parser.add_argument("--n_gpus", type=str, default=os.environ["SM_NUM_GPUS"])

    args, _ = parser.parse_known_args()

    # Set up logging
    logger = logging.getLogger(__name__)

    logging.basicConfig(
        level=logging.getLevelName("INFO"),
        handlers=[logging.StreamHandler(sys.stdout)],
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )
    logger.info(args)

    #
    # Preprocessing
    #

    # load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)

    # Load dataset
    train_dataset, test_dataset = load_dataset("imdb", split=["train", "test"])

    # Preprocess train dataset
    train_dataset = train_dataset.map(
        lambda e: tokenizer(e["text"], truncation=True, padding="longest"), batched=True)
    train_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"])

    train_features = {
        x: train_dataset[x].to_tensor(default_value=0, shape=[None, tokenizer.model_max_length])
        for x in ["input_ids", "attention_mask"]
    }
    tf_train_dataset = tf.data.Dataset.from_tensor_slices(
        (train_features, train_dataset["label"])).batch(args.train_batch_size)

    # Preprocess test dataset
    test_dataset = test_dataset.map(
        lambda e: tokenizer(e["text"], truncation=True, padding="max_length"), batched=True)
    test_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "label"])

    test_features = {
        x: test_dataset[x].to_tensor(default_value=0, shape=[None, tokenizer.model_max_length])
        for x in ["input_ids", "attention_mask"]
    }
    tf_test_dataset = tf.data.Dataset.from_tensor_slices(
        (test_features, test_dataset["label"])).batch(args.eval_batch_size)

    #
    # Training
    #

    # Load model
    # implementation is based on the horovod implementation in combination with sagemaker
    # https://horovod.readthedocs.io/en/stable/keras.html

    # adjust optimizer
    # https://sagemaker.readthedocs.io/en/stable/api/training/sdp_versions/smd_data_parallel_tensorflow.html#smdistributed.dataparallel.tensorflow.DistributedOptimizer
    learning_rate = args.learning_rate * hvd.size()
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    optimizer = hvd.DistributedOptimizer(optimizer)

    # define optimizer and loss
    model = TFAutoModelForSequenceClassification.from_pretrained(args.model_name_or_path)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]

    # Specify `experimental_run_tf_function=False` to ensure TensorFlow
    # uses hvd.DistributedOptimizer() to compute gradients.
    model.compile(optimizer=optimizer, loss=loss, metrics=metrics, experimental_run_tf_function=False)

    # callbacks
    # https://horovod.readthedocs.io/en/stable/api.html#horovod.tensorflow.keras.callbacks.BroadcastGlobalVariablesCallback
    BroadcastGlobalVariablesCallback = hvd.callbacks.BroadcastGlobalVariablesCallback
    callbacks = [
        # broadcast initial variable states from rank 0 to all other processes.
        # This is necessary to ensure consistent initialization of all workers when
        # training is started with random weights or restored from a checkpoint.
        BroadcastGlobalVariablesCallback(0),
    ]
    # save checkpoints only on worker 0 to prevent other workers from corrupting them.
    # if hvd.rank() == 0:
    #     callbacks.append(tf.keras.callbacks.ModelCheckpoint("./checkpoint-{epoch}.h5"))

    # Training
    if args.do_train:
        logger.info("*** Train ***")
        start_time = time.time()

        # batch_size https://github.com/horovod/horovod/issues/1617
        # will be spread across all devices equally so. E.g.: train_batch_size 8, n_gpus 4 === 32
        train_results = model.fit(
            tf_train_dataset,
            epochs=args.epochs,
            # steps_per_epoch=500 // hvd.size(),
            callbacks=callbacks,
            validation_batch_size=args.eval_batch_size,
            batch_size=args.train_batch_size,
            verbose=1 if hvd.rank() == 0 else 0,
        )
        train_runtime = {"train_runtime": round(time.time() - start_time, 4)}
        logger.info(f"train_runtime = {train_runtime}\n")

        output_train_file = os.path.join(args.output_data_dir, "train_results.txt")

        with open(output_train_file, "w") as writer:
            logger.info("***** Train results *****")
            logger.info(train_results)
            for key, value in train_results.history.items():
                logger.info("  %s = %s", key, value)
                writer.write("%s = %s\n" % (key, value))
            writer.write(f"train_runtime = {train_runtime}\n")

    # Evaluation
    if args.do_eval:
        logger.info("*** Evaluate ***")

        result = model.evaluate(tf_test_dataset, batch_size=args.eval_batch_size, return_dict=True)

        output_eval_file = os.path.join(args.output_data_dir, "eval_results.txt")

        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info(result)
            for key, value in result.items():
                logger.info("  %s = %s", key, value)
                writer.write("%s = %s\n" % (key, value))

    # save checkpoints only on worker 0 to prevent other workers from corrupting them.
    model.save_pretrained(args.model_dir)
    tokenizer.save_pretrained(args.model_dir)