def load_qa_from_pretrained(
    model: Optional[tf.keras.Model] = None,
    name: Optional[str] = None,
    path: Optional[str] = None,  # path to checkpoint from TF...ForPreTraining
    config: Optional[PretrainedConfig] = None,
) -> tf.keras.Model:
    """
    Load a TF...QuestionAnswering model by taking the main layer of a pretrained model.
    Preserves the model.config attribute.
    """
    assert (
        bool(name) ^ bool(model) ^ (bool(path) and bool(config))
    ), "Pass either name, model, or (path and config)"

    if name is not None:
        return TFAutoModelForQuestionAnswering.from_pretrained(name)
    elif model is not None:
        pretrained_model = model
    elif path is not None:
        pretrained_model = TFAutoModelForPreTraining.from_config(config)
        pretrained_model.load_weights(path)

    qa_model = TFAutoModelForQuestionAnswering.from_config(pretrained_model.config)
    pretrained_main_layer = getattr(pretrained_model, qa_model.base_model_prefix)
    # Use qa_model.base_model_prefix here; the `model` argument may be None
    assert (
        pretrained_main_layer is not None
    ), f"{pretrained_model} has no attribute '{qa_model.base_model_prefix}'"
    # Generalized way of saying `qa_model.albert = pretrained_model.albert`
    setattr(qa_model, qa_model.base_model_prefix, pretrained_main_layer)
    return qa_model
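# Usage sketch for load_qa_from_pretrained(); the model name and checkpoint path
# below are placeholders, not values from the original source.
qa_from_hub = load_qa_from_pretrained(name="albert-base-v2")

# Or, from a pretraining checkpoint plus its config (assumes AutoConfig is imported):
# config = AutoConfig.from_pretrained("albert-base-v2")
# qa_from_ckpt = load_qa_from_pretrained(path="checkpoints/pretrain.ckpt", config=config)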
def __init__(self,
             bert_squad_model='bert-large-uncased-whole-word-masking-finetuned-squad',
             bert_emb_model='bert-base-uncased'):
    self.model_name = bert_squad_model
    try:
        self.model = TFAutoModelForQuestionAnswering.from_pretrained(self.model_name)
    except Exception:
        # Fall back to converting a PyTorch checkpoint
        self.model = TFAutoModelForQuestionAnswering.from_pretrained(
            self.model_name, from_pt=True)
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
    self.maxlen = 512
    self.te = tpp.TransformerEmbedding(bert_emb_model, layers=[-2])
def __init__(
    self,
    model_name=DEFAULT_MODEL,
    bert_squad_model=None,
    bert_emb_model="bert-base-uncased",
    framework="tf",
    device=None,
    quantize=False,
):
    model_name = bert_squad_model if bert_squad_model is not None else model_name
    if bert_squad_model:
        warnings.warn(
            "The bert_squad_model argument is deprecated - please use model_name instead.",
            DeprecationWarning,
            stacklevel=2,
        )
    self.model_name = model_name
    self.framework = framework
    if framework == "tf":
        try:
            import tensorflow as tf
        except ImportError:
            raise Exception('If framework=="tf", TensorFlow must be installed.')
        try:
            self.model = TFAutoModelForQuestionAnswering.from_pretrained(
                self.model_name
            )
        except Exception:
            warnings.warn(
                "Could not load supplied model as TensorFlow checkpoint - attempting to load using from_pt=True"
            )
            self.model = TFAutoModelForQuestionAnswering.from_pretrained(
                self.model_name, from_pt=True
            )
    else:
        # set to None and ignore since we only want to use PyTorch
        bert_emb_model = None
        super().__init__(device=device, quantize=quantize)
        self.model = AutoModelForQuestionAnswering.from_pretrained(
            self.model_name
        ).to(self.torch_device)
        if quantize:
            self.model = self.quantize_model(self.model)
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
    self.maxlen = 512
    self.te = (
        tpp.TransformerEmbedding(bert_emb_model, layers=[-2])
        if bert_emb_model is not None
        else None
    )
def semantic_score(sentence, reference):
    """
    Returns the start and end logit scores of the most likely answer span
    for `sentence` against `reference` (the original docstring incorrectly
    said it returned the answer string).
    """
    tokenizer = AutoTokenizer.from_pretrained(
        "bert-large-uncased-whole-word-masking-finetuned-squad")
    model = TFAutoModelForQuestionAnswering.from_pretrained(
        "bert-large-uncased-whole-word-masking-finetuned-squad",
        return_dict=True)
    inputs = tokenizer(sentence, reference, add_special_tokens=True,
                       return_tensors="tf")
    input_ids = inputs["input_ids"].numpy()[0]
    text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    output = model(inputs)

    answer_start = tf.argmax(output.start_logits, axis=1).numpy()[0]
    start = output.start_logits[:, answer_start].numpy()[0]
    print('start:', answer_start, start)

    # answer_end is exclusive (argmax + 1), so score the token at answer_end - 1,
    # and read it from end_logits (the original read start_logits here)
    answer_end = (tf.argmax(output.end_logits, axis=1) + 1).numpy()[0]
    end = output.end_logits[:, answer_end - 1].numpy()[0]
    print('end:', answer_end, end)
    return start, end
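# Usage sketch for semantic_score(); both strings are placeholders.
start_score, end_score = semantic_score(
    "When was the Eiffel Tower built?",
    "The Eiffel Tower was built between 1887 and 1889.")
print(start_score, end_score)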
def load(self):
    self.tokenizer = AutoTokenizer.from_pretrained(
        "bert-large-uncased-whole-word-masking-finetuned-squad")
    self.model = TFAutoModelForQuestionAnswering.from_pretrained(
        "bert-large-uncased-whole-word-masking-finetuned-squad")
    self.ready = True
def __init__(
    self,
    model_name='bert-large-uncased-whole-word-masking-finetuned-squad'
):
    self.model_name = model_name
    self.model = TFAutoModelForQuestionAnswering.from_pretrained(
        self.model_name)
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
def __init__(self, bert_squad_model=DEFAULT_MODEL,
             bert_emb_model='bert-base-uncased'):
    self.model_name = bert_squad_model
    try:
        self.model = TFAutoModelForQuestionAnswering.from_pretrained(
            self.model_name)
    except Exception:
        warnings.warn(
            'Could not load supplied model as TensorFlow checkpoint - '
            'attempting to load using from_pt=True')
        self.model = TFAutoModelForQuestionAnswering.from_pretrained(
            self.model_name, from_pt=True)
    self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
    self.maxlen = 512
    self.te = tpp.TransformerEmbedding(
        bert_emb_model, layers=[-2]) if bert_emb_model is not None else None
def save_tf_model_from_transformers():
    model = TFAutoModelForQuestionAnswering.from_pretrained(
        "distilbert-base-cased-distilled-squad")
    # Trace model.call into a concrete function with fixed-length int32 inputs
    # (renamed from `callable` to avoid shadowing the builtin)
    call_fn = tf.function(model.call)
    concrete_function = call_fn.get_concrete_function([
        tf.TensorSpec([None, MAX_SEQ_LEN], tf.int32, name="input_ids"),
        tf.TensorSpec([None, MAX_SEQ_LEN], tf.int32, name="attention_mask")
    ])
    model.save('saved_model/distilbert_qa/1', signatures=concrete_function)
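# A minimal reload-and-query sketch for the SavedModel exported above. This is a
# sketch under assumptions: the export path and MAX_SEQ_LEN come from
# save_tf_model_from_transformers(), and the serving signature's input names are
# assumed to follow the TensorSpec names given there. Output key names are
# printed rather than assumed.
import tensorflow as tf
from transformers import AutoTokenizer

loaded = tf.saved_model.load('saved_model/distilbert_qa/1')
infer = loaded.signatures['serving_default']
tok = AutoTokenizer.from_pretrained('distilbert-base-cased-distilled-squad')
enc = tok('Who wrote Hamlet?', 'Hamlet was written by William Shakespeare.',
          padding='max_length', max_length=MAX_SEQ_LEN, return_tensors='tf')
outputs = infer(input_ids=tf.cast(enc['input_ids'], tf.int32),
                attention_mask=tf.cast(enc['attention_mask'], tf.int32))
print({k: v.shape for k, v in outputs.items()})  # e.g. start/end logits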
def load_model(self, model_name, model_path, model_type):
    logger.info(">> Loading HF model " + model_name + " from " + model_path)
    self.type = model_type
    self.name = model_name
    self.tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
    self.model = TFAutoModelForQuestionAnswering.from_pretrained(
        model_path, from_pt=True)
def test_question_answering_model_from_pretrained(self):
    # for model_name in TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
    for model_name in ["bert-base-uncased"]:
        config = AutoConfig.from_pretrained(model_name)
        self.assertIsNotNone(config)
        self.assertIsInstance(config, BertConfig)

        model = TFAutoModelForQuestionAnswering.from_pretrained(model_name)
        self.assertIsNotNone(model)
        self.assertIsInstance(model, TFBertForQuestionAnswering)
def test_question_answering_model_from_pretrained(self):
    logging.basicConfig(level=logging.INFO)
    # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
    for model_name in ["bert-base-uncased"]:
        config = AutoConfig.from_pretrained(model_name)
        self.assertIsNotNone(config)
        self.assertIsInstance(config, BertConfig)

        model = TFAutoModelForQuestionAnswering.from_pretrained(model_name)
        self.assertIsNotNone(model)
        self.assertIsInstance(model, TFBertForQuestionAnswering)
def semantic_search(self, sentence):
    """
    Performs semantic search over the corpus of documents in self.corpus_path.

    Args:
        sentence: (str) the sentence from which to perform semantic search.

    Returns:
        (str) the reference text of the document most similar to sentence.
    """
    url = 'bert-large-uncased-whole-word-masking-finetuned-squad'
    tokenizer = BertTokenizer.from_pretrained(url)
    model = TFAutoModelForQuestionAnswering.from_pretrained(
        url, return_dict=True)
    filelist = os.listdir(self.corpus_path)
    maximo = None
    final_file = None
    for file in filelist:
        path_file = self.corpus_path + "/" + file
        if os.path.isfile(path_file):
            with open(path_file, 'rb') as f:
                reference = f.read().decode(errors='replace')
            inputs = tokenizer(sentence, reference, add_special_tokens=True,
                               return_tensors="tf")
            input_ids = inputs["input_ids"].numpy()[0]
            text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
            output = model(inputs)
            answer_start = tf.argmax(output.start_logits, axis=1).numpy()[0]
            # answer_end is exclusive (argmax + 1)
            answer_end = (tf.argmax(output.end_logits, axis=1) + 1).numpy()[0]
            first = output.start_logits[:, answer_start].numpy()[0]
            # Score the end token from end_logits at answer_end - 1
            # (the original read start_logits at answer_end here)
            last = output.end_logits[:, answer_end - 1].numpy()[0]
            if maximo is None or maximo < ((first + last) / 2):
                maximo = (first + last) / 2
                final_file = path_file
    with open(final_file, 'rb') as f:
        result = f.read().decode(errors='replace')
    return result
def load(self):
    # (removed leftover commented-out AlexNet/ImageNet template code)
    self.tokenizer = AutoTokenizer.from_pretrained(
        "bert-large-uncased-whole-word-masking-finetuned-squad")
    self.model = TFAutoModelForQuestionAnswering.from_pretrained(
        "bert-large-uncased-whole-word-masking-finetuned-squad")
    self.ready = True
def camemBert(context, question):
    tokenizer = AutoTokenizer.from_pretrained('camembert-base')
    model = TFAutoModelForQuestionAnswering.from_pretrained("camembert-base")
    inputs = tokenizer.encode_plus(
        question, context, add_special_tokens=True, return_tensors="tf")
    # The .numpy() method explicitly converts a Tensor to a numpy array
    input_ids = inputs["input_ids"].numpy()[0]
    # Tuple unpacking assumes transformers < 4.0 (return_dict=False by default);
    # on newer versions use outputs.start_logits / outputs.end_logits instead
    answer_start_scores, answer_end_scores = model(inputs)
    # Get the most likely beginning of answer with the argmax of the score
    answer_start = tf.argmax(answer_start_scores, axis=1).numpy()[0]
    # Get the most likely end of answer with the argmax of the score; +1 because
    # the upper bound is excluded when slicing
    answer_end = (tf.argmax(answer_end_scores, axis=1) + 1).numpy()[0]
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    return answer
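# Usage sketch for camemBert(); the French strings are placeholders. Note that
# the bare camembert-base checkpoint has no QA fine-tuning, so the extracted
# span is not meaningful until the model is fine-tuned on a French QA dataset.
context = "La tour Eiffel a été construite entre 1887 et 1889."
question = "Quand la tour Eiffel a-t-elle été construite ?"
print(camemBert(context, question))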
def distilBERT(context, question):
    # Use a tokenizer that matches the model checkpoint; the original loaded the
    # bert-large SQuAD tokenizer here, whose token_type_ids DistilBERT does not accept
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
    model = TFAutoModelForQuestionAnswering.from_pretrained(
        "distilbert-base-uncased")
    inputs = tokenizer.encode_plus(
        question, context, add_special_tokens=True, return_tensors="tf")
    # The .numpy() method explicitly converts a Tensor to a numpy array
    input_ids = inputs["input_ids"].numpy()[0]
    # Tuple unpacking assumes transformers < 4.0 (return_dict=False by default)
    answer_start_scores, answer_end_scores = model(inputs)
    # Get the most likely beginning of answer with the argmax of the score
    answer_start = tf.argmax(answer_start_scores, axis=1).numpy()[0]
    # Get the most likely end of answer with the argmax of the score; +1 because
    # the upper bound is excluded when slicing
    answer_end = (tf.argmax(answer_end_scores, axis=1) + 1).numpy()[0]
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    return answer
def comprehesion(text, questions):
    from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
    import tensorflow as tf

    tokenizer = AutoTokenizer.from_pretrained(
        "bert-large-uncased-whole-word-masking-finetuned-squad")
    model = TFAutoModelForQuestionAnswering.from_pretrained(
        "bert-large-uncased-whole-word-masking-finetuned-squad")

    # text = r"""
    # 🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
    # architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
    # Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
    # TensorFlow 2.0 and PyTorch.
    # """
    # questions = [
    #     "How many pretrained models are available in 🤗 Transformers?",
    #     "What does 🤗 Transformers provide?",
    #     "🤗 Transformers provides interoperability between which frameworks?",
    # ]
    for question in questions:
        inputs = tokenizer(question, text, add_special_tokens=True,
                           return_tensors="tf")
        input_ids = inputs["input_ids"].numpy()[0]
        text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        outputs = model(inputs)
        answer_start_scores = outputs.start_logits
        answer_end_scores = outputs.end_logits
        # Get the most likely beginning of answer with the argmax of the score
        answer_start = tf.argmax(answer_start_scores, axis=1).numpy()[0]
        # Get the most likely end of answer with the argmax of the score
        answer_end = (tf.argmax(answer_end_scores, axis=1) + 1).numpy()[0]
        answer = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
        print(f"Question: {question}")
        print(f"Answer: {answer}")
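# Usage sketch for comprehesion(), reusing the example that is commented out
# inside the function body above:
sample_text = r"""
🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
"""
sample_questions = [
    "How many pretrained models are available in 🤗 Transformers?",
    "What does 🤗 Transformers provide?",
    "🤗 Transformers provides interoperability between which frameworks?",
]
comprehesion(sample_text, sample_questions)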
def question_answer(question, reference):
    """
    question: string containing the question to answer
    reference: string containing the text in which to search for the answer

    Returns: string containing the extracted answer
    """
    tokenizer = AutoTokenizer.from_pretrained(
        "bert-large-uncased-whole-word-masking-finetuned-squad")
    model = TFAutoModelForQuestionAnswering.from_pretrained(
        "bert-large-uncased-whole-word-masking-finetuned-squad",
        return_dict=True)
    inputs = tokenizer(question, reference, add_special_tokens=True,
                       return_tensors="tf")
    input_ids = inputs["input_ids"].numpy()[0]
    text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
    output = model(inputs)
    answer_start = tf.argmax(output.start_logits, axis=1).numpy()[0]
    answer_end = (tf.argmax(output.end_logits, axis=1) + 1).numpy()[0]
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    return answer
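# Usage sketch for question_answer(); both strings are placeholders.
reference = "The Apollo 11 mission landed the first humans on the Moon in 1969."
print(question_answer("When did Apollo 11 land on the Moon?", reference))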
def __init__(self):
    """Possible states are
    1. "await" (awaiting response)
    2. "proceed" (proceed with the conversation) - used to give the bot control over the conversation"""
    self._state = "await"
    """Possible flags are
    1. "Exec" (task executed)
    2. "notExec" (task not executed)"""
    self._FLAG = None
    self._bert_base_case_mrpc_tokenizer = AutoTokenizer.from_pretrained(
        "bert-base-cased-finetuned-mrpc")
    self._bert_base_case_mrpc_model = TFAutoModelForSequenceClassification.from_pretrained(
        "bert-base-cased-finetuned-mrpc")
    self._gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    self._gpt2_model = TFGPT2LMHeadModel.from_pretrained(
        "gpt2", pad_token_id=self._gpt2_tokenizer.eos_token_id)
    self.bert_large_uncased_whole_word_masking_finetuned_squad_tokenizer = AutoTokenizer.from_pretrained(
        "bert-large-uncased-whole-word-masking-finetuned-squad")
    self.bert_large_uncased_whole_word_masking_finetuned_squad_model = TFAutoModelForQuestionAnswering.from_pretrained(
        "bert-large-uncased-whole-word-masking-finetuned-squad")
    self._DialoGP_tokenizer = AutoTokenizer.from_pretrained(
        "microsoft/DialoGPT-medium")
    self._DialoGP_model = AutoModelWithLMHead.from_pretrained(
        "microsoft/DialoGPT-medium")
    self._conversation_started = False
    self._conversation_ended = True
def copy_model_files(self, force=False):
    modified = False
    src_path = self.checkpoint_path
    d = None
    try:
        if force or not (self.git_path / "tf_model.h5").exists() or not (
                self.git_path / "pytorch_model.bin").exists():
            d = TemporaryDirectory()
            if self.task in self.QA_TASKS:
                model = QASparseXP.compile_model(src_path, dest_path=d.name)
            elif self.task in self.GLUE_TASKS:
                model = GlueSparseXP.compile_model(src_path, dest_path=d.name)
            elif self.task in self.SUMMARIZATION_TASKS:
                model = SummarizationSparseXP.compile_model(
                    src_path, dest_path=d.name)
            else:
                raise Exception(f"Unknown task {self.task}")

            model = optimize_model(model, "heads")
            model.save_pretrained(d.name)
            src_path = d.name

        if force or not (self.git_path / "tf_model.h5").exists():
            with TemporaryDirectory() as d2:
                if self.task in self.QA_TASKS:
                    QASparseXP.final_fine_tune_bertarize(
                        src_path, d2, remove_head_pruning=True)
                    tf_model = TFAutoModelForQuestionAnswering.from_pretrained(
                        d2, from_pt=True)
                elif self.task in self.GLUE_TASKS:
                    GlueSparseXP.final_fine_tune_bertarize(
                        src_path, d2, remove_head_pruning=True)
                    tf_model = TFAutoModelForSequenceClassification.from_pretrained(
                        d2, from_pt=True)
                elif self.task in self.SUMMARIZATION_TASKS:
                    SummarizationSparseXP.final_fine_tune_bertarize(
                        src_path, d2, remove_head_pruning=True)
                    tf_model = TFAutoModelForSeq2SeqLM.from_pretrained(
                        d2, from_pt=True)
                else:
                    raise Exception(f"Unknown task {self.task}")
                tf_model.save_pretrained(self.git_path)
            modified = True

        if force or not (self.git_path / "pytorch_model.bin").exists():
            if self.task in self.QA_TASKS:
                model = AutoModelForQuestionAnswering.from_pretrained(src_path)
            elif self.task in self.GLUE_TASKS:
                model = AutoModelForSequenceClassification.from_pretrained(
                    src_path)
            elif self.task in self.SUMMARIZATION_TASKS:
                model = AutoModelForSeq2SeqLM.from_pretrained(src_path)
            else:
                raise Exception(f"Unknown task {self.task}")
            model.save_pretrained(self.git_path)
            modified = True

        src_path = Path(src_path)
        to_copy = self.get_copy_list()

        for files, dest in to_copy:
            dest.mkdir(exist_ok=True)
            for file in files:
                if force or not (dest / file).exists():
                    shutil.copyfile(str(src_path / file), str(dest / file))
                    modified = True
    finally:
        if d is not None:
            d.cleanup()

    # Reload the config, this may have been changed by compilation / optimization
    # (pruned_heads, gelu_patch, layer_norm_patch)
    with (self.git_path / "config.json").open() as f:
        self.checkpoint_info["config"] = json.load(f)

    return modified
def run_squad_and_get_results(
    run_name: str,
    fsx_prefix: str,
    pre_layer_norm: bool,
    model_size: str,
    load_from: Union[str, tf.keras.Model],
    load_step: int,
    batch_size: int,
    checkpoint_frequency: Optional[int],
    validate_frequency: Optional[int],
    learning_rate: float,
    warmup_steps: int,
    total_steps: int,
    dataset: str,
    dummy_eval: bool = False,
    config: Optional[PretrainedConfig] = None,
) -> Dict:
    checkpoint_frequency = checkpoint_frequency or 1000000
    validate_frequency = validate_frequency or 1000000

    if isinstance(load_from, tf.keras.Model):
        config = load_from.config
    assert config is not None, "config may not be None"

    # Instantiate QuestionAnswering model
    if isinstance(load_from, TFPreTrainedModel):
        model = load_qa_from_pretrained(model=load_from)
    elif load_from == "scratch":
        model = TFAutoModelForQuestionAnswering.from_config(config)
    elif load_from == "huggingface":
        model = load_qa_from_pretrained(name=f"albert-{model_size}-v2")
    else:
        raise ValueError(
            f"'load_from' is '{load_from}'; must be in ['scratch', 'huggingface', 'amazon']"
        )

    tokenizer = get_tokenizer()

    schedule = LinearWarmupPolyDecaySchedule(
        max_learning_rate=learning_rate,
        end_learning_rate=0,
        warmup_steps=warmup_steps,
        total_steps=total_steps,
    )
    optimizer = tfa.optimizers.AdamW(weight_decay=0.0, learning_rate=schedule)
    optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
        optimizer, loss_scale="dynamic"
    )  # AMP

    model.call = wrap_tf_function_idempotent(model.call)

    if dataset == "squadv1":
        train_filename = "train-v1.1.json"
        val_filename = "dev-v1.1.json"
        processor = SquadV1Processor()
    elif dataset == "squadv2":
        train_filename = "train-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    elif dataset == "debug":
        train_filename = "dev-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    else:
        assert False, "--dataset must be one of ['squadv1', 'squadv2', 'debug']"

    data_dir = f"{fsx_prefix}/squad_data"

    train_dataset = get_dataset(
        tokenizer=tokenizer,
        processor=processor,
        data_dir=data_dir,
        filename=train_filename,
        batch_size=batch_size,
        shard=True,
        shuffle=True,
        repeat=True,
        drop_remainder=True,
    )

    if hvd.rank() == 0:
        print("Starting finetuning")
        pbar = tqdm.tqdm(total_steps)
        summary_writer = None  # Only create a writer if we make it through a successful step
        val_dataset = get_dataset(
            tokenizer=tokenizer,
            processor=processor,
            data_dir=data_dir,
            filename=val_filename,
            batch_size=batch_size,
            shard=False,
            shuffle=True,
            drop_remainder=False,
        )

    # Need to re-wrap every time this function is called
    # Wrapping train_step gives an error with optimizer initialization on the second pass
    # of run_squad_and_get_results().
    # Bug report at https://github.com/tensorflow/tensorflow/issues/38875
    # Discussion at https://github.com/tensorflow/tensorflow/issues/27120
    wrapped_train_step = tf.function(train_step)
    for step, batch in enumerate(train_dataset):
        learning_rate = schedule(step=tf.constant(step, dtype=tf.float32))
        loss, acc, exact_match, f1, precision, recall = wrapped_train_step(
            model=model, optimizer=optimizer, batch=batch
        )

        # Broadcast model after the first step so parameters and optimizer are initialized
        if step == 0:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)

        is_final_step = step >= total_steps - 1
        if hvd.rank() == 0:
            do_checkpoint = (step % checkpoint_frequency == 0) or is_final_step
            do_validate = (step % validate_frequency == 0) or is_final_step

            pbar.update(1)
            description = f"Loss: {loss:.3f}, Acc: {acc:.3f}, EM: {exact_match:.3f}, F1: {f1:.3f}"
            pbar.set_description(description)

            if do_validate:
                print("Running validation")
                (
                    val_loss,
                    val_acc,
                    val_exact_match,
                    val_f1,
                    val_precision,
                    val_recall,
                ) = run_validation(model=model, val_dataset=val_dataset)
                description = (
                    f"Step {step} validation - Loss: {val_loss:.3f}, Acc: {val_acc:.3f}, "
                    f"EM: {val_exact_match:.3f}, F1: {val_f1:.3f}"
                )
                print(description)
                print("Running evaluation")
                if dummy_eval:
                    results = {
                        "exact": 0.8169797018445212,
                        "f1": 4.4469722448269335,
                        "total": 11873,
                        "HasAns_exact": 0.15182186234817813,
                        "HasAns_f1": 7.422216845956518,
                        "HasAns_total": 5928,
                        "NoAns_exact": 1.4802354920100924,
                        "NoAns_f1": 1.4802354920100924,
                        "NoAns_total": 5945,
                        "best_exact": 50.07159100480081,
                        "best_exact_thresh": 0.0,
                        "best_f1": 50.0772059855695,
                        "best_f1_thresh": 0.0,
                    }
                else:
                    results: Dict = get_evaluation_metrics(
                        model=model,
                        data_dir=data_dir,
                        filename=val_filename,
                        batch_size=32,
                    )
                print_eval_metrics(results=results, step=step)

            if do_checkpoint:
                checkpoint_path = (
                    f"{fsx_prefix}/checkpoints/albert-squad/{run_name}-step{step}.ckpt"
                )
                print(f"Saving checkpoint at {checkpoint_path}")
                model.save_weights(checkpoint_path)

            if summary_writer is None:
                summary_writer = tf.summary.create_file_writer(
                    f"{fsx_prefix}/logs/albert-squad/{run_name}"
                )
            with summary_writer.as_default():
                tf.summary.scalar("learning_rate", learning_rate, step=step)
                tf.summary.scalar("train_loss", loss, step=step)
                tf.summary.scalar("train_acc", acc, step=step)
                tf.summary.scalar("train_exact", exact_match, step=step)
                tf.summary.scalar("train_f1", f1, step=step)
                tf.summary.scalar("train_precision", precision, step=step)
                tf.summary.scalar("train_recall", recall, step=step)
                if do_validate:
                    tf.summary.scalar("val_loss", val_loss, step=step)
                    tf.summary.scalar("val_acc", val_acc, step=step)
                    tf.summary.scalar("val_exact", val_exact_match, step=step)
                    tf.summary.scalar("val_f1", val_f1, step=step)
                    tf.summary.scalar("val_precision", val_precision, step=step)
                    tf.summary.scalar("val_recall", val_recall, step=step)
                    # And the eval metrics
                    tensorboard_eval_metrics(
                        summary_writer=summary_writer, results=results, step=step
                    )

        if is_final_step:
            break

    # Can we return a value only on a single rank?
    if hvd.rank() == 0:
        pbar.close()
        print(f"Finished finetuning, job name {run_name}")
        return results
        pbar.close()
        return results


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--checkpoint", type=str, default=None)
    parser.add_argument("--pre_layer_norm", type=str, choices=["true"])
    args = parser.parse_args()

    # Load finetuned model from checkpoint
    config = AutoConfig.from_pretrained("albert-base-v2")
    config.pre_layer_norm = args.pre_layer_norm == "true"
    model = TFAutoModelForQuestionAnswering.from_config(config)
    if args.checkpoint is not None:
        # Restore the finetuned weights; the original script parsed --checkpoint
        # but never used it, leaving the model randomly initialized
        model.load_weights(args.checkpoint)

    # XLA, AMP, tf.function
    tf.config.optimizer.set_jit(True)
    tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
    model.call = tf.function(model.call)

    # Get validation dataset
    data_dir = "/fsx/squad_data"
    train_filename = "train-v2.0.json"
    val_filename = "dev-v2.0.json"

    results = get_evaluation_metrics(
        model=model, data_dir=data_dir, filename=val_filename, batch_size=args.batch_size
    )
    print(dict(results))
def __init__(self, log_path: str, base_model: str) -> None:
    self._wiki = MediaWiki()
    self._entity_recognizer = TFLiteNLU(log_path)
    self._tokenizer = AutoTokenizer.from_pretrained(base_model)
    self._answerer = TFAutoModelForQuestionAnswering.from_pretrained(
        base_model)
def main():
    # region Argument parsing
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TFTrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    output_dir = Path(training_args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    # endregion

    # region Checkpoints
    checkpoint = None
    if len(os.listdir(training_args.output_dir)) > 0 and not training_args.overwrite_output_dir:
        if (output_dir / CONFIG_NAME).is_file() and (output_dir / TF2_WEIGHTS_NAME).is_file():
            checkpoint = output_dir
            logger.info(
                f"Checkpoint detected, resuming training from checkpoint in {training_args.output_dir}. To avoid this"
                " behavior, change the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )
        else:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to continue regardless."
            )
    # endregion

    # region Logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)

    # Set the verbosity to info of the Transformers logger (on main process only):
    if training_args.should_log:
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")
    # endregion

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # region Load Data
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
            extension = data_args.train_file.split(".")[-1]
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
            extension = data_args.validation_file.split(".")[-1]
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
            extension = data_args.test_file.split(".")[-1]
        datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    # endregion

    # region Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=True,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # endregion

    # region Tokenizer check: this script requires a fast tokenizer.
    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. Checkout the big table of models "
            "at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this "
            "requirement"
        )
    # endregion

    # region Preprocessing the datasets
    # Preprocessing is slightly different for training and evaluation.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    elif training_args.do_eval:
        column_names = datasets["validation"].column_names
    else:
        column_names = datasets["test"].column_names
    question_column_name = "question" if "question" in column_names else column_names[0]
    context_column_name = "context" if "context" in column_names else column_names[1]
    answer_column_name = "answers" if "answers" in column_names else column_names[2]

    # Padding side determines if we do (question|context) or (context|question).
    pad_on_right = tokenizer.padding_side == "right"

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    # Training preprocessing
    def prepare_train_features(examples):
        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
        # truncation of the context fail (the tokenized question will take a lot of space). So we remove that
        # left whitespace
        examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]

        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride.
        # This results in one example possibly giving several features when a context is long,
        # each of those features having a context that overlaps a bit the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length" if data_args.pad_to_max_length else False,
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        # The offset mappings will give us a map from token to character position in the original context. This will
        # help us compute the start_positions and end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offsets in enumerate(offset_mapping):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(tokenizer.cls_token_id)

            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples.sequence_ids(i)

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            answers = examples[answer_column_name][sample_index]
            # If no answers are given, set the cls_index as answer.
            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Start/end character index of the answer in the text.
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                    token_end_index -= 1

                # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
                if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                else:
                    # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                    # Note: we could go after the last offset if the answer is the last word (edge case).
                    while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(token_end_index + 1)

        return tokenized_examples

    processed_datasets = dict()
    if training_args.do_train:
        if "train" not in datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = datasets["train"]
        if data_args.max_train_samples is not None:
            # We will select a sample from the whole data if the argument is specified
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        # Create train feature from dataset
        train_dataset = train_dataset.map(
            prepare_train_features,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )
        if data_args.max_train_samples is not None:
            # The number of samples might increase during feature creation, so we select only the specified max samples
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        processed_datasets["train"] = train_dataset

    # Validation preprocessing
    def prepare_validation_features(examples):
        # Some of the questions have lots of whitespace on the left, which is not useful and will make the
        # truncation of the context fail (the tokenized question will take a lot of space). So we remove that
        # left whitespace
        examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]

        # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
        # in one example possibly giving several features when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
        tokenized_examples = tokenizer(
            examples[question_column_name if pad_on_right else context_column_name],
            examples[context_column_name if pad_on_right else question_column_name],
            truncation="only_second" if pad_on_right else "only_first",
            max_length=max_seq_length,
            stride=data_args.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length" if data_args.pad_to_max_length else False,
        )

        # Since one example might give us several features if it has a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
        # corresponding example_id and we will store the offset mappings.
        tokenized_examples["example_id"] = []

        for i in range(len(tokenized_examples["input_ids"])):
            # Grab the sequence corresponding to that example (to know what is the context and what is the question).
            sequence_ids = tokenized_examples.sequence_ids(i)
            context_index = 1 if pad_on_right else 0

            # One example can give several spans, this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(examples["id"][sample_index])

            # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
            # position is part of the context or not.
tokenized_examples["offset_mapping"][i] = [ (o if sequence_ids[k] == context_index else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples if training_args.do_eval: if "validation" not in datasets: raise ValueError("--do_eval requires a validation dataset") eval_examples = datasets["validation"] if data_args.max_eval_samples is not None: # We will select sample from whole data eval_examples = eval_examples.select(range(data_args.max_eval_samples)) # Validation Feature Creation eval_dataset = eval_examples.map( prepare_validation_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if data_args.max_eval_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again eval_dataset = eval_dataset.select(range(data_args.max_eval_samples)) processed_datasets["validation"] = eval_dataset if training_args.do_predict: if "test" not in datasets: raise ValueError("--do_predict requires a test dataset") predict_examples = datasets["test"] if data_args.max_predict_samples is not None: # We will select sample from whole data predict_examples = predict_examples.select(range(data_args.max_predict_samples)) # Predict Feature Creation predict_dataset = predict_examples.map( prepare_validation_features, batched=True, num_proc=data_args.preprocessing_num_workers, remove_columns=column_names, load_from_cache_file=not data_args.overwrite_cache, ) if data_args.max_predict_samples is not None: # During Feature creation dataset samples might increase, we will select required samples again predict_dataset = predict_dataset.select(range(data_args.max_predict_samples)) processed_datasets["test"] = predict_dataset # endregion # region Metrics and Post-processing: def post_processing_function(examples, features, predictions, stage="eval"): # Post-processing: we match the start logits and end logits to answers in the original context. predictions = postprocess_qa_predictions( examples=examples, features=features, predictions=predictions, version_2_with_negative=data_args.version_2_with_negative, n_best_size=data_args.n_best_size, max_answer_length=data_args.max_answer_length, null_score_diff_threshold=data_args.null_score_diff_threshold, output_dir=training_args.output_dir, prefix=stage, ) # Format the result to the format the metric expects. 
        if data_args.version_2_with_negative:
            formatted_predictions = [
                {"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
            ]
        else:
            formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
        references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
        return EvalPrediction(predictions=formatted_predictions, label_ids=references)

    metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad")

    def compute_metrics(p: EvalPrediction):
        return metric.compute(predictions=p.predictions, references=p.label_ids)
    # endregion

    with training_args.strategy.scope():
        # region Load model
        if checkpoint is None:
            model_path = model_args.model_name_or_path
        else:
            model_path = checkpoint
        model = TFAutoModelForQuestionAnswering.from_pretrained(
            model_path,
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
        optimizer = tf.keras.optimizers.Adam(
            learning_rate=training_args.learning_rate,
            beta_1=training_args.adam_beta1,
            beta_2=training_args.adam_beta2,
            epsilon=training_args.adam_epsilon,
            clipnorm=training_args.max_grad_norm,
        )

        def dummy_loss(y_true, y_pred):
            return tf.reduce_mean(y_pred)

        losses = {"loss": dummy_loss}
        model.compile(optimizer=optimizer, loss=losses)
        # endregion

        # region Training
        if training_args.do_train:
            # Make a tf.data.Dataset for this
            if isinstance(training_args.strategy, tf.distribute.TPUStrategy) or data_args.pad_to_max_length:
                logger.info("Padding all batches to max length because argument was set or we're on TPU.")
                dataset_mode = "constant_batch"
            else:
                dataset_mode = "variable_batch"
            training_dataset = convert_dataset_for_tensorflow(
                processed_datasets["train"],
                batch_size=training_args.per_device_train_batch_size,
                dataset_mode=dataset_mode,
                drop_remainder=True,
                shuffle=True,
            )
            model.fit(training_dataset, epochs=int(training_args.num_train_epochs))
        # endregion

        # region Evaluation
        if training_args.do_eval:
            logger.info("*** Evaluation ***")
            eval_inputs = {
                "input_ids": tf.ragged.constant(processed_datasets["validation"]["input_ids"]).to_tensor(),
                "attention_mask": tf.ragged.constant(processed_datasets["validation"]["attention_mask"]).to_tensor(),
            }
            eval_predictions = model.predict(eval_inputs)

            post_processed_eval = post_processing_function(
                datasets["validation"],
                processed_datasets["validation"],
                (eval_predictions.start_logits, eval_predictions.end_logits),
            )
            metrics = compute_metrics(post_processed_eval)
            logging.info("Evaluation metrics:")
            for metric, value in metrics.items():
                logging.info(f"{metric}: {value:.3f}")
        # endregion

        # region Prediction
        if training_args.do_predict:
            logger.info("*** Predict ***")
            predict_inputs = {
                "input_ids": tf.ragged.constant(processed_datasets["test"]["input_ids"]).to_tensor(),
                "attention_mask": tf.ragged.constant(processed_datasets["test"]["attention_mask"]).to_tensor(),
            }
            test_predictions = model.predict(predict_inputs)
            post_processed_test = post_processing_function(
                datasets["test"],
                processed_datasets["test"],
                (test_predictions.start_logits, test_predictions.end_logits),
            )
            metrics = compute_metrics(post_processed_test)
            logging.info("Test metrics:")
            for metric, value in metrics.items():
                logging.info(f"{metric}: {value:.3f}")
        # endregion

    if training_args.push_to_hub:
        model.push_to_hub()
# -*- coding: utf-8 -*-
import pandas as pd

data_bert_df = pd.read_csv(r"data_bert.csv")

from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
import tensorflow as tf

tokenizer = AutoTokenizer.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad")
model = TFAutoModelForQuestionAnswering.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad", return_dict=True)


def tester(text, question, answer):
    questions = [question]
    for question in questions:
        inputs = tokenizer(question, text, add_special_tokens=True,
                           return_tensors="tf")
        input_ids = inputs["input_ids"].numpy()[0]
        text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
        answer_scores = model(inputs)
        # Get the most likely beginning of answer with the argmax of the score
        answer_start = tf.argmax(answer_scores["start_logits"], axis=1).numpy()[0]
        # Get the most likely end of answer with the argmax of the score
        answer_end = (tf.argmax(answer_scores["end_logits"], axis=1) + 1).numpy()[0]
        # The original snippet was truncated mid-statement here; the completion
        # follows the identical decoding pattern used throughout this collection,
        # and the return is an assumption
        answer = tokenizer.convert_tokens_to_string(
            tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
        return answer
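# Usage sketch: run tester() over the CSV rows. The column names ("text",
# "question", "answer") are assumptions about data_bert.csv, not confirmed
# by the original source.
for _, row in data_bert_df.iterrows():
    print(tester(row["text"], row["question"], row["answer"]))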
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir)
            and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(
        "n_replicas: %s, distributed training: %s, 16-bits training: %s",
        training_args.n_replicas,
        bool(training_args.n_replicas > 1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Prepare Question-Answering task
    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )

    with training_args.strategy.scope():
        model = TFAutoModelForQuestionAnswering.from_pretrained(
            model_args.model_name_or_path,
            from_pt=bool(".bin" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

    # Get datasets
    if data_args.use_tfds:
        if data_args.version_2_with_negative:
            logger.warning(
                "tensorflow_datasets does not handle version 2 of SQuAD. Switching to version 1 automatically."
            )

        try:
            import tensorflow_datasets as tfds
        except ImportError:
            raise ImportError(
                "If no data_dir is specified, tensorflow_datasets needs to be installed."
            )

        tfds_examples = tfds.load("squad", data_dir=data_args.data_dir)
        train_examples = (SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=False) if training_args.do_train else None)
        eval_examples = (SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=True) if training_args.do_eval else None)
    else:
        processor = SquadV2Processor() if data_args.version_2_with_negative else SquadV1Processor()
        train_examples = processor.get_train_examples(
            data_args.data_dir) if training_args.do_train else None
        eval_examples = processor.get_dev_examples(
            data_args.data_dir) if training_args.do_eval else None

    train_dataset = (squad_convert_examples_to_features(
        examples=train_examples,
        tokenizer=tokenizer,
        max_seq_length=data_args.max_seq_length,
        doc_stride=data_args.doc_stride,
        max_query_length=data_args.max_query_length,
        is_training=True,
        return_dataset="tf",
    ) if training_args.do_train else None)
    if train_dataset is not None:
        # Guard the cardinality assertion: train_dataset is None when --do_train is not set
        train_dataset = train_dataset.apply(
            tf.data.experimental.assert_cardinality(len(train_examples)))

    eval_dataset = (squad_convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=data_args.max_seq_length,
        doc_stride=data_args.doc_stride,
        max_query_length=data_args.max_query_length,
        is_training=False,
        return_dataset="tf",
    ) if training_args.do_eval else None)
    if eval_dataset is not None:
        eval_dataset = eval_dataset.apply(
            tf.data.experimental.assert_cardinality(len(eval_examples)))

    # Initialize our Trainer
    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments, LoggingArguments, PathArguments)
    )
    model_args, data_args, train_args, log_args, path_args = parser.parse_args_into_dataclasses()
    tf.random.set_seed(train_args.seed)
    tf.autograph.set_verbosity(0)

    level = logging.INFO
    log_format = "%(asctime)-15s %(name)-12s: %(levelname)-8s %(message)s"
    handlers = [
        TqdmLoggingHandler(),
    ]
    logging.basicConfig(level=level, format=log_format, handlers=handlers)

    # Horovod init
    hvd.init()
    gpus = tf.config.list_physical_devices("GPU")
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    if gpus:
        tf.config.set_visible_devices(gpus[hvd.local_rank()], "GPU")

    # XLA, AMP, AutoGraph
    parse_bool = lambda arg: arg == "true"
    tf.config.optimizer.set_jit(not parse_bool(train_args.skip_xla))
    tf.config.experimental_run_functions_eagerly(parse_bool(train_args.eager))

    if hvd.rank() == 0:
        # Run name should only be used on one process to avoid race conditions
        current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        platform = "eks" if path_args.filesystem_prefix == "/fsx" else "sm"
        if log_args.run_name is None:
            run_name = f"{current_time}-{platform}-{model_args.model_type}-{model_args.model_size}-{data_args.squad_version}-{model_args.load_from}-{hvd.size()}gpus-{train_args.name}"
        else:
            run_name = log_args.run_name
    else:
        # We only use run_name on rank 0, but need all ranks to pass a value in function args
        run_name = None

    if model_args.load_from == "huggingface":
        logger.info(f"Loading weights from Huggingface {model_args.model_desc}")
        model = TFAutoModelForQuestionAnswering.from_pretrained(model_args.model_desc)
    else:
        model = create_model(model_class=TFAutoModelForQuestionAnswering, model_args=model_args)
    model.call = rewrap_tf_function(model.call)
    tokenizer = create_tokenizer(model_args.model_type)

    loaded_optimizer_weights = None
    if model_args.load_from == "checkpoint":
        if hvd.rank() == 0:
            checkpoint_path = os.path.join(path_args.filesystem_prefix, model_args.checkpoint_path)
            logger.info(f"Loading weights from {checkpoint_path}.ckpt")
            model.load_weights(f"{checkpoint_path}.ckpt").expect_partial()

    results = run_squad_and_get_results(
        model=model,
        tokenizer=tokenizer,
        run_name=run_name,
        filesystem_prefix=path_args.filesystem_prefix,
        per_gpu_batch_size=train_args.per_gpu_batch_size,
        checkpoint_frequency=log_args.checkpoint_frequency,
        validate_frequency=log_args.validation_frequency,
        evaluate_frequency=log_args.evaluate_frequency,
        learning_rate=train_args.learning_rate,
        warmup_steps=train_args.warmup_steps,
        total_steps=train_args.total_steps,
        dataset=data_args.squad_version,
    )
    if hvd.rank() == 0:
        logger.info(results)