import torch
from transformers import LongformerForQuestionAnswering, LongformerModel
# LightningModel is assumed to be defined elsewhere in the same conversion script.


def convert_longformer_qa_checkpoint_to_pytorch(
    longformer_model: str,
    longformer_question_answering_ckpt_path: str,
    pytorch_dump_folder_path: str,
):
    # load longformer model from model identifier
    longformer = LongformerModel.from_pretrained(longformer_model)
    lightning_model = LightningModel(longformer)

    ckpt = torch.load(longformer_question_answering_ckpt_path, map_location=torch.device("cpu"))
    lightning_model.load_state_dict(ckpt["state_dict"])

    # init longformer question answering model
    longformer_for_qa = LongformerForQuestionAnswering.from_pretrained(longformer_model)

    # transfer weights
    longformer_for_qa.longformer.load_state_dict(lightning_model.model.state_dict())
    longformer_for_qa.qa_outputs.load_state_dict(lightning_model.qa_outputs.state_dict())
    longformer_for_qa.eval()

    # save model
    longformer_for_qa.save_pretrained(pytorch_dump_folder_path)

    print(f"Conversion successful. Model saved under {pytorch_dump_folder_path}")
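# Hedged usage sketch: the checkpoint and output paths below are hypothetical
# placeholders; the function expects a PyTorch Lightning checkpoint whose
# "state_dict" matches the LightningModel wrapper used above.
convert_longformer_qa_checkpoint_to_pytorch(
    longformer_model="allenai/longformer-base-4096",
    longformer_question_answering_ckpt_path="path/to/lightning_ckpt.ckpt",
    pytorch_dump_folder_path="converted_longformer_qa",
)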
def load(self, k):
    while self.m.get(k, None) == -1:
        time.sleep(1)  # loading, wait till ready
    if self.m.get(k, None) is not None:
        return self.m[k]  # it's already loaded
    self.m[k] = -1  # tell others it's loading, wait
    m = None
    if k == 'sentence-encode':
        m = SentenceTransformer('roberta-base-nli-stsb-mean-tokens')
        # word_embedding_model = models.Transformer('allenai/longformer-base-4096')
        # pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
        # m = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    elif k == 'sentiment-analysis':
        tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-emotion")
        model = AutoModelWithLMHead.from_pretrained("mrm8488/t5-base-finetuned-emotion").to("cuda")
        # TODO: are we sure it's not ForSequenceClassification? https://huggingface.co/mrm8488/t5-base-finetuned-emotion
        m = (tokenizer, model, 512)
    elif k == 'summarization':
        # Not using pipelines because they can't handle >max_tokens
        # https://github.com/huggingface/transformers/issues/4501
        # https://github.com/huggingface/transformers/issues/4224
        max_tokens = 1024  # 4096
        tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
        model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn').to("cuda")
        # model = EncoderDecoderModel.from_pretrained("patrickvonplaten/longformer2roberta-cnn_dailymail-fp16").to("cuda")
        # tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
        m = (tokenizer, model, max_tokens)
    elif k == 'question-answering':
        tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
        model = LongformerForQuestionAnswering.from_pretrained(
            "allenai/longformer-large-4096-finetuned-triviaqa", return_dict=True).to("cuda")
        # tokenizer = AutoTokenizer.from_pretrained("mrm8488/longformer-base-4096-finetuned-squadv2")
        # model = AutoModelForQuestionAnswering.from_pretrained("mrm8488/longformer-base-4096-finetuned-squadv2", return_dict=True).to("cuda")
        m = (tokenizer, model, 4096)
    self.m[k] = m
    return m
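# Hedged usage sketch; ModelCache is a hypothetical name for the class that
# owns load() and its `m` dict, and the question/context strings are illustrative.
cache = ModelCache()
tokenizer, model, max_len = cache.load('question-answering')
enc = tokenizer("Who was Jim Henson?", "Jim Henson was a nice puppet",
                return_tensors="pt", truncation=True, max_length=max_len).to("cuda")
out = model(**enc)  # return_dict=True above, so out.start_logits / out.end_logits are available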
def create_model(self, transformer="longformer"):
    if transformer == "distilbert":
        from transformers import DistilBertForQuestionAnswering
        self.model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
    elif transformer == "bert":
        from transformers import BertForQuestionAnswering
        self.model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
    elif transformer == "roberta":
        from transformers import RobertaForQuestionAnswering
        self.model = RobertaForQuestionAnswering.from_pretrained("roberta-base")
    elif transformer == "roberta_squad":
        from transformers import RobertaForQuestionAnswering
        self.model = RobertaForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
    elif transformer == "longformer":
        from transformers import LongformerForQuestionAnswering
        self.model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-base-4096")
    elif transformer == "bart":
        from transformers import BartForQuestionAnswering
        self.model = BartForQuestionAnswering.from_pretrained("facebook/bart-base")
    elif transformer == "electra":
        from transformers import ElectraForQuestionAnswering
        self.model = ElectraForQuestionAnswering.from_pretrained("google/electra-small-discriminator")
    else:
        print("The model you chose is not available in this version. "
              "You can try to manually change the code or manually overwrite the variable self.model.")
        print("The available choices are 'distilbert', 'bert', 'roberta', 'roberta_squad', "
              "'longformer', 'bart', 'electra'.")
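# Hedged usage sketch; QAModel is a hypothetical stand-in for whatever class
# defines create_model() above.
qa = QAModel()
qa.create_model("longformer")      # loads allenai/longformer-base-4096
qa.create_model("roberta_squad")   # switches to deepset/roberta-base-squad2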
def main():
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath('args.json'))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir)
            and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    tokenizer = LongformerTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = LongformerForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    # train_dataset = torch.load(data_args.train_file_path)
    # eval_dataset = torch.load(data_args.valid_file_path)
    train_examples = DeepThinkDataset(data_args.input_train_file)
    train_dataset = DTDataset(tokenizer, train_examples, data_args.max_seq_length)
    eval_examples = DeepThinkDataset(data_args.input_eval_file)
    eval_dataset = DTDataset(tokenizer, eval_examples, data_args.max_seq_length)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=DummyDataCollator(),
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path
            if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(eval_output.keys()):
                logger.info("  %s = %s", key, str(eval_output[key]))
                writer.write("%s = %s\n" % (key, str(eval_output[key])))

        results.update(eval_output)

    return results
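# Hedged sketch of an args.json that parse_json_file() above could consume.
# Only fields actually referenced in main() are included; the concrete values
# and file paths are illustrative placeholders, not from the original project.
import json

example_args = {
    "model_name_or_path": "allenai/longformer-base-4096",
    "input_train_file": "data/train.jsonl",   # hypothetical path
    "input_eval_file": "data/eval.jsonl",     # hypothetical path
    "max_seq_length": 4096,
    "output_dir": "output/longformer-qa",     # hypothetical path
    "do_train": True,
    "do_eval": True,
    "overwrite_output_dir": True,
    "seed": 42,
}
with open("args.json", "w") as f:
    json.dump(example_args, f, indent=2)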
        # Tail of a SQuAD-style evaluate() loop: accumulate EM and F1 over predictions
        exact_match += metric_max_over_ground_truths(exact_match_score,
                                                     prediction, ground_truths)
        f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths)

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}


import torch
from transformers import LongformerTokenizerFast, LongformerForQuestionAnswering
from tqdm.auto import tqdm

tokenizer = LongformerTokenizerFast.from_pretrained('models')
model = LongformerForQuestionAnswering.from_pretrained('models')
model = model.cuda()
model.eval()

valid_dataset = torch.load('./data/valid_data.pt')
dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=16)

answers = []
with torch.no_grad():
    for batch in tqdm(dataloader):
        start_scores, end_scores = model(
            input_ids=batch['input_ids'].cuda(),
            attention_mask=batch['attention_mask'].cuda())
        for i in range(start_scores.shape[0]):
            all_tokens = tokenizer.convert_ids_to_tokens(batch['input_ids'][i])
            # take the span between the argmax start and end scores as the answer
            answer = ' '.join(
                all_tokens[torch.argmax(start_scores[i]): torch.argmax(end_scores[i]) + 1])
            answers.append(answer)
# y = torch.rand(100)
#
# print(x.shape[0], y.shape[0])
#
# print(x)
# print(y)
#
# z = answer_span_evaluation_in_sentence(start_scores=x, end_scores=y, max_ans_decode_len=20, debug=True)
# print(z)

from transformers import LongformerTokenizer, LongformerForQuestionAnswering
import torch

tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")
model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-large-4096-finetuned-triviaqa")

question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
encoding = tokenizer(question, text, return_tensors="pt")
input_ids = encoding["input_ids"]

# default is local attention everywhere
# the forward method will automatically set global attention on question tokens
attention_mask = encoding["attention_mask"]

outputs = model(input_ids, attention_mask=attention_mask)
print(outputs)
start_logits = outputs[0]
end_logits = outputs[1]
all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
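# Hedged continuation sketch: decode the predicted span from the logits above.
# This follows the common Longformer QA decoding pattern; it is not part of the
# original snippet, and the printed answer text may differ.
answer_tokens = all_tokens[torch.argmax(start_logits): torch.argmax(end_logits) + 1]
answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens))
print(answer)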
train_dataset = EncodingDataset(train_encodings)
valid_dataset = EncodingDataset(valid_encodings)

train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=args.batch_size, shuffle=True)


#%% Model & Optimizer & Scheduler
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

args.wgts_dir = '/media/mynewdrive/rob/data/pre_wgts/longformer_base'
model = LongformerForQuestionAnswering.from_pretrained(args.wgts_dir)
# model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-base-4096")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

# Slanted triangular learning rate scheduler
total_steps = len(train_loader) * args.num_epochs // args.accum_step
warm_steps = int(total_steps * args.warm_frac)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warm_steps,
                                            num_training_steps=total_steps)


#%% Train the model
if not os.path.exists(args.exp_dir):
    os.makedirs(args.exp_dir)
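# Hedged sketch of the training loop the "#%% Train the model" cell would run,
# using the loader, optimizer and scheduler defined above. It assumes each batch
# from EncodingDataset carries input_ids, attention_mask, start_positions and
# end_positions tensors; the original loop may differ.
for epoch in range(args.num_epochs):
    model.train()
    optimizer.zero_grad()
    for step, batch in enumerate(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)                # loss computed from start/end positions
        loss = outputs[0] / args.accum_step     # scale loss for gradient accumulation
        loss.backward()
        if (step + 1) % args.accum_step == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()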