def convert_longformer_qa_checkpoint_to_pytorch(
        longformer_model: str, longformer_question_answering_ckpt_path: str,
        pytorch_dump_folder_path: str):

    # load longformer model from model identifier
    longformer = LongformerModel.from_pretrained(longformer_model)
    lightning_model = LightningModel(longformer)

    ckpt = torch.load(longformer_question_answering_ckpt_path,
                      map_location=torch.device("cpu"))
    lightning_model.load_state_dict(ckpt["state_dict"])

    # init longformer question answering model
    longformer_for_qa = LongformerForQuestionAnswering.from_pretrained(
        longformer_model)

    # transfer weights
    longformer_for_qa.longformer.load_state_dict(
        lightning_model.model.state_dict())
    longformer_for_qa.qa_outputs.load_state_dict(
        lightning_model.qa_outputs.state_dict())
    longformer_for_qa.eval()

    # save model
    longformer_for_qa.save_pretrained(pytorch_dump_folder_path)

    print(
        f"Conversion successful. Model saved under {pytorch_dump_folder_path}")
Example No. 2
 def load(self, k):
     while self.m.get(k, None) == -1:
         time.sleep(1)  # still loading elsewhere; wait till ready
     if self.m.get(k, None) is not None:
         return self.m[k]  # it's already loaded
     self.m[k] = -1  # mark as loading so other callers wait
     m = None
     if k == 'sentence-encode':
         m = SentenceTransformer('roberta-base-nli-stsb-mean-tokens')
         # word_embedding_model = models.Transformer('allenai/longformer-base-4096')
         # pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
         # m = SentenceTransformer(modules=[word_embedding_model, pooling_model])
     elif k == 'sentiment-analysis':
         tokenizer = AutoTokenizer.from_pretrained(
             "mrm8488/t5-base-finetuned-emotion")
         model = AutoModelWithLMHead.from_pretrained(
             "mrm8488/t5-base-finetuned-emotion").to("cuda")
         # TODO we sure it's not ForSequenceClassification? https://huggingface.co/mrm8488/t5-base-finetuned-emotion
         m = (tokenizer, model, 512)
     elif k == 'summarization':
         # Not using pipelines because they can't handle inputs longer than max_tokens
         # https://github.com/huggingface/transformers/issues/4501
         # https://github.com/huggingface/transformers/issues/4224
         max_tokens = 1024  # 4096
         tokenizer = BartTokenizer.from_pretrained(
             'facebook/bart-large-cnn')
         model = BartForConditionalGeneration.from_pretrained(
             'facebook/bart-large-cnn').to("cuda")
         # model = EncoderDecoderModel.from_pretrained("patrickvonplaten/longformer2roberta-cnn_dailymail-fp16").to("cuda")
         # tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
         m = (tokenizer, model, max_tokens)
     elif k == 'question-answering':
         tokenizer = LongformerTokenizer.from_pretrained(
             "allenai/longformer-large-4096-finetuned-triviaqa")
         model = LongformerForQuestionAnswering.from_pretrained(
             "allenai/longformer-large-4096-finetuned-triviaqa",
             return_dict=True).to("cuda")
         # tokenizer = AutoTokenizer.from_pretrained("mrm8488/longformer-base-4096-finetuned-squadv2")
         # model = AutoModelForQuestionAnswering.from_pretrained("mrm8488/longformer-base-4096-finetuned-squadv2", return_dict=True).to("cuda")
         m = (tokenizer, model, 4096)
     self.m[k] = m
     return m
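
# A hedged usage sketch for this lazy-loading cache. The enclosing class (called
# ModelLoader here) and its constructor with self.m = {} are assumptions; only the
# load() method above comes from the original excerpt.
import torch

loader = ModelLoader()  # assumed wrapper exposing the load() method above
tokenizer, model, max_tokens = loader.load('question-answering')

question = "Who wrote Hamlet?"
context = "Hamlet is a tragedy written by William Shakespeare."
encoding = tokenizer(question, context, return_tensors="pt").to("cuda")
with torch.no_grad():
    outputs = model(**encoding)  # loaded with return_dict=True, so logits are attributes
start = torch.argmax(outputs.start_logits, dim=1)[0].item()
end = torch.argmax(outputs.end_logits, dim=1)[0].item()
answer_ids = encoding["input_ids"][0, start:end + 1]
print(tokenizer.decode(answer_ids, skip_special_tokens=True))
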
Example No. 3
 def create_and_check_for_question_answering(
     self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
 ):
     model = LongformerForQuestionAnswering(config=config)
     model.to(torch_device)
     model.eval()
     result = model(
         input_ids,
         attention_mask=input_mask,
         global_attention_mask=input_mask,
         token_type_ids=token_type_ids,
         start_positions=sequence_labels,
         end_positions=sequence_labels,
     )
     self.parent.assertEqual(result.start_logits.shape, (self.batch_size, self.seq_length))
     self.parent.assertEqual(result.end_logits.shape, (self.batch_size, self.seq_length))
Example No. 4
 def create_model(self, transformer="longformer"):
     if transformer == "distilbert":
         from transformers import DistilBertForQuestionAnswering
         self.model = DistilBertForQuestionAnswering.from_pretrained(
             "distilbert-base-uncased")
     elif transformer == "bert":
         from transformers import BertForQuestionAnswering
         self.model = BertForQuestionAnswering.from_pretrained(
             "bert-base-uncased")
     elif transformer == "roberta":
         from transformers import RobertaForQuestionAnswering
         self.model = RobertaForQuestionAnswering.from_pretrained(
             "roberta-base")
     elif transformer == "roberta_squad":
         from transformers import RobertaForQuestionAnswering
         self.model = RobertaForQuestionAnswering.from_pretrained(
             "deepset/roberta-base-squad2")
     elif transformer == "longformer":
         from transformers import LongformerForQuestionAnswering
         self.model = LongformerForQuestionAnswering.from_pretrained(
             "allenai/longformer-base-4096")
     elif transformer == "bart":
         from transformers import BartForQuestionAnswering
         self.model = BartForQuestionAnswering.from_pretrained(
             "facebook/bart-base")
     elif transformer == "electra":
         from transformers import ElectraForQuestionAnswering
         self.model = ElectraForQuestionAnswering.from_pretrained(
             "google/electra-small-discriminator")
     else:
         print(
             "The model you chose is not available in this version. You can modify the code or overwrite the variable self.model manually."
         )
         print(
             "The available choices are 'distilbert', 'bert', 'roberta', 'roberta_squad', 'longformer', 'bart' and 'electra'."
         )
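
# A usage sketch for this factory method, assuming it belongs to a wrapper class
# (named QAPipeline here only for illustration) that exposes self.model; the
# tokenizer choice below is an assumption matching the "longformer" branch.
import torch
from transformers import LongformerTokenizerFast

qa = QAPipeline()  # assumed wrapper exposing create_model() and self.model
qa.create_model(transformer="longformer")

tokenizer = LongformerTokenizerFast.from_pretrained("allenai/longformer-base-4096")
inputs = tokenizer("Who founded the lab?", "The lab was founded by Ada Lovelace.",
                   return_tensors="pt")
with torch.no_grad():
    outputs = qa.model(**inputs)
print(outputs.start_logits.shape, outputs.end_logits.shape)
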
Example No. 5
 def create_and_check_longformer_for_question_answering(
         self, config, input_ids, token_type_ids, input_mask,
         sequence_labels, token_labels, choice_labels):
     model = LongformerForQuestionAnswering(config=config)
     model.to(torch_device)
     model.eval()
     result = model(
         input_ids,
         attention_mask=input_mask,
         global_attention_mask=input_mask,
         token_type_ids=token_type_ids,
         start_positions=sequence_labels,
         end_positions=sequence_labels,
     )
     self.parent.assertListEqual(list(result["start_logits"].size()),
                                 [self.batch_size, self.seq_length])
     self.parent.assertListEqual(list(result["end_logits"].size()),
                                 [self.batch_size, self.seq_length])
     self.check_loss_output(result)
Example No. 6
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    tokenizer = LongformerTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = LongformerForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    #train_dataset  = torch.load(data_args.train_file_path)
    #eval_dataset = torch.load(data_args.valid_file_path)
    train_examples = DeepThinkDataset(data_args.input_train_file)
    train_dataset = DTDataset(tokenizer, train_examples,
                              data_args.max_seq_length)
    eval_examples = DeepThinkDataset(data_args.input_eval_file)
    eval_dataset = DTDataset(tokenizer, eval_examples,
                             data_args.max_seq_length)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=DummyDataCollator(),
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path
            if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(eval_output.keys()):
                logger.info("  %s = %s", key, str(eval_output[key]))
                writer.write("%s = %s\n" % (key, str(eval_output[key])))

        results.update(eval_output)

    return results
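
# DummyDataCollator and DTDataset are used above but not defined in this excerpt.
# Below is a minimal sketch of what such a collator might look like, assuming each
# dataset item is a dict of tensors with the fields a Longformer QA model expects;
# it is an illustration, not the original class.
import torch


class DummyDataCollator:
    def __call__(self, batch):
        # Stack per-example tensors into batched tensors keyed by model argument names.
        return {
            "input_ids": torch.stack([example["input_ids"] for example in batch]),
            "attention_mask": torch.stack([example["attention_mask"] for example in batch]),
            "start_positions": torch.stack([example["start_positions"] for example in batch]),
            "end_positions": torch.stack([example["end_positions"] for example in batch]),
        }
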
Example No. 7
        exact_match += metric_max_over_ground_truths(exact_match_score,
                                                     prediction, ground_truths)
        f1 += metric_max_over_ground_truths(f1_score, prediction,
                                            ground_truths)

    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total

    return {'exact_match': exact_match, 'f1': f1}


import torch
from transformers import LongformerTokenizerFast, LongformerForQuestionAnswering
from tqdm.auto import tqdm

tokenizer = LongformerTokenizerFast.from_pretrained('models')
model = LongformerForQuestionAnswering.from_pretrained('models')
model = model.cuda()
model.eval()

valid_dataset = torch.load('./data/valid_data.pt')
dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=16)

answers = []
with torch.no_grad():
    for batch in tqdm(dataloader):
        start_scores, end_scores = model(
            input_ids=batch['input_ids'].cuda(),
            attention_mask=batch['attention_mask'].cuda())
        for i in range(start_scores.shape[0]):
            all_tokens = tokenizer.convert_ids_to_tokens(batch['input_ids'][i])
            # take the highest-scoring start/end positions as the answer span
            answer = ' '.join(
                all_tokens[torch.argmax(start_scores[i]):
                           torch.argmax(end_scores[i]) + 1])
            answers.append(answer)
Example No. 8

from transformers import LongformerTokenizer, LongformerForQuestionAnswering
import torch

tokenizer = LongformerTokenizer.from_pretrained(
    "allenai/longformer-large-4096-finetuned-triviaqa")
model = LongformerForQuestionAnswering.from_pretrained(
    "allenai/longformer-large-4096-finetuned-triviaqa")

question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
encoding = tokenizer(question, text, return_tensors="pt")
input_ids = encoding["input_ids"]

# default is local attention everywhere
# the forward method will automatically set global attention on question tokens
attention_mask = encoding["attention_mask"]

outputs = model(input_ids, attention_mask=attention_mask)
print(outputs)
start_logits = outputs[0]
end_logits = outputs[1]
all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
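
# The excerpt stops after collecting the tokens. A natural continuation, mirroring the
# standard Longformer QA example in the transformers documentation, decodes the
# predicted span from the logits:
answer_tokens = all_tokens[torch.argmax(start_logits): torch.argmax(end_logits) + 1]
answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens))
print(answer)
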
Example No. 9
train_dataset = EncodingDataset(train_encodings)
valid_dataset = EncodingDataset(valid_encodings)

train_loader = DataLoader(train_dataset,
                          batch_size=args.batch_size,
                          shuffle=True)
valid_loader = DataLoader(valid_dataset,
                          batch_size=args.batch_size,
                          shuffle=True)

#%% Model & Optimizer & Scheduler
device = torch.device('cuda') if torch.cuda.is_available() else torch.device(
    'cpu')
args.wgts_dir = '/media/mynewdrive/rob/data/pre_wgts/longformer_base'
model = LongformerForQuestionAnswering.from_pretrained(args.wgts_dir)
# model = LongformerForQuestionAnswering.from_pretrained("allenai/longformer-base-4096")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

# Slanted-triangular learning rate schedule: linear warm-up followed by linear decay
total_steps = len(train_loader) * args.num_epochs // args.accum_step
warm_steps = int(total_steps * args.warm_frac)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warm_steps,
                                            num_training_steps=total_steps)

#%% Train the model
if not os.path.exists(args.exp_dir):
    os.makedirs(args.exp_dir)
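
# The excerpt ends before the loop itself. A minimal sketch of the training loop this
# setup typically feeds, using the objects defined above (model, optimizer, scheduler,
# train_loader, device, args.num_epochs, args.accum_step, args.exp_dir); the loop and
# the batch field names are assumptions, not part of the original code.
model.train()
for epoch in range(args.num_epochs):
    optimizer.zero_grad()
    for step, batch in enumerate(train_loader):
        outputs = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            start_positions=batch["start_positions"].to(device),
            end_positions=batch["end_positions"].to(device),
        )
        # Scale the loss so accumulated gradients match a larger effective batch size.
        loss = outputs.loss / args.accum_step
        loss.backward()
        if (step + 1) % args.accum_step == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

torch.save(model.state_dict(), os.path.join(args.exp_dir, "model_weights.pt"))
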