예제 #1
0
def loadAbstractSummarizer():
    """Load the configured abstractive summarizer model and tokenizer.

    Reads the model name from the app config (``get_item``). Loads from the
    local ``./models/<name>/`` cache when present; otherwise downloads from
    the Hugging Face hub and saves the result into the cache directory.

    Returns:
        tuple: ``(model, tokenizer)`` — a BART or LED seq2seq model and its
        matching tokenizer.
    """
    from transformers import BartTokenizer, BartForConditionalGeneration
    from transformers import LEDForConditionalGeneration, LEDTokenizer

    model_name = get_item("abstract_summarizer_model_name")

    # Typo fix (local name only): DIRECOTRY -> directory.
    model_directory = f'./models/{model_name}/'

    # Pick the classes and extra kwargs once instead of duplicating the
    # cached/uncached branches per architecture.
    if "bart" in model_name:
        model_cls, tokenizer_cls = BartForConditionalGeneration, BartTokenizer
        model_kwargs = {}
    else:
        model_cls, tokenizer_cls = LEDForConditionalGeneration, LEDTokenizer
        model_kwargs = {"return_dict_in_generate": True}

    if os.path.exists(model_directory):
        model = model_cls.from_pretrained(model_directory, **model_kwargs)
        tokenizer = tokenizer_cls.from_pretrained(model_directory)
    else:
        model = model_cls.from_pretrained(model_name, **model_kwargs)
        tokenizer = tokenizer_cls.from_pretrained(model_name)
        # Cache locally so future runs skip the download.
        model.save_pretrained(model_directory)
        tokenizer.save_pretrained(model_directory)
    return model, tokenizer
def get_model_tokenizer(model_name):
    """Load a summarization model and tokenizer by model-name heuristics.

    Args:
        model_name: Hugging Face model identifier; routed by substring
            ("pegasus", "bart-large", "bart-custom-large", else Auto*).

    Returns:
        tuple: ``(model, tokenizer)`` with the model moved to CUDA when
        available, otherwise CPU.
    """
    import torch
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if "pegasus" in model_name:
        # its a pegasus model
        from transformers import PegasusForConditionalGeneration, PegasusTokenizer
        tokenizer = PegasusTokenizer.from_pretrained(model_name)
        model = PegasusForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer

    elif "bart-large" in model_name or "bart-custom-large" in model_name:
        # Dedup fix: the original "bart-large" and "bart-custom-large"
        # branches were byte-identical, so they are merged here.
        from transformers import BartTokenizer, BartForConditionalGeneration
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartForConditionalGeneration.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer

    else:
        # T5 or distilbart
        # NOTE(review): AutoModelWithLMHead is deprecated in transformers;
        # AutoModelForSeq2SeqLM is the recommended replacement — confirm
        # before switching, as it changes the head-selection logic.
        from transformers import AutoTokenizer, AutoModelWithLMHead
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelWithLMHead.from_pretrained(model_name).to(
            torch_device)
        return model, tokenizer
예제 #3
0
파일: bart_sum.py 프로젝트: monkidea/DocSum
    def __init__(self,
                 device=None,
                 checkpoint=None,
                 state_dict_key='model',
                 pretrained="facebook/bart-large-cnn",
                 hg_transformers=True):
        """Load a BART summarization model from huggingface transformers,
        fairseq/torchhub, or a training checkpoint.

        Args:
            device: torch device to run on; auto-detected when None.
            checkpoint: optional path to a checkpoint file. A name containing
                "semsim" triggers a one-time download of the semsim weights.
            state_dict_key: key of the model state dict inside the checkpoint.
            pretrained: pretrained model identifier.
            hg_transformers: True to use huggingface transformers, False to
                load from fairseq via torch.hub.

        Raises:
            Exception: if ``checkpoint`` is given with
                ``hg_transformers=False``.
        """
        if not hg_transformers and checkpoint:
            raise Exception(
                "hg_transformers must be set to True in order to load from checkpoint"
            )

        if not device:
            device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")

        # huggingface uses dashes and fairseq/torchhub uses dots (periods)
        if pretrained:
            if hg_transformers:
                pretrained = pretrained.replace(".", "-")
            else:
                # only use the part after the "/"
                pretrained = pretrained.split("/")[-1].replace("-", ".")

        # Idiom fix: identity comparison with None ("is not", not "!=").
        if checkpoint is not None and "semsim" in checkpoint:
            cache_dir = appdirs.user_cache_dir("DocSum", "HHousen")
            output_file_path = os.path.join(cache_dir, "bart_semsim.pt")
            if not os.path.isfile(output_file_path):
                # Robustness fix: exist_ok avoids the check-then-create race.
                os.makedirs(cache_dir, exist_ok=True)
                gdown.download(
                    "https://drive.google.com/uc?id=1CNgK6ZkaqUD239h_6GkLmfUOGgryc2v9",
                    output_file_path)
            checkpoint = output_file_path

        if checkpoint:
            # Robustness fix: map tensors onto the selected device so a
            # GPU-saved checkpoint still loads on a CPU-only machine.
            loaded_checkpoint = torch.load(checkpoint, map_location=device)
            model_state_dict = loaded_checkpoint[state_dict_key]

            bart = BartForConditionalGeneration.from_pretrained(
                pretrained, state_dict=model_state_dict)
            # NOTE(review): tokenizers have no state_dict — this kwarg is
            # presumably ignored; kept for compatibility, confirm and drop.
            tokenizer = BartTokenizer.from_pretrained(
                pretrained, state_dict=model_state_dict)
            self.tokenizer = tokenizer
        else:
            if hg_transformers:
                bart = BartForConditionalGeneration.from_pretrained(pretrained)
                tokenizer = BartTokenizer.from_pretrained(pretrained)
                self.tokenizer = tokenizer
            else:
                # fairseq path: the hub model carries its own tokenization.
                bart = torch.hub.load('pytorch/fairseq', pretrained)
                bart.to(device)
                bart.eval()
                bart.half()

        self.logger = logging.getLogger(__name__)
        self.hg_transformers = hg_transformers
        self.bart = bart
예제 #4
0
def launch_bart():
    """Fine-tune facebook/bart-large-cnn, then evaluate on the test set.

    Builds the tokenizer/config/model, groups parameters for weight decay
    (biases and LayerNorm gamma/beta excluded, the usual convention for
    transformer fine-tuning), trains via ``launch_training``, then reloads
    the saved model and runs ``launch_test_without_label``. Paths and the
    learning rate come from the module-level ``args``.
    """
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    config = BartConfig.from_pretrained('facebook/bart-large-cnn')
    model = BartForConditionalGeneration.from_pretrained(
        'facebook/bart-large-cnn', num_labels=len(tags_vals))
    # NOTE(review): plain string concatenation — assumes args.save ends with
    # a path separator; confirm before switching to os.path.join.
    model_path = args.save + 'bart_trained.pt'

    FULL_FINETUNING = True
    if FULL_FINETUNING:
        # Weight decay applies to everything except biases and LayerNorm
        # parameters (named 'gamma'/'beta' in some checkpoints).
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'gamma', 'beta']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay_rate': 0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay_rate': 0.0
        }]
    else:
        # Classifier-only fine-tuning: a single group, no decay distinction.
        param_optimizer = list(model.classifier.named_parameters())
        optimizer_grouped_parameters = [{
            "params": [p for n, p in param_optimizer]
        }]

    optimizer = Adam(optimizer_grouped_parameters, lr=args.lr)
    launch_training(training_path=args.training_data,
                    training_epochs=4,
                    valid_path=args.validate_data,
                    training_batch_size=1,
                    model=model,
                    model_path=model_path,
                    tokenizer=tokenizer,
                    optimizer=optimizer)
    print(model_path)
    # Reload the weights that launch_training saved under args.save.
    model = BartForConditionalGeneration.from_pretrained(args.save)
    launch_test_without_label(test_path=args.test_data,
                              model=model,
                              tokenizer=tokenizer)
예제 #5
0
파일: models.py 프로젝트: Benjamintdk/YORO
def load_hf_model(config, pretrained=False, path=None):
    """Build a BART conditional-generation model.

    Args:
        config: transformers BartConfig used for the model.
        pretrained: when True, start from the bart-large-cnn weights.
        path: optional path to a state dict that overrides the pretrained
            weights (loaded onto CUDA).

    Returns:
        A ``BartForConditionalGeneration`` instance.
    """
    if pretrained:
        if path:
            model = BartForConditionalGeneration.from_pretrained(
                "facebook/bart-large-cnn",
                state_dict=torch.load(path, map_location=torch.device('cuda')),
                config=config)
        else:
            model = BartForConditionalGeneration.from_pretrained(
                "facebook/bart-large-cnn", config=config)
    else:
        # Bug fix: the constructor requires a config — calling it with no
        # arguments raised TypeError.
        model = BartForConditionalGeneration(config)

    return model
예제 #6
0
    def test_diverse_beam_search(self):
        """Diverse beam search (num_beam_groups > 1 with a diversity penalty)
        should produce the two expected, distinct summaries for the article."""
        article = """Justin Timberlake and Jessica Biel, welcome to parenthood.
        The celebrity couple announced the arrival of their son, Silas Randall Timberlake, in statements to People.
        "Silas was the middle name of Timberlake's maternal grandfather Bill Bomar, who died in 2012, while Randall is the musician's own middle name, as well as his father's first," People reports.
        The couple announced the pregnancy in January, with an Instagram post. It is the first baby for both."""

        bart_tokenizer = BartTokenizer.from_pretrained(
            "facebook/bart-large-cnn")
        bart_model = BartForConditionalGeneration.from_pretrained(
            "facebook/bart-large-cnn").to(torch_device)
        input_ids = bart_tokenizer(
            article, return_tensors="pt").input_ids.to(torch_device)

        # 4 beams split into 4 groups; the diversity penalty pushes the
        # groups toward different hypotheses, and 2 of them are returned.
        outputs = bart_model.generate(input_ids,
                                      num_beams=4,
                                      num_return_sequences=2,
                                      num_beam_groups=4,
                                      diversity_penalty=2.0)

        generated_text = bart_tokenizer.batch_decode(outputs,
                                                     skip_special_tokens=True)

        # Expected strings are pinned to this pretrained checkpoint.
        self.assertListEqual(
            generated_text,
            [
                "The couple announced the birth of their son, Silas Randall Timberlake, in a statement. Silas was the middle name of Timberlake's maternal grandfather Bill Bomar. Randall is the musician's own middle name, as well as his father's first. It is the first baby for both of them.",
                "Justin Timberlake and Jessica Biel have a son. The baby is named Silas Randall Timberlake. It is the first child for both. The couple announced the pregnancy in January. The name Silas is the middle name of Timberlake's maternal grandfather. It's also his own middle name.",
            ],
        )
예제 #7
0
    def test_xsum_summarization_same_as_fairseq(self):
        """The XSum checkpoint should reproduce fairseq's one-sentence
        summary of the PG&E article."""
        model = BartForConditionalGeneration.from_pretrained(
            "facebook/bart-large-xsum").to(torch_device)
        # XSum is a plain BART checkpoint, not an mBART one.
        self.assertFalse(model.config.is_valid_mbart())
        tok = self.default_tokenizer

        EXPECTED_SUMMARY = "California's largest power company has begun shutting off electricity to thousands of customers in the state."
        dct = tok.batch_encode_plus(
            [PGE_ARTICLE],
            max_length=1024,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        ).to(torch_device)

        # decoder_start_token_id=eos matches fairseq's decoding convention.
        hypotheses_batch = model.generate(
            input_ids=dct["input_ids"],
            attention_mask=dct["attention_mask"],
            num_beams=2,
            max_length=62,
            min_length=11,
            length_penalty=1.0,
            no_repeat_ngram_size=3,
            early_stopping=True,
            decoder_start_token_id=model.config.eos_token_id,
        )

        decoded = tok.batch_decode(
            hypotheses_batch,
            skip_special_tokens=True,
        )
        self.assertEqual(EXPECTED_SUMMARY, decoded[0])
예제 #8
0
def generate_summaries(lns, out_file, batch_size=8, device=DEFAULT_DEVICE):
    """Generate BART summaries for ``lns``, one per output line.

    Args:
        lns: list of source documents (strings).
        out_file: path of the file to write summaries to.
        batch_size: number of documents per generation batch.
        device: torch device for the model and input tensors.
    """
    model = BartForConditionalGeneration.from_pretrained(
        "bart-large-cnn",
        output_past=True,
    ).to(device)
    tokenizer = BartTokenizer.from_pretrained("bart-large")
    # Resource fix: manage the output file with a context manager so the
    # handle is closed even when generation raises.
    with Path(out_file).open("w") as fout:
        for batch in tqdm(list(chunks(lns, batch_size))):
            dct = tokenizer.batch_encode_plus(batch,
                                              max_length=1024,
                                              return_tensors="pt",
                                              pad_to_max_length=True)
            summaries = model.generate(
                input_ids=dct["input_ids"].to(device),
                attention_mask=dct["attention_mask"].to(device),
                num_beams=4,
                length_penalty=2.0,
                max_length=
                142,  # +2 from original because we start at step=1 and stop before max_length
                min_length=56,  # +1 from original because we start at step=1
                no_repeat_ngram_size=3,
                early_stopping=True,
                do_sample=False,
            )
            dec = [
                tokenizer.decode(g,
                                 skip_special_tokens=True,
                                 clean_up_tokenization_spaces=False)
                for g in summaries
            ]
            for hypothesis in dec:
                fout.write(hypothesis + "\n")
                fout.flush()
 def model(self):
     """Lazily construct the mBART en-ro model, caching it on ``self``."""
     if self._model is None:
         self._model = BartForConditionalGeneration.from_pretrained(
             "mbart-large-en-ro").to(torch_device)
     return self._model
예제 #10
0
    def __init__(self,
                 hparams,
                 user_tokens=['<newline>', '<bullet>', '<sep>']):
        """Initialise the BART fine-tuning system from ``hparams``.

        Builds tokenizer, config (with vocab_size matched to the tokenizer),
        and model, then prepares loss/metric bookkeeping and a reverse
        vocabulary for decoding.
        """
        super(BartSystem, self).__init__()
        self.hparams = hparams
        self.hparams.model_type = self.hparams.model_type.lower()

        # Resolve the optional fallbacks once, up front.
        tokenizer_source = (self.hparams.tokenizer_name
                            or self.hparams.model_name_or_path)
        config_source = (self.hparams.config_name
                         or self.hparams.model_name_or_path)
        cache_dir = self.hparams.cache_dir or None

        tokenizer = BartTokenizer.from_pretrained(
            tokenizer_source,
            do_lower_case=self.hparams.do_lower_case,
            cache_dir=cache_dir,
        )

        config = AutoConfig.from_pretrained(
            config_source,
            cache_dir=cache_dir,
            output_past=self.hparams.do_test,
            vocab_size=len(tokenizer))

        model = BartForConditionalGeneration.from_pretrained(
            self.hparams.model_name_or_path,
            from_tf=".ckpt" in self.hparams.model_name_or_path,
            config=config,
            cache_dir=cache_dir,
        )

        self.config, self.tokenizer, self.model = config, tokenizer, model
        self.loss = []  # running record of batch losses for averaging
        self.metrics = {}

        # Reverse vocabulary (id -> token) for decoding/debugging.
        self.vocab = {v: k for k, v in self.tokenizer.get_vocab().items()}
예제 #11
0
def load_hf_model(config, pretrained=True, path=None):
    """Build a BART conditional-generation model on the configured device.

    Args:
        config: transformers BartConfig used for the model.
        pretrained: when True, start from the bart-large-cnn weights.
        path: optional path to a state dict overriding the pretrained
            weights (mapped onto ``settings.DEVICE``).

    Returns:
        A ``BartForConditionalGeneration`` moved to ``settings.DEVICE``.
    """
    if pretrained:
        if path:
            model = BartForConditionalGeneration.from_pretrained(
                "bart-large-cnn",
                state_dict=torch.load(path,
                                      map_location=torch.device(
                                          settings.DEVICE)),
                config=config)
        else:
            model = BartForConditionalGeneration.from_pretrained(
                "bart-large-cnn", config=config)
    else:
        # Bug fix: the constructor requires a config — calling it with no
        # arguments raised TypeError.
        model = BartForConditionalGeneration(config)

    return model.to(settings.DEVICE)
예제 #12
0
def bart_summarize(input_file):
    """Summarize each page of a PDF with BART, appending results to
    ``summarized_bart.txt`` (and echoing them to stdout).

    Args:
        input_file: path of the PDF to summarize, read page by page via
            ``pdf_to_text``.
    """
    model = BartForConditionalGeneration.from_pretrained(
        'facebook/bart-large-cnn')
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

    num_count = get_num_pages(input_file)

    # Resource fix: context manager guarantees the file is closed even if a
    # page fails to summarize.
    with open('summarized_bart.txt', 'a+') as f:
        # Idiom fix: for-range instead of a manual while-counter.
        for page in range(num_count):
            text = pdf_to_text(input_file, page)
            inputs = tokenizer([text],
                               max_length=1024,
                               return_tensors='pt')
            # Generate Summary
            # NOTE(review): max_length=5 yields extremely short summaries —
            # confirm this is intentional.
            summary_ids = model.generate(inputs['input_ids'],
                                         num_beams=4,
                                         max_length=5,
                                         early_stopping=True)
            summarized_text = [
                tokenizer.decode(g,
                                 skip_special_tokens=True,
                                 clean_up_tokenization_spaces=False)
                for g in summary_ids
            ]
            print(summarized_text)
            str1 = ''.join(summarized_text)
            print(str1)
            f.write(str1)
예제 #13
0
    def __init__(self, sum_method):
        """Configure the requested summarization backend.

        Args:
            sum_method: one of 'simple', 'GPT2', 'xlnet', 'bart', 't5'. Any
                other value raises KeyError from the dispatch-table lookup.
        """
        super().__init__()
        self.summarization_methods = {
            'simple': self.simple_summarizer,
            'GPT2': self.gpt2_summarizer,
            'xlnet': self.xlnet_summarizer,
            'bart': self.bart_summarizer,
            't5': self.t5_summarizer
        }

        # Instantiate only the backend that will actually be used.
        if sum_method == 'GPT2':
            self.model = TransformerSummarizer(
                transformer_type="GPT2", transformer_model_key="gpt2-medium")
        elif sum_method == 'xlnet':
            self.model = TransformerSummarizer(
                transformer_type="XLNet",
                transformer_model_key="xlnet-base-cased")
        elif sum_method == 'bart':
            self.model = BartForConditionalGeneration.from_pretrained(
                'facebook/bart-large-cnn')
        elif sum_method == 't5':
            self.model = T5ForConditionalGeneration.from_pretrained('t5-small')

        self.sum_method_name = sum_method
        self.summarization_method = self.summarization_methods[sum_method]
0
def main():
    print("initializing bart tokenizer...")
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
    print("creating lightseq model...")
    ls_model = lsi.Transformer("lightseq_bart_base.pb", 128)
    print("creating huggingface model...")
    hf_model = BartForConditionalGeneration.from_pretrained(
        "facebook/bart-base")

    # GPU warm up
    sentences = [" ".join(["I"] * 10)] * 8
    inputs = tokenizer(sentences, return_tensors="pt", padding=True)
    inputs_id = inputs["input_ids"]
    _, _ = ls_bart(ls_model, inputs_id)
    _, _ = hf_bart(hf_model, inputs_id)

    bsz_list = [1, 2, 4, 8, 16, 32, 64, 128]
    seq_len_list = [1, 2, 4, 8, 16, 32]
    for bsz in bsz_list:
        total_ls = 0.0
        total_hf = 0.0
        for seq_len in seq_len_list:
            sentences = [" ".join(["I"] * seq_len)] * bsz
            inputs = tokenizer(sentences, return_tensors="pt", padding=True)
            inputs_id = inputs["input_ids"]
            _, ls_time = ls_bart(ls_model, inputs_id)
            _, hf_time = hf_bart(hf_model, inputs_id)
            total_ls += ls_time
            total_hf += hf_time
        print(f"{bsz}: {total_hf/total_ls-1}")
예제 #15
0
    def __init__(self, model: str = None):
        """Load the requested summarization model, defaulting to "t5".

        Supported values: "t5" (rebuilt from local config + state dict),
        "google/pegasus-newsroom", "facebook/bart-large-cnn" (downloaded
        from the hub). Any other value raises Exception.
        """
        log.info(model)
        torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        log.info(torch_device)
        if model is None:
            model = "t5"
        self.modelName = model
        # path to all the files that will be used for inference
        self.path = f"./app/api/{model}/"
        self.model_path = self.path + "pytorch_model.bin"
        self.config_path = self.path + "config.json"

        # Selecting the correct model based on the passed madel input. Default t5
        if model == "t5":
            self.config = T5Config.from_json_file(self.config_path)
            self.model = T5ForConditionalGeneration(self.config)
            self.tokenizer = T5Tokenizer.from_pretrained(self.path)
            self.model.eval()
            self.model.load_state_dict(torch.load(self.model_path, map_location=torch_device))
        elif model == "google/pegasus-newsroom":
            # NOTE(review): config is read from a local file but the model
            # itself is pulled from the hub — confirm the local config.json
            # must exist for this branch.
            self.config = PegasusConfig.from_json_file(self.config_path)
            # self.model = PegasusForConditionalGeneration(self.config)
            # self.tokenizer = PegasusTokenizer.from_pretrained(self.path)
            self.model = PegasusForConditionalGeneration.from_pretrained(model).to(torch_device)
            self.tokenizer = PegasusTokenizer.from_pretrained(model)
        elif model == "facebook/bart-large-cnn":
            # Same hybrid pattern as the pegasus branch: local config,
            # hub-downloaded weights and tokenizer.
            self.config = BartConfig.from_json_file(self.config_path)
            # self.model = PegasusForConditionalGeneration(self.config)
            # self.tokenizer = PegasusTokenizer.from_pretrained(self.path)
            self.model = BartForConditionalGeneration.from_pretrained(model).to(torch_device)
            self.tokenizer = BartTokenizer.from_pretrained(model)
        else:
            raise Exception("This model is not supported")

        # Buffer for the text to be summarized, filled in later.
        self.text = str()
예제 #16
0
    def __init__(
        self,
        is_eval=False,
    ):
        """Build the BART trainer/evaluator.

        Args:
            is_eval: when True, switch the model to eval mode after loading.
        """
        super().__init__()

        self.model = BartForConditionalGeneration.from_pretrained('bart-large')

        if is_eval:
            self.model = self.model.eval()

        # Loss ignores padding positions.
        self.criterion = nn.CrossEntropyLoss(ignore_index=config.PAD_idx)

        if config.use_sgd:
            self.optimizer = torch.optim.SGD(self.parameters(), lr=config.lr)
        else:
            self.optimizer = torch.optim.Adam(self.parameters(), lr=config.lr)

        if config.USE_CUDA:
            self.model = self.model.cuda()

        self.model_dir = config.save_path
        # Robustness fix: exist_ok avoids the exists()/makedirs() race.
        os.makedirs(self.model_dir, exist_ok=True)
        self.best_path = ""
예제 #17
0
def main():
    """Summarize each document in a JSONL file with distilbart-xsum and
    write {id, generated_summary, generated_summary_score} records to the
    output JSONL file."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_file")
    parser.add_argument("--output_file")
    parser.add_argument(
        "--decoder",
        choices=['greedy', 'beam_search', 'random', 'top_k', 'nucleus'])
    args = parser.parse_args()

    model_name = 'sshleifer/distilbart-xsum-1-1'
    model = BartForConditionalGeneration.from_pretrained(model_name).eval()
    tokenizer = BartTokenizer.from_pretrained(model_name)

    # Iterate through input file documents, generating summaries
    results = []
    for record in tqdm.tqdm(jsonlines.open(args.input_file)):
        summary, summary_score = generate_summary(model=model,
                                                  tokenizer=tokenizer,
                                                  document=record['document'],
                                                  decoder=args.decoder)
        results.append({
            'id': record['id'],
            'generated_summary': summary,
            'generated_summary_score': summary_score
        })

    # Write out the generated summaries to file
    with open(args.output_file, 'w', encoding='utf-8') as f:
        for entry in results:
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')
예제 #18
0
    def __init__(
        self,
        chkpt_path="/Users/byronwallace/code/RoboSum/weights/pl_title_/pl_title_2048.ckpt"
    ):
        """Assemble a BART summarizer with extended (2048) position
        embeddings and extra special tokens, then load fine-tuned weights
        from a PyTorch-Lightning checkpoint."""
        self.model = BartForConditionalGeneration.from_pretrained(
            'facebook/bart-large-cnn')
        self.config = BartConfig.from_pretrained('facebook/bart-large-cnn')
        self.tokenizer = BartTokenizer.from_pretrained(
            'facebook/bart-large-cnn')

        # increase position embeddings from 1024 to 2048
        self.add_position_embeddings()

        # now add special tokens (for title and abstract demarcation)
        # as a general note: we'll assume "abstract" is either the
        # actual abstract of extracted text from the same (i.e., punchlines)
        self.add_special_tokens()

        # now load the checkpoint
        print("loading checkpoint", chkpt_path)
        checkpoint = torch.load(chkpt_path, map_location="cpu")
        print("done")

        # Strip the Lightning module prefix (e.g. "model.") from each key
        # before loading into the bare HF model.
        stripped_state = {
            ".".join(key.split('.')[1:]): value
            for key, value in checkpoint['state_dict'].items()
        }
        self.model.load_state_dict(stripped_state)
예제 #19
0
 def __init__(
     self,
     model_name_or_path,
     tokenizer_name,
     model_cache_dir,
     input_max_length,
     target_max_length,
     summary_column_name,
     document_column_name,
     wandb_project,
     wandb_run_name,
     **kwargs,
 ):
     """BART-specific wrapper: forwards the dataset/logging configuration
     to the base class, then loads the BART tokenizer and model (the
     tokenizer falls back to the model path when no name is given)."""
     super().__init__(
         input_max_length,
         target_max_length,
         summary_column_name,
         document_column_name,
         wandb_project,
         wandb_run_name,
     )
     self.tokenizer = BartTokenizer.from_pretrained(
         tokenizer_name or model_name_or_path,
         cache_dir=model_cache_dir,
     )
     self.model = BartForConditionalGeneration.from_pretrained(
         model_name_or_path,
         cache_dir=model_cache_dir,
     )
예제 #20
0
def train(cli_args: argparse.Namespace) -> None:
    """Fine-tune BART-base on the module-level ``dataset`` and save it.

    Args:
        cli_args: parsed CLI arguments; only ``epochs`` is read here.
    """
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
    # `device` is a module-level global — presumably set from CUDA
    # availability elsewhere in the file; confirm.
    model.to(device)

    training_args = TrainingArguments(
        output_dir="./models/bart-coder",
        num_train_epochs=cli_args.epochs,
        per_device_train_batch_size=128,
        per_device_eval_batch_size=128,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        fp16=True,  # mixed precision; requires a CUDA device
        remove_unused_columns=True,
        dataloader_num_workers=4,
    )

    print("training on:", training_args.device)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
    )

    trainer.train()
    # Saves into training_args.output_dir by default.
    trainer.save_model()
    print(trainer.evaluate())
예제 #21
0
def main():
    """Fill-mask demo comparing LightSeq and HuggingFace BART-base, either
    interactively (--user_input) or on a fixed set of masked sentences."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--user_input', action="store_true")
    args = parser.parse_args()

    print("initializing bart tokenizer...")
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

    print("creating lightseq model...")
    ls_model = lightseq.Transformer("lightseq_bart_base.pb", 128)
    print("creating huggingface model...")
    hf_model = BartForConditionalGeneration.from_pretrained(
        "facebook/bart-base")

    while True:
        if args.user_input:
            sentences = [input("input the masked sentence:\n")]
        else:
            sentences = [
                "I love that girl, but <mask> does not <mask> me.",
                "She is so <mask> that I can not help glance at <mask>.",
                "Nothing's gonna <mask> my love for you.",
                "Drop everything now. Meet me in the pouring <mask>. Kiss me on the sidewalk."
            ]

        print("tokenizing the sentences...")
        encoded = tokenizer(sentences, return_tensors="pt", padding=True)
        token_ids = encoded["input_ids"]

        ls_generate(ls_model, tokenizer, token_ids)
        hf_generate(hf_model, tokenizer, token_ids)

        # Run exactly once unless the user is driving the loop.
        if not args.user_input:
            break
예제 #22
0
    def model(self):
        """Construct the mBART en-ro model on the test device (halved on
        CUDA).

        NOTE(review): the original docstring claimed lazy loading, but no
        caching is done — a fresh model is built on every call.
        """

        model = BartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro").to(torch_device)
        if "cuda" in torch_device:
            model = model.half()
        return model
예제 #23
0
    def setUpClass(cls):
        """Load the shared BART CNN model/tokenizer and the fixture articles
        used by every test in this class."""
        # summarization
        # generate yes beam search
        # Note for BART summarization in transformers repo, beam search performs much better
        #  than no beam search, but even their beam search with num_beams=1 is better, implying that something
        #  is broken in the _generate_no_beam_search function

        # see ``examples/summarization/bart/evaluate_cnn.py`` for a longer example
        cls.model = BartForConditionalGeneration.from_pretrained(
            'bart-large-cnn')
        cls.tokenizer = BartTokenizer.from_pretrained('bart-large-cnn')

        # Shared decoding settings for all tests.
        cls.decoding_hyperparams = {'max_length': 40, 'num_beams': 3}

        # Fixture article 1: real news text.
        cls.test_news_article_1 = 'New Zealand says it has stopped community transmission of Covid-19, ' \
                            'effectively eliminating the virus. With new cases in single figures for several days - one on Sunday ' \
                            '- Prime Minister Jacinda Ardern said the virus was "currently" eliminated. But officials have warned ' \
                            'against complacency, saying it does not mean a total end to new coronavirus cases. ' \
                            'The news comes hours before New Zealand is set to move out of its toughest level of social restrictions. ' \
                            'From Tuesday, some non-essential business, healthcare and education activity will be able to resume. ' \
                            'Most people will still be required to remain at home at all times and avoid all social interactions.'

        # Fixture article 2: scrambled/mutated variant of article 1.
        cls.test_news_article_2 = \
            'But officials have warned against complacency, saying it does not mean a total end to new HIV cases. ' \
            'Most people will still be required to remain at home at all times and avoid all social interactions.' \
            'Germany says it has stopped community transmission of HIV, ' \
            'effectively eliminating the virus. With new cases in single figures for several days - one on Sunday ' \
            '- Prime Minister Angela Merkle said the virus was "currently" eliminated. ' \
            'From Tuesday, some non-essential business, healthcare and education activity will be able to resume. ' \
            'The news comes hours before Germany is set to move out of its toughest level of social restrictions. '
예제 #24
0
    def __init__(self,
                 datamodule,
                 learning_rate=3e-5,
                 batch_size=8,
                 optimizer='adam',
                 dataset='',
                 pre_trained=''):
        """Question-generation fine-tuner wrapping T5-base or BART-base.

        Args:
            datamodule: supplies the tokenizer and the effective batch size.
            learning_rate: optimizer learning rate.
            batch_size: nominal batch size. NOTE(review): overridden below
                by ``datamodule.batch_size`` — confirm which is intended.
            optimizer: optimizer name, stored for later configuration.
            dataset: dataset label, recorded only via save_hyperparameters.
            pre_trained: 't5' or 'bart'; anything else raises Exception.
        """
        super(KGQGTuner, self).__init__()

        if pre_trained == 't5':
            self.model = T5ForConditionalGeneration.from_pretrained('t5-base')
        elif pre_trained == 'bart':
            self.model = BartForConditionalGeneration.from_pretrained(
                'facebook/bart-base')
        else:
            raise Exception(
                f'Unknown pre-trained model {pre_trained}, choose t5 or bart.')

        # resize embedding to account for additional special tokens
        self.tokenizer = datamodule.tokenizer
        self.model.resize_token_embeddings(len(self.tokenizer))

        self.learning_rate = learning_rate

        # add batch size to init to enable automatic batch size scaling.
        self.batch_size = datamodule.batch_size
        self.optimizer = optimizer

        # metric used during testing
        self.bleu_metric = bleu_score

        self.save_hyperparameters('learning_rate', 'batch_size', 'optimizer',
                                  'dataset', 'pre_trained')
예제 #25
0
def generate_summaries(
    examples: list, out_file: str, model_name: str, batch_size: int = 8, device: str = DEFAULT_DEVICE
):
    """Summarize *examples* with BART-large-CNN and write pairs to *out_file*.

    Each output line has the form ``<input> ||| <summary>``.

    :param examples: list of source texts to summarize
    :param model_name: unused — the facebook/bart-large-cnn checkpoint is
        always loaded; kept for interface compatibility (TODO: honor it)
    :param batch_size: number of examples encoded/generated per step
    :param device: torch device string, e.g. 'cuda' or 'cpu'
    """
    model = BartForConditionalGeneration.from_pretrained(
        'facebook/bart-large-cnn', output_past=True).to(device)
    # BUG FIX: load the tokenizer from the same fully-qualified hub id as the
    # model; the bare "bart-large-cnn" id does not resolve on the hub.
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

    # Context manager guarantees the output file is closed on every exit path
    # (the original handle leaked on exceptions).
    with Path(out_file).open("w") as fout:
        for batch in tqdm(list(chunks(examples, batch_size))):
            dct = tokenizer.batch_encode_plus(
                batch, max_length=64, return_tensors="pt", pad_to_max_length=True
            )
            input_ids = dct["input_ids"].to(device)
            attention_mask = dct["attention_mask"].to(device)
            summaries = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                num_beams=4,
                length_penalty=10.0,
                repetition_penalty=5.0,
                max_length=20,  # +2 from original because we start at step=1 and stop before max_length
                no_repeat_ngram_size=3,
                early_stopping=True,
            )
            dec = [
                tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False)
                for g in summaries
            ]
            in_dec = [
                tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
                for ids in input_ids
            ]
            # 'src' instead of 'input' — avoid shadowing the builtin.
            for src, hypothesis in zip(in_dec, dec):
                fout.write(src + ' ||| ' + hypothesis + "\n")
                fout.flush()
예제 #26
0
    def __init__(self, init='bart.large', shared_training='decoder'):
        """Build the dialog-generation model plus an SSL model that shares
        either its encoder or its decoder layers with it.

        :param init: checkpoint name passed to the multi-GPU BART wrapper
        :param shared_training: 'encoder' ties the encoders; any other value
            ties the decoders
        """
        self._model = BARTMultiGPUWrapper(model_name=init)

        # Self-supervised-learning model, placed on the GPU.
        self._ssl_model = BartForConditionalGeneration.from_pretrained(
            pretrained_model_name_or_path=SSL_MODEL_NAME).cuda()

        if shared_training == 'encoder':
            share_bart_encoder_layers(self._model, self._ssl_model)
            print(
                'Dialog generation task and SSL task are using the same BART encoder.'
            )
        else:
            share_bart_decoder_layers(self._model, self._ssl_model)
            print(
                'Dialog generation task and SSL task are using the same BART decoder.'
            )

        # Training machinery — populated later by the training setup.
        self._optimizer = None
        self._lr_scheduler = None
        self._global_step = 0

        # Loaded datasets; presumably keyed by split name — TODO confirm.
        self._dataset = {}

        # Logging / evaluation bookkeeping, filled in before training starts.
        self._log_dir = None
        self._eval_steps = None
        self._log_file = None
        self._best_dev_loss = None
예제 #27
0
    def test_xsum_summarization_same_as_fairseq(self):
        """Regression check: BART-large-XSum generation reproduces the
        reference (fairseq) summary for a fixed article."""
        model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-xsum").to(torch_device)
        self.assertFalse(model.config.is_valid_mbart())
        tok = BartTokenizer.from_pretrained("facebook/bart-large")

        PGE_ARTICLE = """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."""
        EXPECTED_SUMMARY = "California's largest power company has begun shutting off power to tens of thousands of homes and businesses in the state."

        batch = tok.batch_encode_plus(
            [PGE_ARTICLE],
            max_length=1024,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        ).to(torch_device)

        # Beam-search settings matching the reference run.
        generation_kwargs = dict(
            num_beams=2,
            max_length=62,
            min_length=11,
            length_penalty=1.0,
            no_repeat_ngram_size=3,
            early_stopping=True,
            decoder_start_token_id=model.config.eos_token_id,
        )
        hypotheses_batch = model.generate(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            **generation_kwargs,
        )

        decoded = tok.batch_decode(hypotheses_batch, skip_special_tokens=True,)
        self.assertEqual(EXPECTED_SUMMARY, decoded[0])
    def load_model_tokenizer(self, pretrained):
        """Load the transformer model and tokenizer for a pre-trained name.

        Dispatches on ``self.method``; returns ``(None, None)`` when the
        method is unknown or the name is not in that method's allowed list.

        :param pretrained: pre-trained checkpoint name
        :return: (model, tokenizer) pair, possibly (None, None)
        """
        method = self.method

        # Guard-clause dispatch: each supported method returns directly.
        if method == "T5" and pretrained in T5_PRETRAINED_MODELS:
            return (T5ForConditionalGeneration.from_pretrained(pretrained),
                    T5Tokenizer.from_pretrained(pretrained))

        if method == "BART" and pretrained in BART_PRETRAINED_MODELS:
            return (BartForConditionalGeneration.from_pretrained(pretrained),
                    BartTokenizer.from_pretrained(pretrained))

        if method == "GPT-2" and pretrained in GPT2_PRETRAINED_MODELS:
            lm = GPT2LMHeadModel.from_pretrained(pretrained)
            # Causal LMs need an explicit generation-length cap.
            lm.config.max_length = self.max_length
            return lm, GPT2Tokenizer.from_pretrained(pretrained)

        if method == "XLM" and pretrained in XLM_PRETRAINED_MODELS:
            lm = XLMWithLMHeadModel.from_pretrained(pretrained)
            lm.config.max_length = self.max_length
            return lm, XLMTokenizer.from_pretrained(pretrained)

        # Unknown method or disallowed checkpoint name.
        return None, None
예제 #29
0
 def pre_init(self, hparams):
     """Build a distilled "student" BART from a "teacher" checkpoint.

     Loads the teacher named by ``hparams.teacher``, configures a smaller
     student (fewer encoder/decoder layers), copies selected teacher layers
     into it, saves the student under ``<output_dir>/student`` and points
     ``hparams.model_name_or_path`` at that directory.

     :param hparams: argparse-style namespace; mutated in place
         (``d_layer_to_copy``, ``e_layer_to_copy``, ``model_name_or_path``).
     :return: ``(student, student_cfg, teacher)``
     """
     self.output_dir = Path(hparams.output_dir)
     self.output_dir.mkdir(exist_ok=True)
     # Teacher is only read from, so put it in eval mode.
     teacher = BartForConditionalGeneration.from_pretrained(
         hparams.teacher).eval()
     # Config overrides that shrink the student relative to the teacher.
     student_updates = {
         "decoder_layers": hparams.student_decoder_layers,
         "encoder_layers": hparams.student_encoder_layers,
     }
     # -1 is a sentinel meaning "keep the teacher's length penalty".
     if hparams.length_penalty != -1:
         student_updates["length_penalty"] = hparams.length_penalty
     # Choose which teacher layer indices map onto the smaller student.
     d_layers_to_copy = get_layers_to_copy(
         student_updates["decoder_layers"], teacher.config.decoder_layers)
     e_layers_to_copy: List = get_layers_to_copy(
         student_updates["encoder_layers"], teacher.config.encoder_layers)
     # Record the layer mappings on hparams so later stages can use them.
     hparams.d_layer_to_copy = d_layers_to_copy
     hparams.e_layer_to_copy = e_layers_to_copy
     # Student config = teacher's non-default settings plus the overrides.
     kw = teacher.config.to_diff_dict()
     kw.update(student_updates)
     # Copy weights
     student_cfg = BartConfig(**kw)
     student = BartForConditionalGeneration(student_cfg)
     student, _ = init_student(student, teacher)
     save_dir = self.output_dir.joinpath("student")
     self.copy_to_student(d_layers_to_copy, e_layers_to_copy, hparams,
                          student, teacher)
     # Persist the student and retarget subsequent training at the copy.
     student.save_pretrained(save_dir)
     hparams.model_name_or_path = str(save_dir)
     return student, student_cfg, teacher
예제 #30
0
def get_summary(text, model, tokenizer, torch_device):
    """Summarize *text* with a BART-large-CNN model.

    NOTE(review): the ``model`` and ``tokenizer`` parameters are accepted
    but not used — a dedicated summarization model is loaded instead.
    Confirm whether callers expect the passed-in pair to be used.

    :param text: source text to summarize
    :param model: unused (kept for interface compatibility — see note)
    :param tokenizer: unused (kept for interface compatibility — see note)
    :param torch_device: device the summarization model runs on
    :return: decoded summary string
    """
    # BUG FIX: use the fully-qualified hub id (consistent with the rest of
    # the file); the bare "bart-large-cnn" id does not resolve on the hub.
    tokenizer_summarize = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
    model_summarize = BartForConditionalGeneration.from_pretrained(
        "facebook/bart-large-cnn"
    ).to(torch_device)
    # (Removed a second redundant .to(torch_device) call.)

    # Set the model in evaluation mode to deactivate the DropOut modules.
    model_summarize.eval()

    answers_input_ids = tokenizer_summarize.batch_encode_plus(
        [text], return_tensors="pt", max_length=1024
    )["input_ids"].to(torch_device)

    summary_ids = model_summarize.generate(
        answers_input_ids, num_beams=4, max_length=5, early_stopping=True
    )

    return tokenizer_summarize.decode(
        summary_ids.squeeze(),
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )