Example #1
    def __init__(self,
                 vocab: Vocabulary,
                 model_name: str = "bert-base",
                 multi_choice: bool = False):
        super().__init__(vocab)
        self._model = None
        self._loss = CrossEntropyLoss()
        self.is_multi_choice = multi_choice

        if model_name.startswith('bert'):
            if self.is_multi_choice:
                self._model = BertMultiChoiceMLM.from_pretrained(model_name)
            else:
                self._model = BertForMaskedLM.from_pretrained(model_name)
        elif 'roberta' in model_name:
            if self.is_multi_choice:
                self._model = RobertaMultiChoiceMLM.from_pretrained(model_name)
            else:
                self._model = RobertaForMaskedLM.from_pretrained(model_name)

        elif 'albert' in model_name:
            self._model = AlbertForMaskedLM.from_pretrained(model_name)
        elif 'xlnet' in model_name:
            self._model = XLNetLMHeadModel.from_pretrained(model_name)
        else:
            raise ("Riquiered model is not supported.")
Example #2
    def __init__(self, config, dataset):
        super(XLNet, self).__init__(config, dataset)

        self.eval_generate_num = config['eval_generate_num']

        self.tokenizer = XLNetTokenizer.from_pretrained(
            'xlnet-base-cased',
            bos_token=dataset.sos_token,
            eos_token=dataset.eos_token,
            pad_token=dataset.padding_token,
            unk_token=dataset.eos_token)

        self.configuration = XLNetConfig.from_pretrained('xlnet-base-cased')

        self.decoder = XLNetLMHeadModel.from_pretrained(
            'xlnet-base-cased', config=self.configuration)
        self.decoder.resize_token_embeddings(len(self.tokenizer))

        self.sos_token = dataset.sos_token
        self.eos_token = dataset.eos_token
        self.mask_token = '<mask>'
        self.padding_token_idx = self.tokenizer.pad_token_id
        self.max_seq_length = config['max_seq_length']
        self.device = config["device"]

        self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx,
                                        reduction='none')
Example #3
    def __init__(self, args):
        super().__init__()

        self.load_model = args.load_model

        if "xlnet" in args.load_model:
            self.tokenizer = AutoTokenizer.from_pretrained(self.load_model)
            self.model = XLNetLMHeadModel.from_pretrained(self.load_model,
                                                          mem_len=1024).to(
                                                              args.device)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(self.load_model)
            config = AutoConfig.from_pretrained(self.load_model)
            config.output_hidden_states = True
            self.model = AutoModelWithLMHead.from_pretrained(self.load_model,
                                                             config=config).to(
                                                                 args.device)

        hidden_size = 1024 if "large" in self.load_model or self.load_model == "gpt2-medium" else 768

        self.hidden2label = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2), nn.Sigmoid(),
            nn.Linear(hidden_size // 2, 2)).to(args.device)

        # self.hidden2label = nn.Linear(hidden_size, 2).to(args.device)
        self.dropout = torch.nn.Dropout(args.dropout)
        self.layer = args.bert_layer

        self.eval()
        self.device = args.device
        self.args = args
Example #4
    def __init__(self,
                 model_path='xlnet-base-cased',
                 temperature=1.0,
                 top_k=None,
                 top_p=None,
                 padding_text=None,
                 optimize=None,
                 device=None):
        super().__init__(device,
                         temperature=temperature,
                         top_k=top_k,
                         top_p=top_p,
                         optimize=optimize)
        self.model_path = model_path

        self.tokenizer = XLNetTokenizer.from_pretrained(model_path)
        # TODO: Evaluated using mems in XLNet, but the result is quite weird.
        self.optimize['external_memory'] = 0
        self.model = XLNetLMHeadModel.from_pretrained(
            model_path, mem_len=self.optimize['external_memory'])

        self.padding_text_idxes = self.tokenizer.encode(padding_text
                                                        or self.PADDING_TEXT)

        self.model.to(self.device)
        self.model.eval()
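Both this wrapper and the nearly identical one in the next example fall back to a PADDING_TEXT class attribute defined elsewhere in the source class. XLNet conditions poorly on very short prompts, so generation examples typically prepend a long dummy context. A hypothetical stand-in for the constant (the real text lives in the parent class):

PADDING_TEXT = (
    "This is filler context used only to give XLNet enough tokens to "
    "condition on before the real prompt begins. " * 4
) + "<eod>"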
Example #5
    def __init__(self,
                 model_path='xlnet-base-cased',
                 temperature=1.0,
                 top_k=None,
                 top_p=None,
                 padding_text=None,
                 optimize=None,
                 device=None):
        super().__init__(device,
                         temperature=temperature,
                         top_k=top_k,
                         top_p=top_p,
                         optimize=optimize)
        try:
            import transformers
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                'Missing transformers library. Install it with `pip install transformers`.'
            )

        self.model_path = model_path

        # self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        # self.model = AutoModel.from_pretrained(model_path)
        # TODO: Evaluated using mems in XLNet, but the result is quite weird.
        self.optimize['external_memory'] = 0
        self.tokenizer = XLNetTokenizer.from_pretrained(model_path)
        self.model = XLNetLMHeadModel.from_pretrained(
            model_path, mem_len=self.optimize['external_memory'])

        self.padding_text_idxes = self.tokenizer.encode(padding_text
                                                        or self.PADDING_TEXT)

        self.model.to(self.device)
        self.model.eval()
Example #6
 def register_model(self) -> None:
     """
     If the model is not registered, this method creates the model and
     places it in the model register; if it is already registered, it
     just increments the model's reference count. This saves
     computational resources, e.g. when combining model predictions with
     embedding similarity, by not loading the same model into memory
     twice.
     """
     if self.model_name not in XLNetProbEstimator.loaded:
         model = XLNetLMHeadModel.from_pretrained(self.model_name)
         model.to(self.device)
         model.eval()
         tokenizer = XLNetTokenizer.from_pretrained(self.model_name)
         word2id = self._get_word2id(tokenizer)
         spiece_ids = [
             idx for word, idx in word2id.items()
             if word.startswith(self.NON_START_SYMBOL)
         ]
         all_special_ids = tokenizer.all_special_ids
         word_embeddings = model.transformer.word_embedding.weight.data.cpu(
         ).numpy()
         XLNetProbEstimator.loaded[self.model_name] = {
             "model": model,
             "tokenizer": tokenizer,
             "embeddings": word_embeddings,
             "word2id": word2id,
             "spiece_ids": spiece_ids,
             "all_special_ids": all_special_ids,
         }
         XLNetProbEstimator.loaded[self.model_name]["ref_count"] = 1
     else:
         XLNetProbEstimator.loaded[self.model_name]["ref_count"] += 1
Example #7
def main(raw_args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name",
                        type=str,
                        required=True,
                        help="model name e.g. xlnet-tiny-chinese")
    parser.add_argument("--cache_dir",
                        type=str,
                        default=None,
                        required=False,
                        help="Directory containing pytorch model")
    parser.add_argument("--pytorch_model_path",
                        type=str,
                        required=True,
                        help="/path/to/<pytorch-model-name>.bin")
    parser.add_argument("--tf_cache_dir",
                        type=str,
                        required=True,
                        help="Directory in which to save tensorflow model")
    args = parser.parse_args(raw_args)

    # model = XLNetLMHeadModel.from_pretrained(
    #     pretrained_model_name_or_path=args.model_name,
    #     state_dict=torch.load(args.pytorch_model_path),
    #     cache_dir=args.cache_dir
    # )
    model = XLNetLMHeadModel.from_pretrained(
        pretrained_model_name_or_path=args.cache_dir)

    convert_pytorch_checkpoint_to_tf(model=model,
                                     ckpt_dir=args.tf_cache_dir,
                                     model_name=args.model_name)
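Because main() accepts raw_args and forwards them to parse_args, the converter can be driven from Python as well as from the shell. A hypothetical invocation (all paths are placeholders):

main(raw_args=[
    "--model_name", "xlnet-base-cased",
    "--cache_dir", "./xlnet_pytorch",
    "--pytorch_model_path", "./xlnet_pytorch/pytorch_model.bin",
    "--tf_cache_dir", "./xlnet_tf",
])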
Example #8
    def __init__(self, config, dataset):
        super(XLNet, self).__init__(config, dataset)

        self.pretrained_model_path = config['pretrained_model_path']
        self.tokenizer = XLNetTokenizer.from_pretrained(
            self.pretrained_model_path,
            bos_token=dataset.sos_token,
            eos_token=dataset.eos_token,
            pad_token=dataset.padding_token)

        self.sos_token = self.tokenizer.bos_token
        self.eos_token = self.tokenizer.eos_token
        self.sos_token_idx = self.tokenizer.bos_token_id
        self.eos_token_idx = self.tokenizer.eos_token_id
        self.padding_token_idx = self.tokenizer.pad_token_id

        self.configuration = XLNetConfig.from_pretrained(
            self.pretrained_model_path,
            bos_token_id=self.sos_token_idx,
            eos_token_id=self.eos_token_idx,
            pad_token_id=self.padding_token_idx)

        self.decoder = XLNetLMHeadModel.from_pretrained(
            self.pretrained_model_path, config=self.configuration)
        self.decoder.resize_token_embeddings(len(self.tokenizer))

        self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx,
                                        reduction='none')
Example #9
def config():
    parser = ArgumentParser()
    # basic
    parser.add_argument('--file_dir',
                        type=str,
                        default=None,
                        help="data directory")
    parser.add_argument('--ids_file',
                        type=str,
                        default=None,
                        help="list of ids to eval")
    parser.add_argument('--id',
                        type=str,
                        default=None,
                        help="single setting to evaluate")
    parser.add_argument('--parsed_file', type=str, default=None, help='')
    parser.add_argument('--accept_name',
                        type=str,
                        default='xlnet',
                        help='bert or xlnet')

    args = parser.parse_args()

    model_name = 'xlnet-large-cased'
    args.tokenizer = XLNetTokenizer.from_pretrained(model_name)
    args.acpt_model = XLNetLMHeadModel.from_pretrained(model_name)

    args.device = torch.device('cuda:0')
    args.acpt_model.to(args.device)
    args.acpt_model.eval()

    return args
Example #10
def main(train_epoch, batch_size, seq_length, lr, corpus_path, vocab_path,
         config_path, pretrain_model_path, output_record_path,
         model_save_path):
    seed_everything(997)
    num_train_epochs = train_epoch
    pretrain_batch_size = batch_size

    tokenizer = BertTokenizer.from_pretrained(vocab_path)
    #     train_dataset = LineByLineTextDataset(block_size=128, file_path=corpus_path, tokenizer=tokenizer)

    #     data = read_data(corpus_path, tokenizer)
    train_dataset = OppoDataset(train_file_path=corpus_path,
                                tokenizer=tokenizer,
                                maxlen=128)

    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer)

    config = XLNetConfig.from_pretrained(
        pretrained_model_name_or_path=config_path)
    #     model = XLNetForMaskedLM(config=config,name='./xlnet_model/pytorch_model.bin')
    if os.path.exists(pretrain_model_path):
        model = XLNetLMHeadModel.from_pretrained(pretrain_model_path,
                                                 config=config)
    else:
        model = XLNetLMHeadModel(config=config)


#     data_collator = Collator(max_seq_len=seq_length, tokenizer=tokenizer, mlm_probability=0.15)

    training_args = TrainingArguments(
        output_dir=output_record_path,
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        learning_rate=lr,
        dataloader_num_workers=8,
        prediction_loss_only=True,
        fp16=True,
        fp16_backend='amp',
        per_device_train_batch_size=pretrain_batch_size,
        save_strategy='no',
        seed=997)

    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset)

    trainer.train()
    trainer.save_model(model_save_path)
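A hypothetical call to the main() above (every path is a placeholder; the corpus, vocab, and config locations must already exist):

main(train_epoch=3,
     batch_size=32,
     seq_length=128,
     lr=5e-5,
     corpus_path="./data/corpus.txt",
     vocab_path="./vocab",
     config_path="./xlnet_config",
     pretrain_model_path="./xlnet_model",
     output_record_path="./records",
     model_save_path="./saved_model")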
Example #11
    def __init__(self):
        cmd = 'echo `mecab-config --dicdir`"/mecab-ipadic-neologd"'
        path = (subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                 shell=True).communicate()[0]).decode('utf-8')
        self.m = MeCab.Tagger(f"-Owakati -d {path}")
        logger.info("mecab loaded")

        self.model_dir = "hajime9652/xlnet-japanese"
        # self.model_dir = "./backend/PyTorch"

        self.gen_model = XLNetLMHeadModel.from_pretrained(self.model_dir)
        self.gen_tokenizer = XLNetTokenizer.from_pretrained(self.model_dir)
Example #12
    def test_embedding_lm(self):
        # try original model
        lmmodel = XLNetLMHeadModel.from_pretrained('xlnet-base-cased')
        lm_outputs = lmmodel(self.input)
        last_hidden_states_lm = lm_outputs[0]  # the prediction scores (logits) are the first element of the output tuple

        # try our version
        embed_outs_lm = self.embed_model.lm(self.input)
        last_embedding_lm = embed_outs_lm[0]
        assert torch.all(
            torch.eq(last_embedding_lm,
                     last_hidden_states_lm)), "LM embeddings were not the same"
Example #13
    def __init__(self, model_path='xlnet-base-cased', temperature=1.0, top_k=None, top_p=None, padding_text=None,
                 device=None, return_past=False):
        super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p)
        self.model_path = model_path

        self.tokenizer = XLNetTokenizer.from_pretrained(model_path)
        self.model = XLNetLMHeadModel.from_pretrained(model_path)

        self.padding_text_idxes = self.tokenizer.encode(padding_text or self.PADDING_TEXT)

        self.model.to(self.device)
        self.model.eval()

        self.return_past = return_past
Example #14
def run_mlm_mask_accuracy(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # These do the same thing, except XLNet is a less popular model so it's not supported by
    # all AutoModel variants.
    if 'xlnet' in model_name:
        model = XLNetLMHeadModel.from_pretrained(model_name)
    else:
        model = AutoModelForMaskedLM.from_pretrained(model_name)

    # Make binary choice for a single sentence pair
    def mlm_sentence_pair(sent1, sent2):
        masked_toks, masked_ix, dtok1, dtok2 = get_masked_sequence(
            tokenizer, sent1, sent2)
        logit1 = model(torch.tensor([masked_toks])).logits[0, masked_ix, dtok1]
        logit2 = model(torch.tensor([masked_toks])).logits[0, masked_ix, dtok2]
        return bool(logit1 > logit2)

    sent_pairs = get_common_sentences()
    for task_name, sents in sent_pairs.items():
        res = [mlm_sentence_pair(s1, s2) for (s1, s2) in sents]
        acc = sum(res) / len(sents)
        print(task_name, acc)
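The helper get_masked_sequence is not shown in this example. A minimal sketch consistent with how it is called above (tokenize both sentences, mask the first position where they differ, and return the two candidate token ids), assuming the sentences tokenize to equal-length sequences differing in exactly one position:

def get_masked_sequence(tokenizer, sent1, sent2):
    toks1 = tokenizer(sent1, add_special_tokens=False)['input_ids']
    toks2 = tokenizer(sent2, add_special_tokens=False)['input_ids']
    # First (and assumed only) position where the two sentences differ.
    masked_ix = next(i for i, (a, b) in enumerate(zip(toks1, toks2))
                     if a != b)
    masked_toks = list(toks1)
    masked_toks[masked_ix] = tokenizer.mask_token_id
    return masked_toks, masked_ix, toks1[masked_ix], toks2[masked_ix]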
Example #15
    def test_lm_generate_xlnet_base_cased(self):
        model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")
        input_ids = torch.Tensor([[
            67, 2840, 19, 18, 1484, 20, 965, 29077, 8719, 1273, 21, 45, 273,
            17, 10, 15048, 28, 27511, 21, 4185, 11, 41, 2444, 9, 32, 1025, 20,
            8719, 26, 23, 673, 966, 19, 29077, 20643, 27511, 20822, 20643, 19,
            17, 6616, 17511, 18, 8978, 20, 18, 777, 9, 19233, 1527, 17669, 19,
            24, 673, 17, 28756, 150, 12943, 4354, 153, 27, 442, 37, 45, 668,
            21, 24, 256, 20, 416, 22, 2771, 4901, 9, 12943, 4354, 153, 51, 24,
            3004, 21, 28142, 23, 65, 20, 18, 416, 34, 24, 2958, 22947, 9,
            1177, 45, 668, 3097, 13768, 23, 103, 28, 441, 148, 48, 20522, 19,
            12943, 4354, 153, 12860, 34, 18, 326, 27, 17492, 684, 21, 6709, 9,
            8585, 123, 266, 19, 12943, 4354, 153, 6872, 24, 3004, 20, 18,
            9225, 2198, 19, 12717, 103, 22, 401, 24, 6348, 9, 12943, 4354,
            153, 1068, 2768, 2286, 19, 33, 104, 19, 176, 24, 9313, 19, 20086,
            28, 45, 10292, 9, 4, 3,
        ]]).long()
        #  In 1991, the remains of Russian Tsar Nicholas II and his family
        #  (except for Alexei and Maria) are discovered.
        #  The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
        #  remainder of the story. 1883 Western Siberia,
        #  a young Grigori Rasputin is asked by his father and a group of men to perform magic.
        #  Rasputin has a vision and denounces one of the men as a horse thief. Although his
        #  father initially slaps him for making such an accusation, Rasputin watches as the
        #  man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
        #  the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
        #  with people, even a bishop, begging for his blessing.

        expected_output_ids = [
            67, 2840, 19, 18, 1484, 20, 965, 29077, 8719, 1273, 21, 45, 273,
            17, 10, 15048, 28, 27511, 21, 4185, 11, 41, 2444, 9, 32, 1025, 20,
            8719, 26, 23, 673, 966, 19, 29077, 20643, 27511, 20822, 20643, 19,
            17, 6616, 17511, 18, 8978, 20, 18, 777, 9, 19233, 1527, 17669, 19,
            24, 673, 17, 28756, 150, 12943, 4354, 153, 27, 442, 37, 45, 668,
            21, 24, 256, 20, 416, 22, 2771, 4901, 9, 12943, 4354, 153, 51, 24,
            3004, 21, 28142, 23, 65, 20, 18, 416, 34, 24, 2958, 22947, 9,
            1177, 45, 668, 3097, 13768, 23, 103, 28, 441, 148, 48, 20522, 19,
            12943, 4354, 153, 12860, 34, 18, 326, 27, 17492, 684, 21, 6709, 9,
            8585, 123, 266, 19, 12943, 4354, 153, 6872, 24, 3004, 20, 18,
            9225, 2198, 19, 12717, 103, 22, 401, 24, 6348, 9, 12943, 4354,
            153, 1068, 2768, 2286, 19, 33, 104, 19, 176, 24, 9313, 19, 20086,
            28, 45, 10292, 9, 4, 3, 1722, 19, 24, 6348, 61, 977, 176, 1772,
            33, 45, 970, 19, 4185, 19, 27, 442, 22, 2771, 4901, 25, 18, 2059,
            20, 24, 303, 1775, 691, 9, 1147, 19, 634, 19, 43, 51, 54, 6157,
            2999, 33, 4185,
        ]
        #  In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria)
        #  are discovered. The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich,
        #  narrates the remainder of the story. 1883 Western Siberia, a young Grigori Rasputin
        #  is asked by his father and a group of men to perform magic. Rasputin has a vision and
        #  denounces one of the men as a horse thief. Although his father initially slaps
        #  him for making such an accusation, Rasputin watches as the man is chased outside and beaten.
        #  Twenty years later, Rasputin sees a vision of the Virgin Mary, prompting him to become a priest.
        #  Rasputin quickly becomes famous, with people, even a bishop, begging for his blessing.
        #  1990, a priest who cannot even walk with his wife, Maria, is asked to perform magic
        #  in the presence of a local religious leader.
        #  Since, however, he has had difficulty walking with Maria

        torch.manual_seed(0)
        output_ids = model.generate(
            input_ids,
            bos_token_id=self.special_tokens["bos_token_id"],
            pad_token_id=self.special_tokens["pad_token_id"],
            eos_token_ids=self.special_tokens["eos_token_id"],
            max_length=200,
        )

        self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
Example #16
import torch
from transformers import XLNetTokenizer, XLNetLMHeadModel

import logging
logging.basicConfig(level=logging.INFO)

tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')
# We show how to set up inputs to predict a next token using a bi-directional context.
input_ids = torch.tensor(
    tokenizer.encode("Hello, my dog is very <mask>")).unsqueeze(
        0)  # We will predict the masked token
perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]),
                        dtype=torch.float)
perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
target_mapping = torch.zeros(
    (1, 1, input_ids.shape[1]),
    dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
target_mapping[
    0, 0,
    -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
next_token_logits = outputs[
    0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]

print(next_token_logits)
print(next_token_logits.shape)
predicted_index = torch.argmax(next_token_logits).item()
print(predicted_index)
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])
print(predicted_token)
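A small extension (not in the original snippet, using only the variables already in scope): rank the top-5 candidate tokens instead of taking the single argmax.

top5 = torch.topk(next_token_logits[0, 0], k=5)
for score, idx in zip(top5.values.tolist(), top5.indices.tolist()):
    print(tokenizer.convert_ids_to_tokens([idx])[0], score)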
Example #17
 def _get_masked_language_model(self):
     """
     Initializes the XLNetLMHeadModel transformer
     """
     self.mlm = XLNetLMHeadModel.from_pretrained(self.model)
     self.mlm.eval()
Example #18
        sentence_best_word_probs.append(best_word_prob)
        best_words.append(
            model_tokenizer.convert_ids_to_tokens(
                predicted_prob.argmax().item()))

    return (sentence_word_probs, sentence_best_word_probs, best_words)


######################################################
### Compute XLNet scores
######################################################

for XLNET_MODEL in tqdm(['xlnet-base-cased', 'xlnet-large-cased']):

    model_tokenizer = XLNetTokenizer.from_pretrained(XLNET_MODEL)
    model = XLNetLMHeadModel.from_pretrained(XLNET_MODEL)

    if torch.cuda.is_available():
        model = model.cuda()

    model = model.eval()

    for dial in tqdm(itertools.chain(convai1_data, convai2_data),
                     total=convai_data_len):
        utterances = dial['utterances']

        sentences_word_probs = list()
        sentences_best_word_probs = list()
        sentences_best_words = list()

        for u1, u2 in zip(utterances[:-1], utterances[1:]):
Example #19
# %%
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import ElectraTokenizer, ElectraForMaskedLM
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM
from transformers import XLNetTokenizer, XLNetLMHeadModel
import torch
import string

from transformers import BertTokenizer, BertForMaskedLM

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased').eval()

xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
xlnet_model = XLNetLMHeadModel.from_pretrained('xlnet-base-cased').eval()

xlmroberta_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained(
    'xlm-roberta-base').eval()

bart_tokenizer = BartTokenizer.from_pretrained('bart-large')
bart_model = BartForConditionalGeneration.from_pretrained('bart-large').eval()

electra_tokenizer = ElectraTokenizer.from_pretrained(
    'google/electra-small-generator')
electra_model = ElectraForMaskedLM.from_pretrained(
    'google/electra-small-generator').eval()

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-base').eval()
Example #20
    args = parser.parse_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    model_path = f'../checkpoints/xlnet_maskedlm/{args.dataset}'
    args.model_path = model_path
    if args.generate_mode == 0:
        print('construct data with masked lm.')
    elif args.generate_mode == 1:
        print('construct data with random sampling.')
    else:
        print('construct data with masked lm and random sampling.')

    try:
        # load the pre-trained model and tokenizer
        tokenizer = XLNetTokenizer.from_pretrained(args.model_path)
        model = XLNetLMHeadModel.from_pretrained(args.model_path)
        print('Initialize XLNet from checkpoint {}.'.format(args.model_path))
    except Exception:
        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
        model = XLNetLMHeadModel.from_pretrained('xlnet-base-cased')
        print('Initialize XLNet with default parameters.')
    model.eval()

    model.to('cuda')

    for mode in ['validation', 'train']:
        if mode == 'train':
            dataset_size = args.train_dataset_size
        else:
            dataset_size = args.validation_dataset_size
Example #21
    parser.add_argument('--data_start', type=float, default=0, help='start point of data in 0-1 for DUC or TAC')
    parser.add_argument('--data_end', type=float, default=1, help='end point of data in 0-1 for DUC or TAC')

    parser.add_argument('--save_freq', type=int, default=1)
    parser.add_argument('--debug', action='store_true')

    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_args()

    # XLNet models
    tokenizer = XLNetTokenizer.from_pretrained(args.xlnet_model)
    model = XLNetLMHeadModel.from_pretrained(args.xlnet_model)
    if args.gpu_parallel:
        model = nn.DataParallel(model).cuda()
    else:
        cuda_dev = torch.device('cuda:{}'.format(args.gpu_id))
        model = model.cuda(cuda_dev)
    model.train(False)

    # spaCy: used for merge noun chunks & name entities
    spacy.prefer_gpu()
    nlp = spacy.load(args.spacy_model)

    def merge_entities_and_nouns(doc, ret=True):
        assert doc.is_parsed
        with doc.retokenize() as retokenizer:
            seen_words = set()
Example #22
from transformers import XLNetTokenizer, XLNetLMHeadModel
import torch
import torch.nn.functional as F
tokenizer = XLNetTokenizer.from_pretrained('./model/spbpe')
model = XLNetLMHeadModel.from_pretrained('./model/spbpe')
model.resize_token_embeddings(len(tokenizer))

tokens = tokenizer.encode("在一件申请需要分案的情<mask>")
# We show how to set up inputs to predict a next token using a bi-directional context.
input_ids = torch.tensor(tokens).unsqueeze(0)  # We will predict the masked token
# print(input_ids)

perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token

target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)

outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
# next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
# print(next_token_logits)

predicted_index = torch.argmax(outputs[0][0]).item()
predicted_token = tokenizer.convert_ids_to_tokens(predicted_index)
print(predicted_token)



tokens = tokenizer.encode("被侵害人,是因自己的人身、财产、名<mask>")
# We show how to set up inputs to predict a next token using a bi-directional context.
input_ids = torch.tensor(tokens).unsqueeze(0)  # We will predict the masked token
Example #23
        masked_lm = None
        mode = 0
    elif args.model_name == 'XLNetLMGenerate':

        forward_lm_path = '../checkpoints/forward_xlnet/{}'.format(args.dataset)
        args.forward_lm_path = forward_lm_path

        backward_lm_path = '../checkpoints/backward_xlnet/{}'.format(args.dataset)
        args.backward_lm_path = backward_lm_path

        masked_lm_path = '../checkpoints/xlnet_maskedlm/{}'.format(args.dataset)
        args.masked_lm_path = masked_lm_path

        forward_lm_tokenizer = XLNetTokenizer.from_pretrained(forward_lm_path)
        forward_lm = XLNetLMHeadModel.from_pretrained(forward_lm_path)
        logger.logger.info('Initialize forward XLNet LM from checkpoint {}.'.format(forward_lm_path))

        # backward_lm_tokenizer = XLNetTokenizer.from_pretrained(backward_lm_path)
        backward_lm = XLNetLMHeadModel.from_pretrained(backward_lm_path)
        logger.logger.info('Initialize backward XLNet LM from checkpoint {}.'.format(backward_lm_path))

        if args.generate_candidate_method == 3:
            masked_lm = XLNetLMHeadModel.from_pretrained(masked_lm_path)
            logger.logger.info('Initialize masked XLNet LM from checkpoint {}.'.format(masked_lm_path))
        else:
            masked_lm = None
        mode = 1
    else:
        raise ValueError('wrong model type.')
Example #24
def add_arguments():
    parser = ArgumentParser()

    # basic
    parser.add_argument('--do_train', action='store_true', help="do training")
    parser.add_argument('--do_test',
                        action='store_true',
                        help="do independent test")
    parser.add_argument('--do_cond_test',
                        action='store_true',
                        help="do test for conditional generation")

    parser.add_argument('--input_file', type=str, default=None, help="")
    parser.add_argument('--dev_file', type=str, default=None, help="")
    parser.add_argument('--test_file', type=str, default=None, help="")
    parser.add_argument('--vocab_file', type=str, default=None, help="")
    parser.add_argument('--emb_file', type=str, default=None, help="")
    parser.add_argument('--output_dir', type=str, default=None, help="")
    parser.add_argument('--attention',
                        action='store_true',
                        help='whether use attention in seq2seq')
    parser.add_argument('--cls_attention', action='store_true', help="")
    parser.add_argument('--cls_attention_size', type=int, default=300, help="")

    # hyper-parameters
    parser.add_argument('--batch_size', type=int, default=32, help="")
    parser.add_argument('--num_epochs', type=int, default=5, help="")
    parser.add_argument('--learning_rate', type=float, default=0.001, help="")
    parser.add_argument('--enc_type', type=str, default='bi', help="")
    parser.add_argument('--enc_num_units', type=int, default=512, help="")
    parser.add_argument('--enc_layers', type=int, default=2, help="")
    parser.add_argument('--dec_num_units', type=int, default=512, help="")
    parser.add_argument('--dec_layers', type=int, default=2, help="")
    parser.add_argument('--epochs', type=int, default=2, help="")
    parser.add_argument("--max_gradient_norm",
                        type=float,
                        default=5.0,
                        help="Clip gradients to this norm.")
    parser.add_argument('--max_to_keep', type=int, default=5, help="")
    parser.add_argument(
        '--lowest_bound_score',
        type=float,
        default=10.0,
        help="Stop the training once achieving the lowest_bound_score")

    parser.add_argument('--beam_width', type=int, default=0, help="")
    parser.add_argument("--num_buckets",
                        type=int,
                        default=5,
                        help="Put data into similar-length buckets.")
    parser.add_argument("--max_len",
                        type=int,
                        default=50,
                        help="Lenth max of input sentences")
    parser.add_argument('--tgt_min_len',
                        type=int,
                        default=0,
                        help='Length min of target sentences')

    # training control
    parser.add_argument('--print_every_steps', type=int, default=1, help="")
    parser.add_argument('--save_every_epoch', type=int, default=1, help="")
    parser.add_argument(
        '--stop_steps',
        type=int,
        default=20000,
        help="number of steps of non-improve to terminate training")
    parser.add_argument('--total_steps',
                        type=int,
                        default=None,
                        help="total number of steps for training")
    parser.add_argument('--random_seed', type=int, default=1, help="")
    parser.add_argument('--num_gpus', type=int, default=0, help="")
    parser.add_argument('--save_checkpoints',
                        action='store_true',
                        help='Whether save models while training')

    # classification
    parser.add_argument('--classification',
                        action='store_true',
                        help="Perform classification")
    parser.add_argument('--classification_model',
                        type=str,
                        default='RNN',
                        help='')
    parser.add_argument('--output_classes',
                        type=int,
                        default=2,
                        help="number of classes for classification")
    parser.add_argument('--output_file',
                        type=str,
                        default=None,
                        help="Classification output for train set")
    parser.add_argument('--dev_output',
                        type=str,
                        default=None,
                        help="Classification output for dev set")
    parser.add_argument('--test_output',
                        type=str,
                        default=None,
                        help="Classification output for test set")
    parser.add_argument('--filter_sizes',
                        nargs='+',
                        default=[5, 3],
                        type=int,
                        help='filter sizes, only for CNN')
    parser.add_argument('--dropout_keep_prob',
                        type=float,
                        default=0.8,
                        help='dropout, only for CNN')
    parser.add_argument('--bert_config_file',
                        type=str,
                        default=None,
                        help='pretrained bert config file')
    parser.add_argument('--bert_init_chk',
                        type=str,
                        default=None,
                        help='checkpoint for pretrained Bert')

    # adversarial attack and defence
    parser.add_argument('--adv',
                        action='store_true',
                        help="Perform adversarial attack training/testing")
    parser.add_argument('--cls_enc_type', type=str, default='bi', help="")
    parser.add_argument('--cls_enc_num_units', type=int, default=256, help="")
    parser.add_argument('--cls_enc_layers', type=int, default=2, help="")
    parser.add_argument('--gumbel_softmax_temporature',
                        type=float,
                        default=0.1,
                        help="")
    parser.add_argument('--load_model_cls',
                        type=str,
                        default=None,
                        help="Path to target classification model")
    parser.add_argument('--load_model_ae',
                        type=str,
                        default=None,
                        help="Path to pretrained AE")
    parser.add_argument('--load_model',
                        type=str,
                        default=None,
                        help="Trained model for testing")
    parser.add_argument('--load_model_pos',
                        type=str,
                        default=None,
                        help="PTN attack model for testing")
    parser.add_argument('--load_model_neg',
                        type=str,
                        default=None,
                        help="NTP attack model for testing")

    # balanced attack
    parser.add_argument('--balance',
                        action='store_true',
                        help="Whether balance between pos/neg attack")
    # label smoothing
    parser.add_argument('--label_beta',
                        type=float,
                        default=None,
                        help='label smoother param, must be > 0.5')
    # use counter-fitted embedding for AE (AE embedding different from CLS embeddings)
    parser.add_argument('--ae_vocab_file',
                        type=str,
                        default=None,
                        help='Path to counter-fitted vocabulary')
    parser.add_argument('--ae_emb_file',
                        type=str,
                        default=None,
                        help='Path to counter-fitted embeddings')
    # gan auxiliary loss
    parser.add_argument('--gan',
                        action='store_true',
                        help='Whether use GAN as regularization')
    # conditional generation (1 or 0)
    parser.add_argument(
        '--target_label',
        type=int,
        default=None,
        help="Target label for conditional generation, 0 (PTN) or 1 (NTP)")
    # include defending
    parser.add_argument(
        '--defending',
        action='store_true',
        help="whether train C* for more robust classification models")
    # train defending classifier with augmented dataset
    parser.add_argument(
        '--def_train_set',
        nargs='+',
        default=[],
        type=str,
        help='Set of adversarial examples to include in adv training')
    # attack an AE model using the augmented classifier as the target classifier
    parser.add_argument(
        '--use_defending_as_target',
        action='store_true',
        help='Use the defending component as the target classifier')

    # loss control
    parser.add_argument('--at_steps',
                        type=int,
                        default=1,
                        help='Alternative steps for GAN/Defending')
    parser.add_argument('--ae_lambda',
                        type=float,
                        default=0.8,
                        help='weighting ae_loss+sent_loss v.s. adv_loss')
    parser.add_argument('--seq_lambda',
                        type=float,
                        default=1.0,
                        help='weighting ae_loss v.s. sent_loss')
    parser.add_argument('--aux_lambda',
                        type=float,
                        default=1.0,
                        help='weighting ae_loss v.s. auxiliary losses')
    parser.add_argument('--sentiment_emb_dist',
                        type=str,
                        default='avgcos',
                        help="whether involve embedding distance as aux loss")
    parser.add_argument('--loss_attention',
                        action='store_true',
                        help="whether weight emb dist")
    parser.add_argument('--loss_attention_norm',
                        action='store_true',
                        help="whether apply minimax norm to ae_loss_attention")

    # copy mechanism
    parser.add_argument('--copy',
                        action='store_true',
                        help="Whether use copy mechanism")
    parser.add_argument('--attention_copy_mask',
                        action='store_true',
                        help="Whether use attention to calculate copy mask")
    parser.add_argument('--use_stop_words',
                        action='store_true',
                        help="whether mask stop words")
    parser.add_argument(
        '--top_k_attack',
        type=int,
        default=None,
        help=
        "number of words to attack in copy mechanism, only set when args.copy is set to true."
    )
    parser.add_argument(
        '--load_copy_model',
        type=str,
        default=None,
        help="Pretrained attention layer from the bi_att model")

    # evaluation options
    parser.add_argument('--use_cache_dir',
                        type=str,
                        default=None,
                        help='cache dir for use (sem) eval')
    parser.add_argument(
        '--accept_name',
        type=str,
        default=None,
        help="model name for acceptibility scores (xlnet), only used when set")

    args = parser.parse_args()
    if args.save_checkpoints and not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    vocab_size, vocab_file = input_data.check_vocab(
        args.vocab_file,
        args.output_dir,
        check_special_token=False if
        (args.classification_model == 'BERT') else True,
        vocab_base_name='vocab.txt')
    args.vocab_file = vocab_file
    args.vocab_size = vocab_size

    if args.ae_vocab_file is not None:
        ae_vocab_size, ae_vocab_file = input_data.check_vocab(
            args.ae_vocab_file,
            args.output_dir,
            check_special_token=False if
            (args.classification_model == 'BERT') else True,
            vocab_base_name='ae_vocab.txt')
        args.ae_vocab_size = ae_vocab_size
        args.ae_vocab_file = ae_vocab_file

    args.use_model = None
    if args.use_cache_dir is not None:
        args.use_model = USE(args.use_cache_dir)

    if args.accept_name is not None:
        if args.accept_name == 'bert':
            args.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                           do_lower_case=True)
            args.acpt_model = BertForMaskedLM.from_pretrained(
                'bert-base-uncased')
        elif args.accept_name == 'xlnet':
            args.tokenizer = XLNetTokenizer.from_pretrained(
                'xlnet-large-cased')
            args.acpt_model = XLNetLMHeadModel.from_pretrained(
                'xlnet-large-cased')

        args.device = torch.device(
            'cpu') if args.num_gpus == 0 else torch.device('cuda:0')
        args.acpt_model.to(args.device)
        args.acpt_model.eval()

    return args
Example #25
print("have keypoint")
model_mask.eval()
model_fast.eval()
model_keypoint.eval()
model_mask.cuda()
model_fast.cuda()
model_keypoint.cuda()
print("Evaled all")
print("GPT2 Time")
tokenizerG = GPT2Tokenizer.from_pretrained("gpt2")
modelG = GPT2LMHeadModel.from_pretrained("gpt2")
modelG.to("cuda")
print("Done")
print("XLNet Time")
tokenizerX = XLNetTokenizer.from_pretrained("xlnet-base-cased")
modelX = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")
print("BigGan Time!")
from pytorch_pretrained_biggan import (
    BigGAN,
    one_hot_from_names,
    truncated_noise_sample,
    convert_to_images,
)

modelBG = BigGAN.from_pretrained("biggan-deep-256")

modelX.to("cuda")
print("All prep complete!")
labels = {
    int(key): value
    for (key, value) in requests.get(
Example #26
def mlm_accuracy(sentpairs):
  res = [fill_one(s1, s2) for (s1, s2) in sentpairs]
  return sum(res) / len(sentpairs)

for task_name, sents in sent_pairs.items():
  print(task_name, mlm_accuracy(sents))


# ## XLNet needs to be done differently

# In[7]:


model_name = 'xlnet-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = XLNetLMHeadModel.from_pretrained(model_name)


# In[8]:


def fill_one(sent1, sent2):
  toks1 = tokenizer(sent1, add_special_tokens=False)['input_ids']
  toks2 = tokenizer(sent2, add_special_tokens=False)['input_ids']

  masked_toks = []
  masked_ix = None
  dtok1 = None
  dtok2 = None
  for ix in range(len(toks1)):
    if toks1[ix] != toks2[ix]:
Example #27
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank
                                                    ) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name,
                                cache_dir=model_args.cache_dir)
        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
            )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(extension,
                                data_files=data_files,
                                cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            **config_kwargs)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            **config_kwargs)
    else:
        config = XLNetConfig()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name,
                                                  **tokenizer_kwargs)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, **tokenizer_kwargs)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = XLNetLMHeadModel.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        logger.info("Training new model from scratch")
        model = XLNetLMHeadModel.from_config(config)

    model.resize_token_embeddings(len(tokenizer))
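    # Resizing keeps the embedding matrix in sync with the tokenizer's
    # vocabulary size (e.g. when special tokens were added on top of the
    # pretrained checkpoint's vocabulary).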

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    if data_args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if data_args.pad_to_max_length else False

        def tokenize_function(examples):
            # Remove empty lines
            examples["text"] = [
                line for line in examples["text"]
                if len(line) > 0 and not line.isspace()
            ]
            return tokenizer(examples["text"],
                             padding=padding,
                             truncation=True,
                             max_length=max_seq_length)

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=[text_column_name],
            load_from_cache_file=not data_args.overwrite_cache,
        )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name])

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {
                k: sum(examples[k], [])
                for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
            total_length = (total_length // max_seq_length) * max_seq_length
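            # For example, with max_seq_length=512 a concatenated batch of
            # 2500 tokens yields 4 chunks of 512 tokens; the trailing
            # 452 tokens are dropped.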
            # Split by chunks of max_len.
            result = {
                k: [
                    t[i:i + max_seq_length]
                    for i in range(0, total_length, max_seq_length)
                ]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
        # might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if training_args.do_train:
        if "train" not in tokenized_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = tokenized_datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(
                range(data_args.max_train_samples))

    if training_args.do_eval:
        if "validation" not in tokenized_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = tokenized_datasets["validation"]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(
                range(data_args.max_eval_samples))

    # Data collator
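    # DataCollatorForPermutationLanguageModeling builds the XLNet-specific
    # perm_mask and target_mapping tensors for each batch: spans of up to
    # max_span_length tokens are masked, and plm_probability is the ratio of
    # a masked span's length to the context length reserved around it.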
    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer,
        plm_probability=data_args.plm_probability,
        max_span_length=data_args.max_span_length,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
        metrics = train_result.metrics

        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None else
                             len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate()

        max_eval_samples = (data_args.max_eval_samples
                            if data_args.max_eval_samples is not None else
                            len(eval_dataset))
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
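        # Perplexity is the exponential of the mean eval cross-entropy loss;
        # e.g. an eval_loss of 3.0 corresponds to exp(3.0), about 20.09.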
        perplexity = math.exp(metrics["eval_loss"])
        metrics["perplexity"] = perplexity

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    if training_args.push_to_hub:
        kwargs = {
            "finetuned_from": model_args.model_name_or_path,
            "tags": "language-modeling"
        }
        if data_args.dataset_name is not None:
            kwargs["dataset_tags"] = data_args.dataset_name
            if data_args.dataset_config_name is not None:
                kwargs["dataset_args"] = data_args.dataset_config_name
                kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
            else:
                kwargs["dataset"] = data_args.dataset_name

        trainer.push_to_hub(**kwargs)
Exemplo n.º 28
0
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. "
            "Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN,
    )

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "fasta":
            FASTA_DATASET = True

            datasets = load_dataset_fasta(data_files, data_args.max_seq_length)
        else:
            if extension == "txt":
                extension = "text"
            datasets = load_dataset(extension, data_files=data_files)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = XLNetConfig()
        logger.warning("You are instantiating a new config instance from scratch.")

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    elif model_args.model_name_or_path:
        tokenizer = XLNetTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = XLNetLMHeadModel.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = XLNetLMHeadModel.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    tokenized_datasets = dict()
    for dataset_key, dataset in datasets.items():
        # Tokenize
        encodings = tokenizer(
            dataset['sequences'],
            truncation=True,
            padding='max_length', # TODO get from args passed in
            max_length=data_args.max_seq_length,
            return_special_tokens_mask=True,
            return_token_type_ids=False,
            return_attention_mask=False
        )
        
        torch_dataset = FastaDataset(encodings)
        tokenized_datasets[dataset_key] = torch_dataset
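        # FastaDataset is defined elsewhere in this script; presumably it is a
        # minimal torch.utils.data.Dataset wrapping the tokenizer output,
        # along the lines of:
        #
        #     class FastaDataset(torch.utils.data.Dataset):
        #         def __init__(self, encodings):
        #             self.encodings = encodings
        #
        #         def __len__(self):
        #             return len(self.encodings["input_ids"])
        #
        #         def __getitem__(self, idx):
        #             return {key: torch.tensor(val[idx])
        #                     for key, val in self.encodings.items()}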


    # Data collator
    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer,
        plm_probability=data_args.plm_probability,
        max_span_length=data_args.max_span_length,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"] if training_args.do_train else None,
        eval_dataset=tokenized_datasets["validation"] if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if (model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path))
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()  # Saves the tokenizer too for easy upload

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        results["perplexity"] = perplexity

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_plm.txt")
        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in results.items():
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

    return results
Exemplo n.º 29
0
    def test_lm_generate_xlnet_base_cased(self):
        model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")
        model.to(torch_device)
        input_ids = torch.tensor(
            [[67, 2840, 19, 18, 1484, 20, 965, 29077, 8719, 1273, 21, 45, 273, 17, 10, 15048, 28,
              27511, 21, 4185, 11, 41, 2444, 9, 32, 1025, 20, 8719, 26, 23, 673, 966, 19, 29077,
              20643, 27511, 20822, 20643, 19, 17, 6616, 17511, 18, 8978, 20, 18, 777, 9, 19233,
              1527, 17669, 19, 24, 673, 17, 28756, 150, 12943, 4354, 153, 27, 442, 37, 45, 668,
              21, 24, 256, 20, 416, 22, 2771, 4901, 9, 12943, 4354, 153, 51, 24, 3004, 21, 28142,
              23, 65, 20, 18, 416, 34, 24, 2958, 22947, 9, 1177, 45, 668, 3097, 13768, 23, 103,
              28, 441, 148, 48, 20522, 19, 12943, 4354, 153, 12860, 34, 18, 326, 27, 17492, 684,
              21, 6709, 9, 8585, 123, 266, 19, 12943, 4354, 153, 6872, 24, 3004, 20, 18, 9225,
              2198, 19, 12717, 103, 22, 401, 24, 6348, 9, 12943, 4354, 153, 1068, 2768, 2286, 19,
              33, 104, 19, 176, 24, 9313, 19, 20086, 28, 45, 10292, 9, 4, 3]],
            dtype=torch.long,
            device=torch_device,
        )
        #  In 1991, the remains of Russian Tsar Nicholas II and his family
        #  (except for Alexei and Maria) are discovered.
        #  The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
        #  remainder of the story. 1883 Western Siberia,
        #  a young Grigori Rasputin is asked by his father and a group of men to perform magic.
        #  Rasputin has a vision and denounces one of the men as a horse thief. Although his
        #  father initially slaps him for making such an accusation, Rasputin watches as the
        #  man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
        #  the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
        #  with people, even a bishop, begging for his blessing.

        expected_output_ids = [
            67, 2840, 19, 18, 1484, 20, 965, 29077, 8719, 1273, 21, 45, 273, 17, 10, 15048, 28,
            27511, 21, 4185, 11, 41, 2444, 9, 32, 1025, 20, 8719, 26, 23, 673, 966, 19, 29077,
            20643, 27511, 20822, 20643, 19, 17, 6616, 17511, 18, 8978, 20, 18, 777, 9, 19233,
            1527, 17669, 19, 24, 673, 17, 28756, 150, 12943, 4354, 153, 27, 442, 37, 45, 668,
            21, 24, 256, 20, 416, 22, 2771, 4901, 9, 12943, 4354, 153, 51, 24, 3004, 21, 28142,
            23, 65, 20, 18, 416, 34, 24, 2958, 22947, 9, 1177, 45, 668, 3097, 13768, 23, 103,
            28, 441, 148, 48, 20522, 19, 12943, 4354, 153, 12860, 34, 18, 326, 27, 17492, 684,
            21, 6709, 9, 8585, 123, 266, 19, 12943, 4354, 153, 6872, 24, 3004, 20, 18, 9225,
            2198, 19, 12717, 103, 22, 401, 24, 6348, 9, 12943, 4354, 153, 1068, 2768, 2286, 19,
            33, 104, 19, 176, 24, 9313, 19, 20086, 28, 45, 10292, 9, 4, 3, 19, 12943, 4354, 153,
            27, 442, 22, 2771, 4901, 9, 69, 27, 442, 22, 2771, 24, 11335, 20, 18, 9225, 2198, 9,
            69, 27, 442, 22, 2771, 24, 11335, 20, 18, 9225, 2198, 9, 69, 27, 442, 22, 2771,
        ]
        #  In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria)
        #  are discovered. The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich,
        #  narrates the remainder of the story. 1883 Western Siberia, a young Grigori Rasputin
        #  is asked by his father and a group of men to perform magic. Rasputin has a vision and
        #  denounces one of the men as a horse thief. Although his father initially slaps
        #  him for making such an accusation, Rasputin watches as the man is chased outside and beaten.
        #  Twenty years later, Rasputin sees a vision of the Virgin Mary, prompting him to become a priest.
        #  Rasputin quickly becomes famous, with people, even a bishop, begging for his blessing.
        #  <sep><cls>, Rasputin is asked to perform magic. He is asked to perform a ritual of the Virgin Mary.
        #  He is asked to perform a ritual of the Virgin Mary. He is asked to perform

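        # Greedy decoding (do_sample=False) is deterministic, which is why the
        # exact output token IDs can be asserted below.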
        output_ids = model.generate(input_ids, max_length=200, do_sample=False)
        self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
Exemplo n.º 30
0
    parser.add_argument('--max_sentence_length',
        type=int,
        default=50,
        help='the max length of sentences for training language models.')
    parser.add_argument('--gpu', type=str, default='0')
    parser.add_argument('--dataset',
                        type=str,
                        default='one-billion-words',
                        choices=['yelp', 'amazon', 'one-billion-words'])
    args = parser.parse_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    forward_model_path = '../checkpoints/forward_xlnet/{}'.format(args.dataset)

    backward_model_path = '../checkpoints/backward_xlnet/{}'.format(
        args.dataset)

    forward_model = XLNetLMHeadModel.from_pretrained(forward_model_path)
    backward_model = XLNetLMHeadModel.from_pretrained(backward_model_path)
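    # The forward checkpoint scores text left-to-right; the backward one was
    # presumably fine-tuned on reversed text so that sentences can be scored
    # in both directions (training of these checkpoints is not shown here).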

    forward_tokenizer = XLNetTokenizer.from_pretrained(forward_model_path)
    backward_tokenizer = XLNetTokenizer.from_pretrained(backward_model_path)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("device:", device)
    forward_model = forward_model.to(device)
    backward_model = backward_model.to(device)

    forward_testset = XLNetDataset(
        args.dataset,
        "test",
        tokenizer=forward_tokenizer,
        max_sentence_length=args.max_sentence_length,