def main(train_epoch, batch_size, seq_length, lr, corpus_path, vocab_path,
         config_path, pretrain_model_path, output_record_path,
         model_save_path):
    seed_everything(997)
    num_train_epochs = train_epoch
    pretrain_batch_size = batch_size

    tokenizer = BertTokenizer.from_pretrained(vocab_path)
    #     train_dataset = LineByLineTextDataset(block_size=128, file_path=corpus_path, tokenizer=tokenizer)

    #     data = read_data(corpus_path, tokenizer)
    train_dataset = OppoDataset(train_file_path=corpus_path,
                                tokenizer=tokenizer,
                                maxlen=128)

    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer)

    config = XLNetConfig.from_pretrained(
        pretrained_model_name_or_path=config_path)
    #     model = XLNetForMaskedLM(config=config,name='./xlnet_model/pytorch_model.bin')
    if os.path.exists(pretrain_model_path):
        model = XLNetLMHeadModel.from_pretrained(pretrain_model_path,
                                                 config=config)
    else:
        model = XLNetLMHeadModel(config=config)


    #     data_collator = Collator(max_seq_len=seq_length, tokenizer=tokenizer, mlm_probability=0.15)

    training_args = TrainingArguments(
        output_dir=output_record_path,
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        learning_rate=lr,
        dataloader_num_workers=8,
        prediction_loss_only=True,
        fp16=True,
        fp16_backend='amp',
        per_device_train_batch_size=pretrain_batch_size,
        save_strategy='no',
        seed=997)

    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=train_dataset)

    trainer.train()
    trainer.save_model(model_save_path)
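
For reference, a minimal sketch of how this pretraining entry point might be invoked; every path and hyperparameter below is hypothetical and should be adapted to your own data:

if __name__ == '__main__':
    # Hypothetical arguments for illustration only.
    main(train_epoch=3,
         batch_size=32,
         seq_length=128,
         lr=5e-5,
         corpus_path='./data/corpus.txt',
         vocab_path='./vocab/vocab.txt',
         config_path='./config/config.json',
         pretrain_model_path='./xlnet_model',
         output_record_path='./records',
         model_save_path='./saved_model')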
Example #2
        def create_and_check_xlnet_lm_head(self, config, input_ids_1,
                                           input_ids_2, input_ids_q, perm_mask,
                                           input_mask, target_mapping,
                                           segment_ids, lm_labels,
                                           sequence_labels,
                                           is_impossible_labels):
            model = XLNetLMHeadModel(config)
            model.eval()

            loss_1, all_logits_1, mems_1 = model(input_ids_1,
                                                 token_type_ids=segment_ids,
                                                 labels=lm_labels)

            loss_2, all_logits_2, mems_2 = model(input_ids_2,
                                                 token_type_ids=segment_ids,
                                                 labels=lm_labels,
                                                 mems=mems_1)

            logits, _ = model(input_ids_q,
                              perm_mask=perm_mask,
                              target_mapping=target_mapping)

            result = {
                "loss_1": loss_1,
                "mems_1": mems_1,
                "all_logits_1": all_logits_1,
                "loss_2": loss_2,
                "mems_2": mems_2,
                "all_logits_2": all_logits_2,
            }

            self.parent.assertListEqual(list(result["loss_1"].size()), [])
            self.parent.assertListEqual(
                list(result["all_logits_1"].size()),
                [self.batch_size, self.seq_length, self.vocab_size])
            self.parent.assertListEqual(
                list(list(mem.size()) for mem in result["mems_1"]),
                [[self.seq_length, self.batch_size, self.hidden_size]] *
                self.num_hidden_layers)

            self.parent.assertListEqual(list(result["loss_2"].size()), [])
            self.parent.assertListEqual(
                list(result["all_logits_2"].size()),
                [self.batch_size, self.seq_length, self.vocab_size])
            self.parent.assertListEqual(
                list(list(mem.size()) for mem in result["mems_2"]),
                [[self.mem_len, self.batch_size, self.hidden_size]] *
                self.num_hidden_layers)
Example #3
File: xlnet.py Project: JBoRu/TextBox-1
    def __init__(self, config, dataset):
        super(XLNet, self).__init__(config, dataset)

        self.eval_generate_num = config['eval_generate_num']

        self.tokenizer = XLNetTokenizer.from_pretrained(
            'xlnet-base-cased',
            bos_token=dataset.sos_token,
            eos_token=dataset.eos_token,
            pad_token=dataset.padding_token,
            unk_token=dataset.eos_token)

        self.configuration = XLNetConfig.from_pretrained('xlnet-base-cased')

        self.decoder = XLNetLMHeadModel.from_pretrained(
            'xlnet-base-cased', config=self.configuration)
        self.decoder.resize_token_embeddings(len(self.tokenizer))

        self.sos_token = dataset.sos_token
        self.eos_token = dataset.eos_token
        self.mask_token = '<mask>'
        self.padding_token_idx = self.tokenizer.pad_token_id
        self.max_seq_length = config['max_seq_length']
        self.device = config["device"]

        self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx,
                                        reduction='none')
Example #4
File: xlnet.py Project: yunti123/nlpaug
    def __init__(self,
                 model_path='xlnet-base-cased',
                 temperature=1.0,
                 top_k=None,
                 top_p=None,
                 padding_text=None,
                 optimize=None,
                 device=None):
        super().__init__(device,
                         temperature=temperature,
                         top_k=top_k,
                         top_p=top_p,
                         optimize=optimize)
        try:
            import transformers
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                'Missing transformers library. Install transformers with `pip install transformers`'
            )

        self.model_path = model_path

        # self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        # self.model = AutoModel.from_pretrained(model_path)
        # TODO: Evaluated using mems in XLNet, but the result is quite weird.
        self.optimize['external_memory'] = 0
        self.tokenizer = XLNetTokenizer.from_pretrained(model_path)
        self.model = XLNetLMHeadModel.from_pretrained(
            model_path, mem_len=self.optimize['external_memory'])

        self.padding_text_idxes = self.tokenizer.encode(padding_text
                                                        or self.PADDING_TEXT)

        self.model.to(self.device)
        self.model.eval()
Example #5
 def register_model(self) -> None:
     """
     If the model is not registered, this method creates the model and
     places it in the model registry. If the model is already registered,
     it simply increments the model's reference count. This helps to save
     computational resources, e.g. when combining model predictions with
     embedding similarity, by not loading the same model into memory twice.
     """
     if self.model_name not in XLNetProbEstimator.loaded:
         model = XLNetLMHeadModel.from_pretrained(self.model_name)
         model.to(self.device)
         model.eval()
         tokenizer = XLNetTokenizer.from_pretrained(self.model_name)
         word2id = self._get_word2id(tokenizer)
         spiece_ids = [
             idx for word, idx in word2id.items()
             if word.startswith(self.NON_START_SYMBOL)
         ]
         all_special_ids = tokenizer.all_special_ids
         word_embeddings = model.transformer.word_embedding.weight.data.cpu().numpy()
         XLNetProbEstimator.loaded[self.model_name] = {
             "model": model,
             "tokenizer": tokenizer,
             "embeddings": word_embeddings,
             "word2id": word2id,
             "spiece_ids": spiece_ids,
             "all_special_ids": all_special_ids,
         }
         XLNetProbEstimator.loaded[self.model_name]["ref_count"] = 1
     else:
         XLNetProbEstimator.loaded[self.model_name]["ref_count"] += 1
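
The docstring above describes a simple reference-counted registry. As a standalone sketch of that pattern (names are hypothetical, not part of the original project):

loaded = {}

def register(name, build_fn):
    # Build the resource once; later registrations only bump the count.
    if name not in loaded:
        loaded[name] = {"resource": build_fn(), "ref_count": 1}
    else:
        loaded[name]["ref_count"] += 1
    return loaded[name]["resource"]

def release(name):
    # Hypothetical counterpart: drop the resource once nobody uses it.
    loaded[name]["ref_count"] -= 1
    if loaded[name]["ref_count"] == 0:
        del loaded[name]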
Example #6
def config():
    parser = ArgumentParser()
    # basic
    parser.add_argument('--file_dir',
                        type=str,
                        default=None,
                        help="data directory")
    parser.add_argument('--ids_file',
                        type=str,
                        default=None,
                        help="list of ids to eval")
    parser.add_argument('--id',
                        type=str,
                        default=None,
                        help="single setting to evaluate")
    parser.add_argument('--parsed_file', type=str, default=None, help='')
    parser.add_argument('--accept_name',
                        type=str,
                        default='xlnet',
                        help='bert or xlnet')

    args = parser.parse_args()

    model_name = 'xlnet-large-cased'
    args.tokenizer = XLNetTokenizer.from_pretrained(model_name)
    args.acpt_model = XLNetLMHeadModel.from_pretrained(model_name)

    args.device = torch.device('cuda:0')
    args.acpt_model.to(args.device)
    args.acpt_model.eval()

    return args
Example #7
    def __init__(self, args):
        super().__init__()

        self.load_model = args.load_model

        if "xlnet" in args.load_model:
            self.tokenizer = AutoTokenizer.from_pretrained(self.load_model)
            self.model = XLNetLMHeadModel.from_pretrained(
                self.load_model, mem_len=1024).to(args.device)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(self.load_model)
            config = AutoConfig.from_pretrained(self.load_model)
            config.output_hidden_states = True
            self.model = AutoModelWithLMHead.from_pretrained(
                self.load_model, config=config).to(args.device)

        hidden_size = 1024 if "large" in self.load_model or self.load_model == "gpt2-medium" else 768

        self.hidden2label = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2), nn.Sigmoid(),
            nn.Linear(hidden_size // 2, 2)).to(args.device)

        # self.hidden2label = nn.Linear(hidden_size, 2).to(args.device)
        self.dropout = torch.nn.Dropout(args.dropout)
        self.layer = args.bert_layer

        self.eval()
        self.device = args.device
        self.args = args
Example #8
    def __init__(self,
                 model_path='xlnet-base-cased',
                 temperature=1.0,
                 top_k=None,
                 top_p=None,
                 padding_text=None,
                 optimize=None,
                 device=None):
        super().__init__(device,
                         temperature=temperature,
                         top_k=top_k,
                         top_p=top_p,
                         optimize=optimize)
        self.model_path = model_path

        self.tokenizer = XLNetTokenizer.from_pretrained(model_path)
        # TODO: Evaluated using mems in XLNet, but the result is quite weird.
        self.optimize['external_memory'] = 0
        self.model = XLNetLMHeadModel.from_pretrained(
            model_path, mem_len=self.optimize['external_memory'])

        self.padding_text_idxes = self.tokenizer.encode(padding_text
                                                        or self.PADDING_TEXT)

        self.model.to(self.device)
        self.model.eval()
Example #9
def convert_xlnet_checkpoint_to_pytorch(
    tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None
):
    # Initialise PyTorch model
    config = XLNetConfig.from_json_file(bert_config_file)

    finetuning_task = finetuning_task.lower() if finetuning_task is not None else ""
    if finetuning_task in GLUE_TASKS_NUM_LABELS:
        print(f"Building PyTorch XLNetForSequenceClassification model from configuration: {config}")
        config.finetuning_task = finetuning_task
        config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task]
        model = XLNetForSequenceClassification(config)
    elif "squad" in finetuning_task:
        config.finetuning_task = finetuning_task
        model = XLNetForQuestionAnswering(config)
    else:
        model = XLNetLMHeadModel(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_xlnet(model, config, tf_checkpoint_path)

    # Save pytorch-model
    pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
    pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
    print(f"Save PyTorch model to {os.path.abspath(pytorch_weights_dump_path)}")
    torch.save(model.state_dict(), pytorch_weights_dump_path)
    print(f"Save configuration file to {os.path.abspath(pytorch_config_dump_path)}")
    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
        f.write(config.to_json_string())
Example #10
    def __init__(self, config, dataset):
        super(XLNet, self).__init__(config, dataset)

        self.pretrained_model_path = config['pretrained_model_path']
        self.tokenizer = XLNetTokenizer.from_pretrained(
            self.pretrained_model_path,
            bos_token=dataset.sos_token,
            eos_token=dataset.eos_token,
            pad_token=dataset.padding_token)

        self.sos_token = self.tokenizer.bos_token
        self.eos_token = self.tokenizer.eos_token
        self.sos_token_idx = self.tokenizer.bos_token_id
        self.eos_token_idx = self.tokenizer.eos_token_id
        self.padding_token_idx = self.tokenizer.pad_token_id

        self.configuration = XLNetConfig.from_pretrained(
            self.pretrained_model_path,
            bos_token_id=self.sos_token_idx,
            eos_token_id=self.eos_token_idx,
            pad_token_id=self.padding_token_idx)

        self.decoder = XLNetLMHeadModel.from_pretrained(
            self.pretrained_model_path, config=self.configuration)
        self.decoder.resize_token_embeddings(len(self.tokenizer))

        self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx,
                                        reduction='none')
Example #11
def main(raw_args=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name",
                        type=str,
                        required=True,
                        help="model name e.g. xlnet-tiny-chinese")
    parser.add_argument("--cache_dir",
                        type=str,
                        default=None,
                        required=False,
                        help="Directory containing pytorch model")
    parser.add_argument("--pytorch_model_path",
                        type=str,
                        required=True,
                        help="/path/to/<pytorch-model-name>.bin")
    parser.add_argument("--tf_cache_dir",
                        type=str,
                        required=True,
                        help="Directory in which to save tensorflow model")
    args = parser.parse_args(raw_args)

    # model = XLNetLMHeadModel.from_pretrained(
    #     pretrained_model_name_or_path=args.model_name,
    #     state_dict=torch.load(args.pytorch_model_path),
    #     cache_dir=args.cache_dir
    # )
    model = XLNetLMHeadModel.from_pretrained(
        pretrained_model_name_or_path=args.cache_dir)

    convert_pytorch_checkpoint_to_tf(model=model,
                                     ckpt_dir=args.tf_cache_dir,
                                     model_name=args.model_name)
Example #12
    def __init__(self,
                 vocab: Vocabulary,
                 model_name: str = "bert-base",
                 multi_choice: bool = False):
        super().__init__(vocab)
        self._model = None
        self._loss = CrossEntropyLoss()
        self.is_multi_choice = multi_choice

        if model_name.startswith('bert'):
            if self.is_multi_choice:
                self._model = BertMultiChoiceMLM.from_pretrained(model_name)
            else:
                self._model = BertForMaskedLM.from_pretrained(model_name)
        elif 'roberta' in model_name:
            if self.is_multi_choice:
                self._model = RobertaMultiChoiceMLM.from_pretrained(model_name)
            else:
                self._model = RobertaForMaskedLM.from_pretrained(model_name)

        elif 'albert' in model_name:
            self._model = AlbertForMaskedLM.from_pretrained(model_name)
        elif 'xlnet' in model_name:
            self._model = XLNetLMHeadModel.from_pretrained(model_name)
        else:
            raise ("Riquiered model is not supported.")
Example #13
def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path):
    # Initialise PyTorch model
    config = XLNetConfig.from_json_file(bert_config_file)

    model = XLNetLMHeadModel(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_xlnet(model, config, tf_checkpoint_path)

    # Save pytorch-model
    pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
    pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
    print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path)))
    torch.save(model.state_dict(), pytorch_weights_dump_path)
    print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path)))
    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
        f.write(config.to_json_string())
Example #14
    def __init__(self):
        cmd = 'echo `mecab-config --dicdir`"/mecab-ipadic-neologd"'
        path = (subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                 shell=True).communicate()[0]).decode('utf-8')
        self.m = MeCab.Tagger(f"-Owakati -d {path}")
        logger.info("mecab loaded")

        self.model_dir = "hajime9652/xlnet-japanese"
        # self.model_dir = "./backend/PyTorch"

        self.gen_model = XLNetLMHeadModel.from_pretrained(self.model_dir)
        self.gen_tokenizer = XLNetTokenizer.from_pretrained(self.model_dir)
Example #15
    def test_embedding_lm(self):
        # try original model
        lmmodel = XLNetLMHeadModel.from_pretrained('xlnet-base-cased')
        lm_outputs = lmmodel(self.input)
        # The last hidden-state is the first element of the output tuple
        last_hidden_states_lm = lm_outputs[0]

        # try our version
        embed_outs_lm = self.embed_model.lm(self.input)
        last_embedding_lm = embed_outs_lm[0]
        assert torch.all(
            torch.eq(last_embedding_lm,
                     last_hidden_states_lm)), "LM embeddings were not the same"
Example #16
    def __init__(self, model_path='xlnet-base-cased', temperature=1.0, top_k=None, top_p=None, padding_text=None,
                 device=None, return_past=False):
        super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p)
        self.model_path = model_path

        self.tokenizer = XLNetTokenizer.from_pretrained(model_path)
        self.model = XLNetLMHeadModel.from_pretrained(model_path)

        self.padding_text_idxes = self.tokenizer.encode(padding_text or self.PADDING_TEXT)

        self.model.to(self.device)
        self.model.eval()

        self.return_past = return_past
Example #17
    def create_and_check_xlnet_lm_head(
        self,
        config,
        input_ids_1,
        input_ids_2,
        input_ids_q,
        perm_mask,
        input_mask,
        target_mapping,
        segment_ids,
        lm_labels,
        sequence_labels,
        is_impossible_labels,
        token_labels,
    ):
        model = XLNetLMHeadModel(config)
        model.to(torch_device)
        model.eval()

        result1 = model(input_ids_1,
                        token_type_ids=segment_ids,
                        labels=lm_labels)

        result2 = model(input_ids_2,
                        token_type_ids=segment_ids,
                        labels=lm_labels,
                        mems=result1["mems"])

        _ = model(input_ids_q,
                  perm_mask=perm_mask,
                  target_mapping=target_mapping)

        self.parent.assertListEqual(list(result1["loss"].size()), [])
        self.parent.assertListEqual(
            list(result1["logits"].size()),
            [self.batch_size, self.seq_length, self.vocab_size],
        )
        self.parent.assertListEqual(
            list(list(mem.size()) for mem in result1["mems"]),
            [[self.seq_length, self.batch_size, self.hidden_size]] *
            self.num_hidden_layers,
        )

        self.parent.assertListEqual(list(result2["loss"].size()), [])
        self.parent.assertListEqual(
            list(result2["logits"].size()),
            [self.batch_size, self.seq_length, self.vocab_size],
        )
        self.parent.assertListEqual(
            list(list(mem.size()) for mem in result2["mems"]),
            [[self.mem_len, self.batch_size, self.hidden_size]] *
            self.num_hidden_layers,
        )
Example #18
def run_mlm_mask_accuracy(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # These do the same thing, except XLNet is a less popular model so it's not supported by
    # all AutoModel variants.
    if 'xlnet' in model_name:
        model = XLNetLMHeadModel.from_pretrained(model_name)
    else:
        model = AutoModelForMaskedLM.from_pretrained(model_name)

    # Make binary choice for a single sentence pair
    def mlm_sentence_pair(sent1, sent2):
        masked_toks, masked_ix, dtok1, dtok2 = get_masked_sequence(
            tokenizer, sent1, sent2)
        logit1 = model(torch.tensor([masked_toks])).logits[0, masked_ix, dtok1]
        logit2 = model(torch.tensor([masked_toks])).logits[0, masked_ix, dtok2]
        return bool(logit1 > logit2)

    sent_pairs = get_common_sentences()
    for task_name, sents in sent_pairs.items():
        res = [mlm_sentence_pair(s1, s2) for (s1, s2) in sents]
        acc = sum(res) / len(sents)
        print(task_name, acc)
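
A hedged usage sketch: assuming the helpers get_masked_sequence and get_common_sentences are defined elsewhere in this project, the accuracy run could be kicked off like this:

if __name__ == '__main__':
    # Model names are examples; any masked-LM-style checkpoint should work.
    run_mlm_mask_accuracy('xlnet-base-cased')
    run_mlm_mask_accuracy('bert-base-uncased')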
Example #19
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Detecting last checkpoint.
    last_checkpoint = None
    if (os.path.isdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank)
                    else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantee that only one local process can concurrently
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(data_args.dataset_name,
                                data_args.dataset_config_name,
                                cache_dir=model_args.cache_dir)
        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
            )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(extension,
                                data_files=data_files,
                                cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config_kwargs = {
        "cache_dir": model_args.cache_dir,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name,
                                            **config_kwargs)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path,
                                            **config_kwargs)
    else:
        config = XLNetConfig()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    tokenizer_kwargs = {
        "cache_dir": model_args.cache_dir,
        "use_fast": model_args.use_fast_tokenizer,
        "revision": model_args.model_revision,
        "use_auth_token": True if model_args.use_auth_token else None,
    }
    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name,
                                                  **tokenizer_kwargs)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, **tokenizer_kwargs)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script. "
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.model_name_or_path:
        model = XLNetLMHeadModel.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
            revision=model_args.model_revision,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        logger.info("Training new model from scratch")
        model = XLNetLMHeadModel.from_config(config)

    model.resize_token_embeddings(len(tokenizer))

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    if data_args.line_by_line:
        # When using line_by_line, we just tokenize each nonempty line.
        padding = "max_length" if data_args.pad_to_max_length else False

        def tokenize_function(examples):
            # Remove empty lines
            examples["text"] = [
                line for line in examples["text"]
                if len(line) > 0 and not line.isspace()
            ]
            return tokenizer(examples["text"],
                             padding=padding,
                             truncation=True,
                             max_length=max_seq_length)

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=[text_column_name],
            load_from_cache_file=not data_args.overwrite_cache,
        )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name])

        tokenized_datasets = datasets.map(
            tokenize_function,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            remove_columns=column_names,
            load_from_cache_file=not data_args.overwrite_cache,
        )

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
        def group_texts(examples):
            # Concatenate all texts.
            concatenated_examples = {
                k: sum(examples[k], [])
                for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
            total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [
                    t[i:i + max_seq_length]
                    for i in range(0, total_length, max_seq_length)
                ]
                for k, t in concatenated_examples.items()
            }
            return result

        # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
        # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
        # might be slower to preprocess.
        #
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

        tokenized_datasets = tokenized_datasets.map(
            group_texts,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    if training_args.do_train:
        if "train" not in tokenized_datasets:
            raise ValueError("--do_train requires a train dataset")
        train_dataset = tokenized_datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(
                range(data_args.max_train_samples))

    if training_args.do_eval:
        if "validation" not in tokenized_datasets:
            raise ValueError("--do_eval requires a validation dataset")
        eval_dataset = tokenized_datasets["validation"]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(
                range(data_args.max_eval_samples))

    # Data collator
    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer,
        plm_probability=data_args.plm_probability,
        max_span_length=data_args.max_span_length,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        trainer.save_model()  # Saves the tokenizer too for easy upload
        metrics = train_result.metrics

        max_train_samples = (data_args.max_train_samples
                             if data_args.max_train_samples is not None else
                             len(train_dataset))
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        metrics = trainer.evaluate()

        max_eval_samples = (data_args.max_eval_samples
                            if data_args.max_eval_samples is not None else
                            len(eval_dataset))
        metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
        perplexity = math.exp(metrics["eval_loss"])
        metrics["perplexity"] = perplexity

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    if training_args.push_to_hub:
        kwargs = {
            "finetuned_from": model_args.model_name_or_path,
            "tags": "language-modeling"
        }
        if data_args.dataset_name is not None:
            kwargs["dataset_tags"] = data_args.dataset_name
            if data_args.dataset_config_name is not None:
                kwargs["dataset_args"] = data_args.dataset_config_name
                kwargs["dataset"] = (
                    f"{data_args.dataset_name} {data_args.dataset_config_name}")
            else:
                kwargs["dataset"] = data_args.dataset_name

        trainer.push_to_hub(**kwargs)
Example #20
        default=50,
        help='the max length of sentences for training language models.')
    parser.add_argument('--gpu', type=str, default='0')
    parser.add_argument('--dataset',
                        type=str,
                        default='one-billion-words',
                        choices=['yelp', 'amazon', 'one-billion-words'])
    args = parser.parse_args()
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

    forward_model_path = '../checkpoints/forward_xlnet/{}'.format(args.dataset)

    backward_model_path = '../checkpoints/backward_xlnet/{}'.format(
        args.dataset)

    forward_model = XLNetLMHeadModel.from_pretrained(forward_model_path)
    backward_model = XLNetLMHeadModel.from_pretrained(backward_model_path)

    forward_tokenizer = XLNetTokenizer.from_pretrained(forward_model_path)
    backward_tokenizer = XLNetTokenizer.from_pretrained(backward_model_path)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("device:", device)
    forward_model = forward_model.to(device)
    backward_model = backward_model.to(device)

    forward_testset = XLNetDataset(
        args.dataset,
        "test",
        tokenizer=forward_tokenizer,
        max_sentence_length=args.max_sentence_length,
Example #21
File: main.py Project: NLPCode/MCMCXLNet
        masked_lm = None
        mode = 0
    elif args.model_name == 'XLNetLMGenerate':

        forward_lm_path = '../checkpoints/forward_xlnet/{}'.format(args.dataset)
        args.forward_lm_path = forward_lm_path

        backward_lm_path = '../checkpoints/backward_xlnet/{}'.format(args.dataset)
        args.backward_lm_path = backward_lm_path

        masked_lm_path = '../checkpoints/xlnet_maskedlm/{}'.format(args.dataset)
        args.masked_lm_path = masked_lm_path

        forward_lm_tokenizer = XLNetTokenizer.from_pretrained(forward_lm_path)
        forward_lm = XLNetLMHeadModel.from_pretrained(forward_lm_path)
        logger.logger.info('Initialize forward XLNet LM from checkpoint {}.'.format(forward_lm_path))

        # backward_lm_tokenizer = XLNetTokenizer.from_pretrained(backward_lm_path)
        backward_lm = XLNetLMHeadModel.from_pretrained(backward_lm_path)
        logger.logger.info('Initialize backward XLNet LM from checkpoint {}.'.format(backward_lm_path))

        if args.generate_candidate_method == 3:
            masked_lm = XLNetLMHeadModel.from_pretrained(masked_lm_path)
            logger.logger.info('Initialize masked XLNet LM from checkpoint {}.'.format(masked_lm_path))
        else:
            masked_lm = None
        mode = 1
    else:
        raise ValueError('wrong model type.')
Example #22
 def _get_masked_language_model(self):
     """
     Initializes the XLNetLMHeadModel transformer
     """
     self.mlm = XLNetLMHeadModel.from_pretrained(self.model)
     self.mlm.eval()
Example #23
def convert_pytorch_checkpoint_to_tf(model: XLNetLMHeadModel, ckpt_dir: str,
                                     model_name: str):
    """
    :param model:XLNetLMHeadModel Pytorch model instance to be converted
    :param ckpt_dir: Tensorflow model directory
    :param model_name: model name
    :return:

    Currently supported HF models:
        XLNetLMHeadModel
    """

    tensors_to_transpose = ("dense.weight", "attention.self.query",
                            "attention.self.key", "attention.self.value")

    var_map = (('layer.', 'layer_'), ('word_embeddings.weight',
                                      'word_embeddings'),
               ('position_embeddings.weight', 'position_embeddings'),
               ('token_type_embeddings.weight', 'token_type_embeddings'),
               ('.', '/'), ('LayerNorm/weight', 'LayerNorm/gamma'),
               ('LayerNorm/bias', 'LayerNorm/beta'), ('weight', 'kernel'))

    if not os.path.isdir(ckpt_dir):
        os.makedirs(ckpt_dir)

    state_dict = model.state_dict()

    def to_tf_var_name(name: str):
        for patt, repl in iter(var_map):
            name = name.replace(patt, repl)
        return 'xlnet/{}'.format(name)

    def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session):
        tf_dtype = tf.dtypes.as_dtype(tensor.dtype)
        tf_var = tf.get_variable(dtype=tf_dtype,
                                 shape=tensor.shape,
                                 name=name,
                                 initializer=tf.zeros_initializer())
        session.run(tf.variables_initializer([tf_var]))
        session.run(tf_var)
        return tf_var

    tf.reset_default_graph()
    with tf.Session() as session:
        for var_name in state_dict:
            tf_name = to_tf_var_name(var_name)
            torch_tensor = state_dict[var_name].numpy()
            if any([x in var_name for x in tensors_to_transpose]):
                torch_tensor = torch_tensor.T
            tf_var = create_tf_var(tensor=torch_tensor,
                                   name=tf_name,
                                   session=session)
            tf.keras.backend.set_value(tf_var, torch_tensor)
            tf_weight = session.run(tf_var)
            print("Successfully created {}: {}".format(
                tf_name, np.allclose(tf_weight, torch_tensor)))

        saver = tf.train.Saver(tf.trainable_variables())
        saver.save(
            session,
            os.path.join(ckpt_dir,
                         model_name.replace("-", "_") + ".ckpt"))
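
A usage sketch consistent with Example #11 above; the checkpoint directory is hypothetical, and the TF1-style session APIs used by the converter must be available:

model = XLNetLMHeadModel.from_pretrained('xlnet-base-cased')
convert_pytorch_checkpoint_to_tf(model=model,
                                 ckpt_dir='./tf_ckpt',
                                 model_name='xlnet-base-cased')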
Example #24
import torch
from transformers import XLNetTokenizer, XLNetLMHeadModel

import logging
logging.basicConfig(level=logging.INFO)

tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')
# We show how to set up inputs to predict a next token using a bi-directional context.
input_ids = torch.tensor(
    tokenizer.encode("Hello, my dog is very <mask>")).unsqueeze(0)  # We will predict the masked token
perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]),
                        dtype=torch.float)
perm_mask[:, :, -1] = 1.0  # Previous tokens don't see the last token
# Shape [1, 1, seq_length] => let's predict one token
target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)
# Our first (and only) prediction will be the last token of the sequence (the masked token)
target_mapping[0, 0, -1] = 1.0
outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
# Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
next_token_logits = outputs[0]

print(next_token_logits)
print(next_token_logits.shape)
predicted_index = torch.argmax(next_token_logits).item()
print(predicted_index)
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])
print(predicted_token)
Example #25
print("have keypoint")
model_mask.eval()
model_fast.eval()
model_keypoint.eval()
model_mask.cuda()
model_fast.cuda()
model_keypoint.cuda()
print("Evaled all")
print("GPT2 Time")
tokenizerG = GPT2Tokenizer.from_pretrained("gpt2")
modelG = GPT2LMHeadModel.from_pretrained("gpt2")
modelG.to("cuda")
print("Done")
print("XLNet Time")
tokenizerX = XLNetTokenizer.from_pretrained("xlnet-base-cased")
modelX = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")
print("BigGan Time!")
from pytorch_pretrained_biggan import (
    BigGAN,
    one_hot_from_names,
    truncated_noise_sample,
    convert_to_images,
)

modelBG = BigGAN.from_pretrained("biggan-deep-256")

modelX.to("cuda")
print("All prep complete!")
labels = {
    int(key): value
    for (key, value) in requests.get(
Example #26
# %%
from transformers import RobertaTokenizer, RobertaForMaskedLM
from transformers import ElectraTokenizer, ElectraForMaskedLM
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import XLMRobertaTokenizer, XLMRobertaForMaskedLM
from transformers import XLNetTokenizer, XLNetLMHeadModel
import torch
import string

from transformers import BertTokenizer, BertForMaskedLM

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertForMaskedLM.from_pretrained('bert-base-uncased').eval()

xlnet_tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
xlnet_model = XLNetLMHeadModel.from_pretrained('xlnet-base-cased').eval()

xlmroberta_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
xlmroberta_model = XLMRobertaForMaskedLM.from_pretrained(
    'xlm-roberta-base').eval()

bart_tokenizer = BartTokenizer.from_pretrained('bart-large')
bart_model = BartForConditionalGeneration.from_pretrained('bart-large').eval()

electra_tokenizer = ElectraTokenizer.from_pretrained(
    'google/electra-small-generator')
electra_model = ElectraForMaskedLM.from_pretrained(
    'google/electra-small-generator').eval()

roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = RobertaForMaskedLM.from_pretrained('roberta-base').eval()
Example #27
        sentence_best_word_probs.append(best_word_prob)
        best_words.append(
            model_tokenizer.convert_ids_to_tokens(
                predicted_prob.argmax().item()))

    return (sentence_word_probs, sentence_best_word_probs, best_words)


######################################################
### Compute XLNet scores
######################################################

for XLNET_MODEL in tqdm(['xlnet-base-cased', 'xlnet-large-cased']):

    model_tokenizer = XLNetTokenizer.from_pretrained(XLNET_MODEL)
    model = XLNetLMHeadModel.from_pretrained(XLNET_MODEL)

    if torch.cuda.is_available():
        model = model.cuda()

    model = model.eval()

    for dial in tqdm(itertools.chain(convai1_data, convai2_data),
                     total=convai_data_len):
        utterances = dial['utterances']

        sentences_word_probs = list()
        sentences_best_word_probs = list()
        sentences_best_words = list()

        for u1, u2 in zip(utterances[:-1], utterances[1:]):
Example #28
    parser.add_argument('--data_start', type=float, default=0, help='start point of data in 0-1 for DUC or TAC')
    parser.add_argument('--data_end', type=float, default=1, help='end point of data in 0-1 for DUC or TAC')

    parser.add_argument('--save_freq', type=int, default=1)
    parser.add_argument('--debug', action='store_true')

    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = parse_args()

    # XLNet models
    tokenizer = XLNetTokenizer.from_pretrained(args.xlnet_model)
    model = XLNetLMHeadModel.from_pretrained(args.xlnet_model)
    if args.gpu_parallel:
        model = nn.DataParallel(model).cuda()
    else:
        cuda_dev = torch.device('cuda:{}'.format(args.gpu_id))
        model = model.cuda(cuda_dev)
    model.train(False)

    # spaCy: used for merge noun chunks & name entities
    spacy.prefer_gpu()
    nlp = spacy.load(args.spacy_model)

    def merge_entities_and_nouns(doc, ret=True):
        assert doc.is_parsed
        with doc.retokenize() as retokenizer:
            seen_words = set()
Example #29
    def test_lm_generate_xlnet_base_cased(self):
        model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")
        model.to(torch_device)
        input_ids = torch.tensor(
            [[67, 2840, 19, 18, 1484, 20, 965, 29077, 8719, 1273, 21, 45, 273, 17, 10, 15048, 28, 27511, 21, 4185, 11,
              41, 2444, 9, 32, 1025, 20, 8719, 26, 23, 673, 966, 19, 29077, 20643, 27511, 20822, 20643, 19, 17, 6616,
              17511, 18, 8978, 20, 18, 777, 9, 19233, 1527, 17669, 19, 24, 673, 17, 28756, 150, 12943, 4354, 153, 27,
              442, 37, 45, 668, 21, 24, 256, 20, 416, 22, 2771, 4901, 9, 12943, 4354, 153, 51, 24, 3004, 21, 28142,
              23, 65, 20, 18, 416, 34, 24, 2958, 22947, 9, 1177, 45, 668, 3097, 13768, 23, 103, 28, 441, 148, 48,
              20522, 19, 12943, 4354, 153, 12860, 34, 18, 326, 27, 17492, 684, 21, 6709, 9, 8585, 123, 266, 19, 12943,
              4354, 153, 6872, 24, 3004, 20, 18, 9225, 2198, 19, 12717, 103, 22, 401, 24, 6348, 9, 12943, 4354, 153,
              1068, 2768, 2286, 19, 33, 104, 19, 176, 24, 9313, 19, 20086, 28, 45, 10292, 9, 4, 3]],
            dtype=torch.long,
            device=torch_device,
        )
        #  In 1991, the remains of Russian Tsar Nicholas II and his family
        #  (except for Alexei and Maria) are discovered.
        #  The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
        #  remainder of the story. 1883 Western Siberia,
        #  a young Grigori Rasputin is asked by his father and a group of men to perform magic.
        #  Rasputin has a vision and denounces one of the men as a horse thief. Although his
        #  father initially slaps him for making such an accusation, Rasputin watches as the
        #  man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
        #  the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
        #  with people, even a bishop, begging for his blessing.

        expected_output_ids = [
            67, 2840, 19, 18, 1484, 20, 965, 29077, 8719, 1273, 21, 45, 273, 17, 10, 15048, 28, 27511, 21, 4185, 11,
            41, 2444, 9, 32, 1025, 20, 8719, 26, 23, 673, 966, 19, 29077, 20643, 27511, 20822, 20643, 19, 17, 6616,
            17511, 18, 8978, 20, 18, 777, 9, 19233, 1527, 17669, 19, 24, 673, 17, 28756, 150, 12943, 4354, 153, 27,
            442, 37, 45, 668, 21, 24, 256, 20, 416, 22, 2771, 4901, 9, 12943, 4354, 153, 51, 24, 3004, 21, 28142,
            23, 65, 20, 18, 416, 34, 24, 2958, 22947, 9, 1177, 45, 668, 3097, 13768, 23, 103, 28, 441, 148, 48,
            20522, 19, 12943, 4354, 153, 12860, 34, 18, 326, 27, 17492, 684, 21, 6709, 9, 8585, 123, 266, 19, 12943,
            4354, 153, 6872, 24, 3004, 20, 18, 9225, 2198, 19, 12717, 103, 22, 401, 24, 6348, 9, 12943, 4354, 153,
            1068, 2768, 2286, 19, 33, 104, 19, 176, 24, 9313, 19, 20086, 28, 45, 10292, 9, 4, 3, 19, 12943, 4354,
            153, 27, 442, 22, 2771, 4901, 9, 69, 27, 442, 22, 2771, 24, 11335, 20, 18, 9225, 2198, 9, 69, 27, 442,
            22, 2771, 24, 11335, 20, 18, 9225, 2198, 9, 69, 27, 442, 22, 2771,
        ]
        #  In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria)
        #  are discovered. The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich,
        #  narrates the remainder of the story. 1883 Western Siberia, a young Grigori Rasputin
        #  is asked by his father and a group of men to perform magic. Rasputin has a vision and
        #  denounces one of the men as a horse thief. Although his father initially slaps
        #  him for making such an accusation, Rasputin watches as the man is chased outside and beaten.
        #  Twenty years later, Rasputin sees a vision of the Virgin Mary, prompting him to become a priest.
        #  Rasputin quickly becomes famous, with people, even a bishop, begging for his blessing.
        #  <sep><cls>, Rasputin is asked to perform magic. He is asked to perform a ritual of the Virgin Mary.
        #  He is asked to perform a ritual of the Virgin Mary. He is asked to perform

        output_ids = model.generate(input_ids, max_length=200, do_sample=False)
        self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
Example #30
def main():
    from transformers import XLNetConfig

    config = XLNetConfig(
        vocab_size=21_128,
        d_model=768,
        n_head=12,
        n_layer=6,
    )

    from transformers import XLNetTokenizer

    tokenizer = XLNetTokenizer.from_pretrained("./model/spbpe", max_len=512)

    from transformers import XLNetLMHeadModel

    model = XLNetLMHeadModel(config=config)
    model.resize_token_embeddings(len(tokenizer))
    print(model.num_parameters())

    from transformers import LineByLineTextDataset

    dataset = LineByLineTextDataset(
        tokenizer=tokenizer,
        file_path="./data/data_train.csv",
        block_size=128,
    )

    max_seq_length = 512

    from transformers import DataCollatorForPermutationLanguageModeling

    data_collator = DataCollatorForPermutationLanguageModeling(
        tokenizer=tokenizer, plm_probability=1.0 / 6, max_span_length=5)

    from transformers import Trainer, TrainingArguments

    training_args = TrainingArguments(
        output_dir="./model/xlnet_v1",
        overwrite_output_dir=True,
        num_train_epochs=5,
        per_gpu_train_batch_size=32,
        save_steps=10_000,
        save_total_limit=2,
        tpu_num_cores=8,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
        prediction_loss_only=True,
    )

    trainer.train()

    if trainer.is_world_master():
        trainer.save_model("./model/spbpe")

    print('FIN')