Example #1
    def test_lm_generate_xlm_mlm_en_2048(self):
        model = XLMWithLMHeadModel.from_pretrained("xlm-mlm-en-2048")
        model.to(torch_device)
        input_ids = torch.tensor([[14, 447]], dtype=torch.long, device=torch_device)  # the president
        # "the president" repeated ten times
        expected_output_ids = [14, 447, 14, 447, 14, 447, 14, 447, 14, 447, 14, 447, 14, 447, 14, 447, 14, 447, 14, 447]
        # TODO(PVP): this and other input_ids I tried for generation give pretty bad results.
        # Not sure why. Model might just not be made for auto-regressive inference
        output_ids = model.generate(input_ids, do_sample=False)
        self.assertListEqual(output_ids[0].cpu().numpy().tolist(), expected_output_ids)
Example #2
    def load_model_tokenizer(self, pretrained):
        """ Load transformer model and tokenizer for given pre-trained name 
        
        :param pretrained: pre-trained name
        :return: model, tokenizer
        """
        
        model = None
        tokenizer = None
        
        if self.method == "T5":
            if pretrained in T5_PRETRAINED_MODELS:
                model = T5ForConditionalGeneration.from_pretrained(pretrained)
                tokenizer = T5Tokenizer.from_pretrained(pretrained)
        elif self.method == "BART":
            if pretrained in BART_PRETRAINED_MODELS:
                model = BartForConditionalGeneration.from_pretrained(pretrained)
                tokenizer = BartTokenizer.from_pretrained(pretrained)
        elif self.method == "GPT-2":
            if pretrained in GPT2_PRETRAINED_MODELS:
                model = GPT2LMHeadModel.from_pretrained(pretrained)
                model.config.max_length = self.max_length
                tokenizer = GPT2Tokenizer.from_pretrained(pretrained)
        elif self.method == "XLM":
            if pretrained in XLM_PRETRAINED_MODELS:
                model = XLMWithLMHeadModel.from_pretrained(pretrained)
                model.config.max_length = self.max_length
                tokenizer = XLMTokenizer.from_pretrained(pretrained)
        else:
            pass

        return model, tokenizer
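# Hypothetical usage of load_model_tokenizer (illustrative values; the enclosing class is
# not shown in this excerpt, so `summarizer` below stands in for an instance of it that
# sets self.method and self.max_length, with the *_PRETRAINED_MODELS lists defined):
#
#     summarizer.method = "T5"            # one of "T5", "BART", "GPT-2", "XLM"
#     summarizer.max_length = 128
#     model, tokenizer = summarizer.load_model_tokenizer("t5-base")
#     assert model is not None and tokenizer is not None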
Example #3
    def test_lm_generate_xlm_mlm_en_2048(self):
        model = XLMWithLMHeadModel.from_pretrained("xlm-mlm-en-2048")
        input_ids = torch.Tensor([[1, 14, 2232, 26, 1]]).long()  # The dog is cute
        # expected: "The dog is nothing is it!!!!!!!!!!!!"
        # TODO (PVP): this sentence (and others I tried) does not make much sense,
        # there seems to be a problem with xlm language generation.
        expected_output_ids = [1, 14, 2232, 26, 1, 567, 26, 32, 149, 149, 149, 149, 149, 149, 149, 149, 149, 149, 149, 149]
        torch.manual_seed(0)

        output_ids = model.generate(input_ids)

        self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
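# Illustrative sketch (not from the tests above): the xlm-mlm-* checkpoints were trained
# with masked language modeling rather than left-to-right LM, which is the likely reason
# greedy generation degenerates in the two tests above. A minimal masked-prediction probe,
# assuming the "xlm-mlm-en-2048" checkpoint and an arbitrary prompt:
import torch
from transformers import XLMTokenizer, XLMWithLMHeadModel

tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
model = XLMWithLMHeadModel.from_pretrained("xlm-mlm-en-2048")
model.eval()

text = f"the president of the united states is {tokenizer.mask_token} ."
input_ids = torch.tensor([tokenizer.encode(text)])
mask_pos = (input_ids[0] == tokenizer.mask_token_id).nonzero()[0].item()

with torch.no_grad():
    logits = model(input_ids)[0]  # (1, seq_len, vocab_size)

predicted_id = logits[0, mask_pos].argmax().item()
print(tokenizer.convert_ids_to_tokens([predicted_id]))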
Example #4
        def create_and_check_xlm_lm_head(self, config, input_ids,
                                         token_type_ids, input_lengths,
                                         sequence_labels, token_labels,
                                         is_impossible_labels, input_mask):
            model = XLMWithLMHeadModel(config)
            model.eval()

            loss, logits = model(input_ids,
                                 token_type_ids=token_type_ids,
                                 labels=token_labels)

            result = {
                "loss": loss,
                "logits": logits,
            }

            self.parent.assertListEqual(list(result["loss"].size()), [])
            self.parent.assertListEqual(
                list(result["logits"].size()),
                [self.batch_size, self.seq_length, self.vocab_size])
    def create_and_check_xlm_lm_head(
        self,
        config,
        input_ids,
        token_type_ids,
        input_lengths,
        sequence_labels,
        token_labels,
        is_impossible_labels,
        choice_labels,
        input_mask,
    ):
        model = XLMWithLMHeadModel(config)
        model.to(torch_device)
        model.eval()

        result = model(input_ids, token_type_ids=token_type_ids, labels=token_labels)
        self.parent.assertEqual(result.loss.shape, ())
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
filename = "SUBTLEXus74286wordstextversion.txt"
vocab = get_vocab(filename, 3000)
rw_vocab = get_vocab(filename, 10000)

filename2 = "SUBTLEX-US frequency list with PoS information text version.txt"
pos_dict = get_pos_dict(filename2)
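# Note: the ModelInfo container is not defined in this excerpt. Judging only from how it
# is constructed below, a hypothetical stand-in could be a named tuple of
# (model, tokenizer, word-start marker, vocab, name); the real class may differ.
from collections import namedtuple
ModelInfo = namedtuple("ModelInfo", ["model", "tokenizer", "start_token", "vocab", "name"])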

GPT2 = ModelInfo(GPT2LMHeadModel.from_pretrained('gpt2', return_dict=True),
                 GPT2Tokenizer.from_pretrained('gpt2'), "Ġ", vocab, "GPT2")

Roberta = ModelInfo(
    RobertaForCausalLM.from_pretrained('roberta-base', return_dict=True),
    RobertaTokenizer.from_pretrained('roberta-base'), "_", vocab, "Roberta")

XLM = ModelInfo(
    XLMWithLMHeadModel.from_pretrained('xlm-mlm-xnli15-1024',
                                       return_dict=True),
    XLMTokenizer.from_pretrained('xlm-mlm-xnli15-1024'), "_", vocab, "XLM")

T5 = ModelInfo(
    T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True),
    T5Tokenizer.from_pretrained("t5-base"), "_", vocab, "T5")

Albert = ModelInfo(
    AlbertForMaskedLM.from_pretrained('albert-base-v2', return_dict=True),
    AlbertTokenizer.from_pretrained('albert-base-v2'), "_", vocab, "Albert")

TXL = ModelInfo(TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103'),
                TransfoXLTokenizer.from_pretrained('transfo-xl-wt103'), "_",
                vocab, "TXL")

if __name__ == "__main__":
Example #7
    tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case=False)
    model = RobertaForMaskedLM.from_pretrained('./multi-label_LM/multi-label_RoBerta_e10_b16', config=config)
    # 12-layer, 768-hidden, 12-heads, 125M parameters, roberta-base using the bert-base architecture

elif args.LM == 'XLM':
    from transformers import XLMConfig, XLMTokenizer, XLMWithLMHeadModel

    config = XLMConfig(vocab_size=64139,
                       emb_dim=1024,
                       max_position_embeddings=512,
                       n_heads=8,
                       n_layers=6,
                       )

    tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024', do_lower_case=False)
    model = XLMWithLMHeadModel.from_pretrained('./multi-label_LM/multi-label_XLM_e10_b16', config=config)
    # 6-layer, 1024-hidden, 8-heads
    # XLM English-French model trained on the concatenation of English and French wikipedia

else:
    print('need to define LM from Bert, RoBerta, or XLM')

print(model)

def freeze_layer_fun(freeze_layer):
    for name, param in model.named_parameters():
        if freeze_layer in name:
            print(name)
            param.requires_grad = False
        else:
            pass
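# Illustrative usage of freeze_layer_fun (the substring "embeddings" and the optimizer
# settings are example values only; assumes torch is imported in the full script):
freeze_layer_fun("embeddings")
optimizer = torch.optim.AdamW(
    [p for p in model.parameters() if p.requires_grad], lr=2e-5)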
Example #8
    tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-xnli15-1024', do_lower_case=False)
    model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-xnli15-1024', config=config)
    # 12-layer, 1024-hidden, 8-heads
    # XLM Model pre-trained with MLM on the 15 XNLI languages.
    '''
    config = XLMConfig(
        vocab_size=64139,
        emb_dim=1024,
        max_position_embeddings=512,
        n_heads=8,
        n_layers=6,
    )

    tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024',
                                             do_lower_case=False)
    model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-enfr-1024',
                                               config=config)
    #6-layer, 1024-hidden, 8-heads
    # XLM English-French model trained on the concatenation of English and French wikipedia

else:
    print('need to define LM from Bert,RoBerta,XLM')
'''
from transformers import BertTokenizerFast, BertConfig, BertForMaskedLM, AdamW

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased', do_lower_case=False)
'''

train_inputs = torch.Tensor()
train_masks = torch.Tensor()
for sent in sentences_train:
    encoded_sent = tokenizer.encode_plus(
Example #9
    train, val, test = Multi30k.splits(root='../../data',
                                       exts=('.en', '.de'),
                                       fields=(en_text, de_text))

    en_text.build_vocab(train, max_size=30000, min_freq=3)
    de_text.build_vocab(train, max_size=30000, min_freq=3)
    vocab_en = en_text.vocab
    vocab_de = de_text.vocab
    pad_idx = vocab_de.stoi['<pad>']

    train_ldr, val_ldr, test_ldr = BucketIterator.splits((train, val, test),
                                                         batch_size=5)

    # load model
    xlm = XLMWithLMHeadModel.from_pretrained('xlm-mlm-ende-1024')
    xlm.transformer.embeddings = nn.Embedding(len(vocab_en),
                                              xlm.config.emb_dim,
                                              padding_idx=pad_idx)
    xlm.pred_layer.proj = nn.Linear(xlm.config.emb_dim,
                                    len(vocab_de),
                                    bias=True)
    xlm.cuda()

    xent = nn.CrossEntropyLoss()

    batch = next(iter(train_ldr))
    src, trg = batch.src.to(0), batch.trg.to(0)

    def mt_loss(out, target):
        # only compute loss for non-padding indices
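    # Illustrative sketch: one way to "only compute loss for non-padding indices" is to
    # let CrossEntropyLoss ignore the padding id; the original mt_loss body is not shown.
    def mt_loss_sketch(out, target):
        loss_fn = nn.CrossEntropyLoss(ignore_index=pad_idx)
        return loss_fn(out.reshape(-1, out.size(-1)), target.reshape(-1))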
Example #10
import torch

from transformers import XLMTokenizer, XLMWithLMHeadModel

import logging
logging.basicConfig(level=logging.INFO)

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-17-1280')
model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-17-1280')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
outputs = model(input_ids)
lm_logits = outputs[0]  # For XLMWithLMHeadModel, the first output is the LM logits, shape (batch, seq_len, vocab_size)

print(lm_logits)
print(lm_logits.shape)
# argmax over the vocabulary at the final position gives the most likely next-token id
predicted_index = torch.argmax(lm_logits[0, -1, :]).item()
print(predicted_index)
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])
print(predicted_token)