Example #1
def get_config(args, vocabulary):
    if args.model_name == 'transfo-xl-wt103':
        config = TransfoXLConfig(vocab_size_or_config_json_file=args.vocab_size, cutoffs=[20000, 40000, 200000],
                                       d_model=512, d_embed=512, n_head=8, d_head=64, n_layer=12, d_inner=2048)
    else:
        config = AutoConfig.from_pretrained(args.model_name)

        config.vocab_size = args.vocab_size  # len(vocabulary.word2index.keys())
        config.n_positions = args.seq_size
      
    return config
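A minimal usage sketch for the helper above, assuming an argparse namespace that carries `model_name`, `vocab_size` and `seq_size`; the `vocabulary` argument is only read in the commented-out expression, so `None` is passed here:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model_name', default='transfo-xl-wt103')
parser.add_argument('--vocab_size', type=int, default=267735)
parser.add_argument('--seq_size', type=int, default=128)
args = parser.parse_args([])

config = get_config(args, vocabulary=None)
print(config.d_model, config.n_layer)  # 512 12 for the Transformer-XL branch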
Example #2
def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
                                             transfo_xl_config_file,
                                             pytorch_dump_folder_path,
                                             transfo_xl_dataset_file):
    if transfo_xl_dataset_file:
        # Convert a pre-processed corpus (see original TensorFlow repo)
        with open(transfo_xl_dataset_file, "rb") as fp:
            corpus = pickle.load(fp, encoding="latin1")
        # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term)
        pytorch_vocab_dump_path = pytorch_dump_folder_path + "/" + VOCAB_FILES_NAMES[
            "pretrained_vocab_file"]
        print(f"Save vocabulary to {pytorch_vocab_dump_path}")
        corpus_vocab_dict = corpus.vocab.__dict__
        torch.save(corpus_vocab_dict, pytorch_vocab_dump_path)

        corpus_dict_no_vocab = corpus.__dict__
        corpus_dict_no_vocab.pop("vocab", None)
        pytorch_dataset_dump_path = pytorch_dump_folder_path + "/" + CORPUS_NAME
        print(f"Save dataset to {pytorch_dataset_dump_path}")
        torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path)

    if tf_checkpoint_path:
        # Convert a pre-trained TensorFlow model
        config_path = os.path.abspath(transfo_xl_config_file)
        tf_path = os.path.abspath(tf_checkpoint_path)

        print(
            f"Converting Transformer XL checkpoint from {tf_path} with config at {config_path}."
        )
        # Initialise PyTorch model
        if transfo_xl_config_file == "":
            config = TransfoXLConfig()
        else:
            config = TransfoXLConfig.from_json_file(transfo_xl_config_file)
        print(f"Building PyTorch model from configuration: {config}")
        model = TransfoXLLMHeadModel(config)

        model = load_tf_weights_in_transfo_xl(model, config, tf_path)
        # Save pytorch-model
        pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path,
                                                 WEIGHTS_NAME)
        pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path,
                                                CONFIG_NAME)
        print(
            f"Save PyTorch model to {os.path.abspath(pytorch_weights_dump_path)}"
        )
        torch.save(model.state_dict(), pytorch_weights_dump_path)
        print(
            f"Save configuration file to {os.path.abspath(pytorch_config_dump_path)}"
        )
        with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
            f.write(config.to_json_string())
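A hedged sketch of a command-line wrapper around the conversion function above; the flag names are illustrative, not necessarily those of the original script:

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--tf_checkpoint_path", default="", type=str,
                        help="Optional path to the TensorFlow checkpoint to convert.")
    parser.add_argument("--transfo_xl_config_file", default="", type=str,
                        help="Optional JSON configuration of the pre-trained model.")
    parser.add_argument("--pytorch_dump_folder_path", required=True, type=str,
                        help="Folder that will receive the PyTorch dump.")
    parser.add_argument("--transfo_xl_dataset_file", default="", type=str,
                        help="Optional pickled corpus to convert alongside the weights.")
    args = parser.parse_args()

    convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path,
                                             args.transfo_xl_config_file,
                                             args.pytorch_dump_folder_path,
                                             args.transfo_xl_dataset_file)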
Example #3
    def prepare_config_and_inputs(self):
        input_ids_1 = ids_tensor([self.batch_size, self.seq_length],
                                 self.vocab_size)
        input_ids_2 = ids_tensor([self.batch_size, self.seq_length],
                                 self.vocab_size)

        lm_labels = None
        if self.use_labels:
            lm_labels = ids_tensor([self.batch_size, self.seq_length],
                                   self.vocab_size)

        config = TransfoXLConfig(
            vocab_size=self.vocab_size,
            mem_len=self.mem_len,
            clamp_len=self.clamp_len,
            cutoffs=self.cutoffs,
            d_model=self.hidden_size,
            d_embed=self.d_embed,
            n_head=self.num_attention_heads,
            d_head=self.d_head,
            d_inner=self.d_inner,
            div_val=self.div_val,
            n_layer=self.num_hidden_layers,
            eos_token_id=self.eos_token_id,
            return_dict=True,
        )

        return (config, input_ids_1, input_ids_2, lm_labels)
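A hedged sketch of how the prepared config and inputs might be consumed; `tester` stands in for the helper object whose fields appear above, and the memory hand-off between the two segments is the Transformer-XL pattern the two input batches suggest:

import torch
from transformers import TransfoXLLMHeadModel

config, input_ids_1, input_ids_2, lm_labels = tester.prepare_config_and_inputs()
model = TransfoXLLMHeadModel(config)
model.eval()

with torch.no_grad():
    outputs_1 = model(input_ids_1)                       # first segment, no memory yet
    outputs_2 = model(input_ids_2, mems=outputs_1.mems)  # second segment reuses the memories
print(outputs_2.prediction_scores.shape)  # (batch_size, seq_length, vocab_size)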
Example #4
    def __init__(self):
        super(Model, self).__init__()
        self.config = TransfoXLConfig(vocab_size_or_config_json_file='../model_configs/classification_XL_configuration.json')
        self.config.vocab_size = 204098
        self.config.output_attentions = True
        self.model = TransfoXLModel(self.config)
        self.out_layer = torch.nn.Linear(self.model.d_model, 2)
Example #5
    def __init__(self):
        super(Model, self).__init__()
        self.config = TransfoXLConfig(
            vocab_size_or_config_json_file=len(vocab) + 267735,
            n_head=8,    # TransfoXLConfig expects n_head/n_layer, not n_heads/n_layers
            n_layer=9)
        self.model = TransfoXLModel(self.config)
        self.tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
        self.out_layer = torch.nn.Linear(self.model.d_model, 2)
Example #6
    def __init__(self, ntoken, noutputs, d_model, nhead, d_ffn, nlayers, dropout=0.5, use_embedding=False):
        super(TransformerXLModel, self).__init__()
        self.config = TransfoXLConfig(vocab_size=ntoken, cutoffs=[],
                                      d_model=d_model, d_embed=d_model,
                                      n_head=nhead, d_inner=d_ffn,
                                      n_layer=nlayers, tie_weight=False,
                                      d_head=d_model // nhead, adaptive=False,
                                      dropout=dropout)

        self.transformer_encoder = TransfoXLModel(self.config)
        self.decoder = nn.Linear(d_model, noutputs)
        self.sigmoid = nn.Sigmoid()
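The snippet above only shows the constructor; a hedged sketch of a matching forward pass, indexing the model outputs so it works whether the library returns a tuple or a ModelOutput:

    def forward(self, src, mems=None):
        # TransfoXLModel returns (last_hidden_state, mems, ...); hidden: (batch, seq_len, d_model)
        outputs = self.transformer_encoder(src, mems=mems)
        hidden, new_mems = outputs[0], outputs[1]
        logits = self.decoder(hidden)          # (batch, seq_len, noutputs)
        return self.sigmoid(logits), new_mems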
Example #7
    def get_config(self):
        return TransfoXLConfig(
            vocab_size=self.vocab_size,
            mem_len=self.mem_len,
            clamp_len=self.clamp_len,
            cutoffs=self.cutoffs,
            d_model=self.hidden_size,
            d_embed=self.d_embed,
            n_head=self.num_attention_heads,
            d_head=self.d_head,
            d_inner=self.d_inner,
            div_val=self.div_val,
            n_layer=self.num_hidden_layers,
            eos_token_id=self.eos_token_id,
            pad_token_id=self.pad_token_id,
        )
Example #8
def get_bert_config(bert_model_type, output_hidden_states=False):
    if bert_model_type in [
            'bert-base-uncased', 'prod-bert-base-uncased', 'bert-base-cased',
            'bert-large-uncased', 'tune_bert-base-uncased_nsp',
            'bert-large-uncased-whole-word-masking',
            'bert-large-uncased-whole-word-masking-finetuned-squad'
    ]:
        bert_config = BertConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'roberta-base', 'prod-roberta-base-cased', 'roberta-large',
            'roberta-large-mnli', 'distilroberta-base'
    ]:
        bert_config = RobertaConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['xlnet-base-cased']:
        bert_config = XLNetConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1',
            'albert-xxlarge-v1'
    ]:
        bert_config = AlbertConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['gpt2', 'gpt2-medium']:
        bert_config = GPT2Config.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['transfo-xl']:
        bert_config = TransfoXLConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'distilbert-base-uncased',
            'distilbert-base-uncased-distilled-squad'
    ]:
        bert_config = DistilBertConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    else:
        raise ValueError(
            f'`bert_model_type` not understood: {bert_model_type}')

    bert_config.output_hidden_states = output_hidden_states
    return bert_config
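A hedged sketch of the `BERT_CONFIG_FILE` lookup the function above relies on; the paths are placeholders, and `from_pretrained` is assumed to point at local config files or hub names:

BERT_CONFIG_FILE = {
    'bert-base-uncased': 'configs/bert-base-uncased-config.json',
    'roberta-base': 'configs/roberta-base-config.json',
    'xlnet-base-cased': 'configs/xlnet-base-cased-config.json',
    'gpt2': 'configs/gpt2-config.json',
    'transfo-xl': 'configs/transfo-xl-wt103-config.json',
    # ... one entry per supported bert_model_type
}

config = get_bert_config('transfo-xl', output_hidden_states=True)
print(type(config).__name__)  # TransfoXLConfig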
Example #9
    def __init__(self, embed_size, encoder_output_size, hidden_size, n_head,
                 n_layers, mem_len, vocab, config):
        super(DecoderStory, self).__init__()

        vocab_size = len(vocab)
        self.vocab = vocab

        self.xl_config = TransfoXLConfig(vocab_size=vocab_size,
                                         d_model=hidden_size,
                                         d_embed=embed_size,
                                         n_head=n_head,
                                         div_val=1,
                                         n_layer=n_layers,
                                         tgt_len=50,
                                         mem_len=mem_len,
                                         adaptive=False)

        self.embed_size = embed_size
        self.encoder_output_size = encoder_output_size
        self.hidden_size = hidden_size
        self.mem_len = mem_len
        self.n_layers = n_layers
        self.padding_len = mem_len - 1

        self.fuse_linear = nn.Linear(hidden_size + encoder_output_size,
                                     hidden_size)
        self.classifier = nn.Linear(hidden_size, vocab_size)
        self.dropout = nn.Dropout(p=0.5)
        self.transformer_xl = TransfoXLModel(self.xl_config)
        self.softmax = nn.Softmax(0)

        # define start vector for a sentence
        self.start_vec = torch.zeros([1, vocab_size], dtype=torch.float32)
        self.start_vec[0][1] = 10000
        if torch.cuda.is_available():
            self.start_vec = self.start_vec.cuda()
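A hedged sketch (not from the original class) of how `self.transformer_xl` can carry its memory across consecutive sentence segments; `decoder` is an instance of DecoderStory, `sentence_segments` is a hypothetical iterable of (batch, seq_len) token tensors, and the `fuse_linear` step that mixes in the encoder output is skipped:

mems = None
for segment_ids in sentence_segments:
    outputs = decoder.transformer_xl(segment_ids, mems=mems)
    hidden, mems = outputs[0], outputs[1]        # hidden: (batch, seq_len, d_model)
    logits = decoder.classifier(decoder.dropout(hidden))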
Example #10
tie_projs += [True] * len(cutoffs)

model_config_base = {
    'dropout': 0.1,
    'dropatt': 0.0,
    'tie_weight': False,
    'div_val': 1,
    'pre_lnorm': True,
    'cutoffs': cutoffs,
    'clamp_len': 400,
}

from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel, TransfoXLConfig

# Initializing a Transformer XL configuration
configuration = TransfoXLConfig.from_dict(model_config_base)
# To match with pre-trained model
configuration.d_embed, configuration.d_head = 512, 64
configuration.d_inner, configuration.d_model = 2048, 512
configuration.mem_len, configuration.n_head = 192, 8
configuration.n_layer, configuration.tgt_len = 16, 192
configuration.vocab_size = 32000
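# The from_pretrained call below reads ckpt['model_state'], but the snippet does not
# show where `ckpt` comes from; a minimal sketch with a hypothetical checkpoint path:
import torch

ckpt = torch.load('checkpoints/transfo_xl_base.pt', map_location='cpu')
assert 'model_state' in ckpt  # the state_dict argument below expects this key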

model = TransfoXLLMHeadModel.from_pretrained(
    pretrained_model_name_or_path=None,
    state_dict=ckpt['model_state'],
    config=configuration)

from transformers import PreTrainedTokenizer
from utils.tokenization_sentencepiece import FullTokenizer
from collections import Counter, OrderedDict
Example #11
from transformers import TransfoXLConfig, TransfoXLModel, TransfoXLTokenizer
import pandas as pd
import torch
import numpy as np
import scipy.spatial.distance as distance

# 18-layer, 1024-hidden, 16-heads, 257M parameters.
# English model trained on wikitext-103
# Initializing a Transformer XL configuration

configuration = TransfoXLConfig.from_pretrained("transfo-xl-wt103")
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
# Initializing a model from the configuration
model = TransfoXLModel.from_pretrained("transfo-xl-wt103",
                                       config=configuration)

## extract the features
dataset = pd.read_csv('./data/data.csv')[:10]
print(dataset.shape)
pages = dataset['desp'].values.tolist()
print("the dataset is:\n", pages)

saved_features = []
for val in pages:
    input_ids = torch.tensor(tokenizer.encode(
        val, add_special_tokens=True)).unsqueeze(0)
    outputs = model(input_ids)
    last_hidden_states = outputs[0]  # shape: (1, sequence_length, hidden_size)
    # average the hidden states over the sequence to get the semantic content of the input
    extracted_features = torch.mean(last_hidden_states, dim=1).squeeze(0)
    saved_features.append(extracted_features.detach().numpy())
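scipy.spatial.distance is imported above but unused in the loop; a hedged follow-up sketch comparing the saved feature vectors with cosine similarity:

for i in range(len(saved_features)):
    for j in range(i + 1, len(saved_features)):
        sim = 1.0 - distance.cosine(saved_features[i], saved_features[j])
        print(f"similarity({i}, {j}) = {sim:.4f}")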
Example #12
def configure(pytorch_chkp=None):
    """
    Configure and load the model
    :param pytorch_chkp: Optional parameter for a pre-trained pytorch model checkpoint
    :return: The model
    """

    # Quick config for running different training configurations
    config1 = True
    if config1:
        n_head = 8
        n_layer = 12
        d_inner = 2048
        dropout = 0.1
    else:
        n_head = 6
        n_layer = 8
        d_inner = 1024
        dropout = 0.3

    # Initialise the Transformer XL configuration
    configuration = TransfoXLConfig(
        # Number of tokens
        n_token=n_token,
        # Number of self-attention layers for encoder and decoder
        n_layer=n_layer,
        # Number of attention heads for each attention layer in encoder
        n_head=n_head,
        # Dimensionality of the model's hidden states
        d_model=input_length,
        # Dimensionality of each attention head
        d_head=input_length // n_head,
        # Inner dimension in feed-forward layer
        d_inner=d_inner,
        # Dropout probability
        dropout=dropout,
        # Dropout for attention probabilities
        dropatt=dropout,
        # Length of the retained previous hidden states (memory)
        mem_len=input_length,
        # Dimensionality of the embeddings
        d_embed=input_length,
        # Length of the target sequence
        tgt_len=input_length,
        # Length of the extended context
        ext_len=input_length,
        # Cutoffs for the adaptive softmax
        cutoffs=[],
        # Divisor value for the adaptive input and softmax
        div_val=-1,
        # Use the same positional embeddings after clamp_len
        clamp_len=-1,
        # Whether to use the same attention length for all tokens
        same_length=False,
        # Number of samples in the sampled softmax
        sample_softmax=1,
        # Tie input and output embedding weights
        tie_weight=True,
        tie_encoder_decoder=True,
        tie_word_embeddings=True,
        # Untie the relative position biases across layers
        untie_r=True,
        # Number of labels used for classification in the last layer
        num_labels=308,
        proj_share_all_but_first=False,
        # Make sure that this is greater than n_token!
        pad_token_id=309)

    # Initialise the model from the configuration
    model = TransfoXLForSequenceClassification(configuration)

    # Load a pre-trained checkpoint if it exists
    if pytorch_chkp is not None:
        model.load_state_dict(torch.load(pytorch_chkp, map_location=device))
        print("Loaded model checkpoint ", pytorch_chkp)

        # Apply model quantisation to massively speed up inference during testing/generating on CPUs
        if device.type != 'cuda':
            # Block the warning
            import warnings
            warnings.filterwarnings(
                "ignore",
                message="Setting attributes on ParameterList is not supported."
            )
            # Only quantise testing model, not during training.
            # Also, for some reason quantisation doesn't play well with all Nvidia GPUs
            model = torch.quantization.quantize_dynamic(model, {
                torch.nn.Linear, torch.nn.Softmax, torch.nn.Embedding,
                torch.nn.Dropout
            },
                                                        dtype=torch.qint8)

    return model.to(device)  # Set model to graphics device (cpu/gpu)
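A hedged usage sketch for `configure()`; `n_token`, `input_length` and `device` are module-level globals the function assumes, so illustrative values are set here, and the output is indexed so it works for both tuple and ModelOutput returns:

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_token, input_length = 308, 512   # hypothetical vocabulary size and sequence length

model = configure()                # or configure('model_chkp.pt') to resume from a checkpoint
model.eval()

dummy_input = torch.randint(0, n_token, (1, input_length), device=device)
with torch.no_grad():
    outputs = model(dummy_input)
logits = outputs[0]                # (batch_size, num_labels)
print(logits.shape)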