def get_config(args, vocabulary):
    # Hand-tune the config for the wt103 checkpoint; otherwise defer to AutoConfig.
    # (An equality check is needed here: `in` against a string tests substrings.)
    if args.model_name == 'transfo-xl-wt103':
        config = TransfoXLConfig(
            vocab_size=args.vocab_size,
            cutoffs=[20000, 40000, 200000],
            d_model=512,
            d_embed=512,
            n_head=8,
            d_head=64,
            n_layer=12,
            d_inner=2048)
    else:
        config = AutoConfig.from_pretrained(args.model_name)
    config.vocab_size = args.vocab_size  # len(vocabulary.word2index.keys())
    config.n_positions = args.seq_size
    return config
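# A minimal sketch of driving get_config above; the Namespace fields and their
# values are assumptions that match the attributes the function reads.
from argparse import Namespace

args = Namespace(model_name='transfo-xl-wt103', vocab_size=32000, seq_size=128)
config = get_config(args, vocabulary=None)  # vocabulary only documents vocab_size here
print(config.d_model, config.n_layer)       # 512 12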
import os
import pickle

import torch

from transformers import (CONFIG_NAME, WEIGHTS_NAME, TransfoXLConfig,
                          TransfoXLLMHeadModel, load_tf_weights_in_transfo_xl)
# Import path for transformers 4.x; older releases expose these names from
# transformers.tokenization_transfo_xl instead.
from transformers.models.transfo_xl.tokenization_transfo_xl import (
    CORPUS_NAME, VOCAB_FILES_NAMES)


def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
                                             transfo_xl_config_file,
                                             pytorch_dump_folder_path,
                                             transfo_xl_dataset_file):
    if transfo_xl_dataset_file:
        # Convert a pre-processed corpus (see the original TensorFlow repo)
        with open(transfo_xl_dataset_file, "rb") as fp:
            corpus = pickle.load(fp, encoding="latin1")

        # Save vocabulary and dataset cache as dictionaries
        # (more robust than pickles for long-term storage)
        pytorch_vocab_dump_path = os.path.join(
            pytorch_dump_folder_path, VOCAB_FILES_NAMES["pretrained_vocab_file"])
        print(f"Save vocabulary to {pytorch_vocab_dump_path}")
        corpus_vocab_dict = corpus.vocab.__dict__
        torch.save(corpus_vocab_dict, pytorch_vocab_dump_path)

        corpus_dict_no_vocab = corpus.__dict__
        corpus_dict_no_vocab.pop("vocab", None)
        pytorch_dataset_dump_path = os.path.join(pytorch_dump_folder_path, CORPUS_NAME)
        print(f"Save dataset to {pytorch_dataset_dump_path}")
        torch.save(corpus_dict_no_vocab, pytorch_dataset_dump_path)

    if tf_checkpoint_path:
        # Convert a pre-trained TensorFlow model
        config_path = os.path.abspath(transfo_xl_config_file)
        tf_path = os.path.abspath(tf_checkpoint_path)
        print(f"Converting Transformer XL checkpoint from {tf_path} with config at {config_path}.")

        # Initialise the PyTorch model
        if transfo_xl_config_file == "":
            config = TransfoXLConfig()
        else:
            config = TransfoXLConfig.from_json_file(transfo_xl_config_file)
        print(f"Building PyTorch model from configuration: {config}")
        model = TransfoXLLMHeadModel(config)

        model = load_tf_weights_in_transfo_xl(model, config, tf_path)

        # Save the PyTorch model
        pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
        pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
        print(f"Save PyTorch model to {os.path.abspath(pytorch_weights_dump_path)}")
        torch.save(model.state_dict(), pytorch_weights_dump_path)
        print(f"Save configuration file to {os.path.abspath(pytorch_config_dump_path)}")
        with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
            f.write(config.to_json_string())
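# A hypothetical invocation of the converter above; the paths are examples only.
convert_transfo_xl_checkpoint_to_pytorch(
    tf_checkpoint_path="./tf_checkpoint/model.ckpt",
    transfo_xl_config_file="",   # empty string -> fall back to a default TransfoXLConfig()
    pytorch_dump_folder_path="./pytorch_dump",
    transfo_xl_dataset_file="",  # empty -> skip the corpus-conversion branch
)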
def prepare_config_and_inputs(self):
    input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
    input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

    lm_labels = None
    if self.use_labels:
        lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

    config = TransfoXLConfig(
        vocab_size=self.vocab_size,
        mem_len=self.mem_len,
        clamp_len=self.clamp_len,
        cutoffs=self.cutoffs,
        d_model=self.hidden_size,
        d_embed=self.d_embed,
        n_head=self.num_attention_heads,
        d_head=self.d_head,
        d_inner=self.d_inner,
        div_val=self.div_val,
        n_layer=self.num_hidden_layers,
        eos_token_id=self.eos_token_id,
        return_dict=True,
    )

    return (config, input_ids_1, input_ids_2, lm_labels)
def __init__(self):
    super(Model, self).__init__()
    # Load the Transformer-XL configuration from a JSON file, then override
    # individual fields before building the model.
    self.config = TransfoXLConfig.from_json_file(
        '../model_configs/classification_XL_configuration.json')
    self.config.vocab_size = 204098
    self.config.output_attentions = True
    self.model = TransfoXLModel(self.config)
    self.out_layer = torch.nn.Linear(self.model.d_model, 2)
def __init__(self):
    super(Model, self).__init__()
    # The config's field names are `n_head` and `n_layer`; misspelled keywords
    # such as `n_heads`/`n_layers` are stored but silently ignored by the model.
    self.config = TransfoXLConfig(
        vocab_size=len(vocab) + 267735,
        n_head=8,
        n_layer=9)
    self.model = TransfoXLModel(self.config)
    self.tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
    self.out_layer = torch.nn.Linear(self.model.d_model, 2)
def __init__(self, ntoken, noutputs, d_model, nhead, d_ffn, nlayers,
             dropout=0.5, use_embedding=False):
    super(TransformerXLModel, self).__init__()
    self.config = TransfoXLConfig(
        vocab_size=ntoken,
        cutoffs=[],  # no adaptive-softmax cutoffs
        d_model=d_model,
        d_embed=d_model,
        n_head=nhead,
        d_inner=d_ffn,
        n_layer=nlayers,
        tie_weights=False,
        d_head=d_model // nhead,
        adaptive=False,
        dropout=dropout)
    self.transformer_encoder = TransfoXLModel(self.config)
    self.decoder = nn.Linear(d_model, noutputs)
    self.sigmoid = nn.Sigmoid()
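# A sketch of how the wrapper above might be used for classification:
# mean-pool the encoder's hidden states and classify. The sizes and the
# pooling step are illustrative assumptions.
import torch

model = TransformerXLModel(ntoken=10000, noutputs=1, d_model=256,
                           nhead=4, d_ffn=1024, nlayers=4)
input_ids = torch.randint(0, 10000, (8, 32))      # (batch, seq_len)
hidden = model.transformer_encoder(input_ids)[0]  # (batch, seq_len, d_model)
probs = model.sigmoid(model.decoder(hidden.mean(dim=1)))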
def get_config(self):
    return TransfoXLConfig(
        vocab_size=self.vocab_size,
        mem_len=self.mem_len,
        clamp_len=self.clamp_len,
        cutoffs=self.cutoffs,
        d_model=self.hidden_size,
        d_embed=self.d_embed,
        n_head=self.num_attention_heads,
        d_head=self.d_head,
        d_inner=self.d_inner,
        div_val=self.div_val,
        n_layer=self.num_hidden_layers,
        eos_token_id=self.eos_token_id,
        pad_token_id=self.pad_token_id,
    )
def get_bert_config(bert_model_type, output_hidden_states=False):
    if bert_model_type in [
            'bert-base-uncased', 'prod-bert-base-uncased', 'bert-base-cased',
            'bert-large-uncased', 'tune_bert-base-uncased_nsp',
            'bert-large-uncased-whole-word-masking',
            'bert-large-uncased-whole-word-masking-finetuned-squad'
    ]:
        bert_config = BertConfig.from_pretrained(BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'roberta-base', 'prod-roberta-base-cased', 'roberta-large',
            'roberta-large-mnli', 'distilroberta-base'
    ]:
        bert_config = RobertaConfig.from_pretrained(BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['xlnet-base-cased']:
        bert_config = XLNetConfig.from_pretrained(BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1',
            'albert-xxlarge-v1'
    ]:
        bert_config = AlbertConfig.from_pretrained(BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['gpt2', 'gpt2-medium']:
        bert_config = GPT2Config.from_pretrained(BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['transfo-xl']:
        bert_config = TransfoXLConfig.from_pretrained(BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'distilbert-base-uncased',
            'distilbert-base-uncased-distilled-squad'
    ]:
        bert_config = DistilBertConfig.from_pretrained(BERT_CONFIG_FILE[bert_model_type])
    else:
        raise ValueError(f'`bert_model_type` not understood: {bert_model_type}')

    bert_config.output_hidden_states = output_hidden_states
    return bert_config
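# Example call against the dispatcher above, using one of the supported keys
# (BERT_CONFIG_FILE is assumed to be defined alongside the function).
roberta_config = get_bert_config('roberta-base', output_hidden_states=True)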
def __init__(self, embed_size, encoder_output_size, hidden_size, n_head,
             n_layers, mem_len, vocab, config):
    super(DecoderStory, self).__init__()

    vocab_size = len(vocab)
    self.vocab = vocab
    self.xl_config = TransfoXLConfig(
        vocab_size=vocab_size,
        d_model=hidden_size,
        d_embed=embed_size,
        n_head=n_head,
        div_val=1,
        n_layer=n_layers,
        tgt_len=50,
        mem_len=mem_len,
        adaptive=False)

    self.embed_size = embed_size
    self.encoder_output_size = encoder_output_size
    self.hidden_size = hidden_size
    self.mem_len = mem_len
    self.n_layers = n_layers
    self.padding_len = mem_len - 1

    self.fuse_linear = nn.Linear(hidden_size + encoder_output_size, hidden_size)
    self.classifier = nn.Linear(hidden_size, vocab_size)
    self.dropout = nn.Dropout(p=0.5)
    self.transformer_xl = TransfoXLModel(self.xl_config)
    self.softmax = nn.Softmax(0)

    # Define the start vector for a sentence: a large spike at index 1
    self.start_vec = torch.zeros([1, vocab_size], dtype=torch.float32)
    self.start_vec[0][1] = 10000
    if torch.cuda.is_available():
        self.start_vec = self.start_vec.cuda()
tie_projs += [True] * len(cutoffs)

model_config_base = {
    'dropout': 0.1,
    'dropatt': 0.0,
    'tie_weight': False,
    'div_val': 1,
    'pre_lnorm': True,
    'cutoffs': cutoffs,
    'clamp_len': 400,
}

from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel, TransfoXLConfig

# Initializing a Transformer XL configuration from the base dict
configuration = TransfoXLConfig.from_dict(model_config_base)

# Override dimensions to match the pre-trained model
configuration.d_embed, configuration.d_head = 512, 64
configuration.d_inner, configuration.d_model = 2048, 512
configuration.mem_len, configuration.n_head = 192, 8
configuration.n_layer, configuration.tgt_len = 16, 192
configuration.vocab_size = 32000

# Build the model directly from an existing checkpoint's state dict
model = TransfoXLLMHeadModel.from_pretrained(
    pretrained_model_name_or_path=None,
    state_dict=ckpt['model_state'],
    config=configuration)

from transformers import PreTrainedTokenizer
from utils.tokenization_sentencepiece import FullTokenizer
from collections import Counter, OrderedDict
from transformers import TransfoXLConfig, TransfoXLModel, TransfoXLTokenizer
import pandas as pd
import torch
import numpy as np
import scipy.spatial.distance as distance

# 18-layer, 1024-hidden, 16-heads, 257M parameters.
# English model trained on wikitext-103.

# Initializing a Transformer XL configuration
configuration = TransfoXLConfig.from_pretrained("transfo-xl-wt103")
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')

# Initializing a model from the configuration
model = TransfoXLModel.from_pretrained("transfo-xl-wt103", config=configuration)

# Extract the features
dataset = pd.read_csv('./data/data.csv')[:10]
print(dataset.shape)
pages = dataset['desp'].values.tolist()
print("the dataset is:\n", pages)

saved_features = []
for val in pages:
    input_ids = torch.tensor(
        tokenizer.encode(val, add_special_tokens=True)).unsqueeze(0)
    outputs = model(input_ids)
    last_hidden_states = outputs[0]  # shape: (1, sequence_length, hidden_size)
    # Average the hidden states to get the semantic content of the input
    extracted_features = torch.mean(last_hidden_states, dim=1)
    saved_features.append(extracted_features)
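# The scipy.spatial.distance import suggests the saved features are compared
# downstream; a minimal sketch of that step (the choice of which rows to
# compare is an assumption).
a = saved_features[0].squeeze(0).detach().numpy()
b = saved_features[1].squeeze(0).detach().numpy()
print("cosine distance between first two descriptions:", distance.cosine(a, b))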
def configure(pytorch_chkp=None):
    """
    Configure and load the model
    :param pytorch_chkp: Optional path to a pre-trained PyTorch model checkpoint
    :return: The model
    """
    # Quick switch for running different training configurations
    config1 = True
    if config1:
        n_head = 8
        n_layer = 12
        d_inner = 2048
        dropout = 0.1
    else:
        n_head = 6
        n_layer = 8
        d_inner = 1024
        dropout = 0.3

    # Initialise the Transformer XL configuration
    configuration = TransfoXLConfig(
        # Number of tokens
        n_token=n_token,
        # Number of self-attention layers
        n_layer=n_layer,
        # Number of attention heads in each attention layer
        n_head=n_head,
        # Dimensionality of the model's hidden states
        d_model=input_length,
        # Dimensionality of the model's heads
        d_head=input_length // n_head,
        # Inner dimension of the feed-forward layer
        d_inner=d_inner,
        # Dropout probability
        dropout=dropout,
        # Dropout on attention probabilities
        dropatt=dropout,
        # Length of the retained memory from previous segments
        mem_len=input_length,
        # Dimensionality of the embeddings
        d_embed=input_length,
        # Length of target logits for classification
        tgt_len=input_length,
        # Length of the extended context
        ext_len=input_length,
        # Cutoffs for the adaptive softmax
        cutoffs=[],
        # Divisor value for adaptive input and softmax
        div_val=-1,
        # Use the same positional embeddings after clamp_len
        clamp_len=-1,
        # Whether to use the same attention length for all tokens
        same_length=False,
        # Number of samples in the sampled softmax
        sample_softmax=1,
        # Tie encoder weights to decoder weights
        tie_weight=True,
        tie_encoder_decoder=True,
        tie_word_embeddings=True,
        # Untie the relative position biases across layers
        untie_r=True,
        # Number of labels used for classification in the last layer
        num_labels=308,
        proj_share_all_but_first=False,
        # Make sure that this is greater than n_token!
        pad_token_id=309)

    # Initialise the model from the configuration
    model = TransfoXLForSequenceClassification(configuration)

    # Load a pre-trained checkpoint if one exists
    if pytorch_chkp is not None:
        model.load_state_dict(torch.load(pytorch_chkp, map_location=device))
        print("Loaded model checkpoint ", pytorch_chkp)

    # Apply dynamic quantisation to speed up inference when testing/generating on CPUs
    if device.type != 'cuda':
        # Suppress the ParameterList warning raised by quantisation
        import warnings
        warnings.filterwarnings(
            "ignore",
            message="Setting attributes on ParameterList is not supported.")
        # Only quantise the testing model, not during training; quantisation
        # also does not play well with all Nvidia GPUs.
        model = torch.quantization.quantize_dynamic(
            model,
            {torch.nn.Linear, torch.nn.Softmax, torch.nn.Embedding, torch.nn.Dropout},
            dtype=torch.qint8)

    return model.to(device)  # Move the model to the target device (CPU/GPU)
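# A hypothetical usage of configure() above: build the classifier and score a
# batch of token ids. `n_token`, `input_length`, and `device` are assumed to
# come from the surrounding scope, exactly as configure() itself assumes.
model = configure()  # or configure(pytorch_chkp='./checkpoints/model.pt')
input_ids = torch.randint(0, n_token, (4, input_length), device=device)
logits = model(input_ids).logits  # shape: (4, num_labels)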