def create_and_check_transfo_xl_for_sequence_classification(self, config, input_ids_1, input_ids_2, lm_labels):
    config.num_labels = self.num_labels
    model = TransfoXLForSequenceClassification(config)
    model.to(torch_device)
    model.eval()
    # Forward pass on a single batch of token ids; the classification head
    # should produce one logit per label for each sequence in the batch
    result = model(input_ids_1)
    self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
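# A standalone smoke test mirroring what the checker above exercises: build a
# tiny TransfoXLConfig, run dummy token ids through
# TransfoXLForSequenceClassification, and verify the logits shape. All sizes
# here (vocabulary of 64, batch of 2, sequence length 8, 4 labels) are
# illustrative assumptions, not values from the original test; note that recent
# transformers releases name the vocabulary parameter `vocab_size`, while the
# configure() script further down uses the older `n_token` name.
import torch
from transformers import TransfoXLConfig, TransfoXLForSequenceClassification

def smoke_test_sequence_classification():
    config = TransfoXLConfig(
        vocab_size=64, cutoffs=[16, 32], div_val=1,
        d_model=32, d_embed=32, n_head=4, d_head=8, d_inner=64, n_layer=2,
        num_labels=4, pad_token_id=0,
    )
    model = TransfoXLForSequenceClassification(config)
    model.eval()
    input_ids = torch.randint(1, 64, (2, 8))  # (batch_size, seq_len), no pad tokens
    with torch.no_grad():
        result = model(input_ids)
    assert result.logits.shape == (2, 4)  # one logit per label for each sequence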
import torch
from transformers import TransfoXLConfig, TransfoXLForSequenceClassification

# Note: `n_token`, `input_length` and `device` are module-level globals,
# assumed to be defined before configure() is called.


def configure(pytorch_chkp=None):
    """
    Configure and load the model

    :param pytorch_chkp: Optional parameter for a pre-trained pytorch model checkpoint
    :return: The model
    """
    # Quick switch between two preset training configurations
    config1 = True
    if config1:
        n_head = 8
        n_layer = 12
        d_inner = 2048
        dropout = 0.1
    else:
        n_head = 6
        n_layer = 8
        d_inner = 1024
        dropout = 0.3

    # Initialise the Transformer-XL configuration
    configuration = TransfoXLConfig(
        # Number of tokens in the vocabulary
        n_token=n_token,
        # Number of self-attention layers for encoder and decoder
        n_layer=n_layer,
        # Number of attention heads for each attention layer in the encoder
        n_head=n_head,
        # Dimensionality of the model's hidden states
        d_model=input_length,
        # Dimensionality of each attention head
        d_head=input_length // n_head,
        # Inner dimension of the feed-forward layer
        d_inner=d_inner,
        # Dropout probability
        dropout=dropout,
        # Dropout for attention probabilities
        dropatt=dropout,
        # Length of the retained previous hidden states (the memory)
        mem_len=input_length,
        # Dimensionality of the embeddings
        d_embed=input_length,
        # Number of tokens to predict (target length)
        tgt_len=input_length,
        # Length of the extended context
        ext_len=input_length,
        # Cutoffs for the adaptive softmax
        cutoffs=[],
        # Dividend value for adaptive input and softmax
        div_val=-1,
        # Use the same positional embeddings after clamp_len
        clamp_len=-1,
        # Whether to use the same attention length for all tokens
        same_length=False,
        # Number of samples in the sampled softmax
        sample_softmax=1,
        # Tie encoder weights to decoder weights
        tie_weight=True,
        tie_encoder_decoder=True,
        tie_word_embeddings=True,
        # Untie the relative position biases across layers
        untie_r=True,
        # Number of labels used for classification in the last layer
        num_labels=308,
        proj_share_all_but_first=False,
        # Make sure that this is greater than n_token!
        pad_token_id=309)

    # Initialise the model from the configuration
    model = TransfoXLForSequenceClassification(configuration)

    # Load a pre-trained checkpoint if it exists
    if pytorch_chkp is not None:
        model.load_state_dict(torch.load(pytorch_chkp, map_location=device))
        print("Loaded model checkpoint ", pytorch_chkp)

    # Apply dynamic quantisation to massively speed up inference during testing/generating on CPUs
    if device.type != 'cuda':
        # Suppress the ParameterList warning raised during quantisation
        import warnings
        warnings.filterwarnings(
            "ignore",
            message="Setting attributes on ParameterList is not supported."
        )
        # Only quantise the testing model, not during training.
        # Also, for some reason quantisation doesn't play well with all Nvidia GPUs
        model = torch.quantization.quantize_dynamic(model, {
            torch.nn.Linear,
            torch.nn.Softmax,
            torch.nn.Embedding,
            torch.nn.Dropout
        }, dtype=torch.qint8)

    return model.to(device)  # Move the model to the target device (cpu/gpu)
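# A minimal usage sketch for configure(); none of this appears in the original
# script. The values chosen for n_token and input_length, the checkpoint
# filename, and the __main__ guard are all illustrative assumptions.
if __name__ == "__main__":
    # configure() reads these module-level globals
    n_token = 308
    input_length = 512
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Pass pytorch_chkp='my_checkpoint.pt' (hypothetical file) to resume from a checkpoint
    model = configure()
    model.eval()

    # Sanity-check the output shape with a dummy batch
    input_ids = torch.randint(0, n_token, (1, input_length), device=device)
    with torch.no_grad():
        logits = model(input_ids).logits  # (batch_size, num_labels)
    print(logits.shape)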