def create_and_check_xlm_model(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask):
    """Instantiate an XLMModel and check that the sequence output has the
    expected (batch, seq_len, hidden) shape for every supported call form."""
    model = XLMModel(config=config)
    model.eval()
    # Exercise the forward pass with progressively fewer optional arguments;
    # only the final (bare input_ids) call's output is shape-checked.
    outputs = model(input_ids, lengths=input_lengths, langs=token_type_ids)
    outputs = model(input_ids, langs=token_type_ids)
    outputs = model(input_ids)
    seq_out = outputs[0]
    result = {"sequence_output": seq_out}
    expected_shape = [self.batch_size, self.seq_length, self.hidden_size]
    self.parent.assertListEqual(
        list(result["sequence_output"].size()), expected_shape
    )
def __init__(self, bert_config: str, requires_grad: bool = False, dropout: float = 0.1, layer_dropout: float = 0.1, combine_layers: str = "mix") -> None:
    """Build an XLMModel from a JSON config file and hand it to the base class,
    optionally freezing all of its weights."""
    # TODO: control for XLM configs
    xlm = XLMModel(XLMConfig.from_json_file(bert_config))
    # Freeze (or unfreeze) every weight according to requires_grad.
    for weight in xlm.parameters():
        weight.requires_grad = requires_grad
    super().__init__(
        xlm_model=xlm,
        layer_dropout=layer_dropout,
        combine_layers=combine_layers,
    )
    self.model = xlm
    self.dropout = dropout
def __init__(self, chunck_size=64, max_length=35, device=torch.device('cuda:0')):
    """Load the pretrained xlm-mlm-en-2048 tokenizer and model, put the model
    in eval mode, and move it to *device*.

    NOTE(review): the default device is 'cuda:0'; construction will fail on a
    CPU-only machine unless a device is passed explicitly.
    """
    super(XLMClient, self).__init__()
    # Client configuration.
    self.chunck_size = chunck_size
    self.max_length = max_length
    self.device = device
    # Pretrained components (downloaded/cached by the library).
    self.tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
    self.model = XLMModel.from_pretrained('xlm-mlm-en-2048')
    self.model.eval()
    self.model.to(self.device)
def __init__(self, pretrained_model: str, requires_grad: bool = False, dropout: float = 0.1, layer_dropout: float = 0.1, add_lang: bool = False, combine_layers: str = "mix") -> None:
    """Load a pretrained XLM encoder (with hidden states exposed), optionally
    freeze it, and pass it to the base-class constructor."""
    xlm = XLMModel.from_pretrained(
        pretrained_model,
        output_hidden_states=True,
        dropout=dropout,
        attention_dropout=dropout,
    )
    # Apply the freeze/unfreeze flag to every parameter.
    for weight in xlm.parameters():
        weight.requires_grad = requires_grad
    super().__init__(
        xlm_model=xlm,
        layer_dropout=layer_dropout,
        combine_layers=combine_layers,
        add_lang=add_lang,
    )
    self.model = xlm
    self.dropout = dropout
def test_xlm_embeddings():
    """Check XLMEmbeddings token vectors against embeddings taken directly
    from a raw XLMModel forward pass, for each pooling operation and for
    multi-layer / scalar-mix configurations.

    NOTE(review): downloads 'xlm-mlm-en-2048' — integration test, needs
    network access.
    """
    xlm_model: str = "xlm-mlm-en-2048"
    tokenizer = XLMTokenizer.from_pretrained(xlm_model)
    model = XLMModel.from_pretrained(
        pretrained_model_name_or_path=xlm_model, output_hidden_states=True
    )
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    # Reference activations: run the sentence through the raw model and keep
    # the first hidden layer (hidden_states[1]) of the only batch element.
    with torch.no_grad():
        tokens = tokenizer.tokenize("<s>" + s + "</s>")
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)
        hidden_states = model(tokens_tensor)[-1]
        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    # Subword-to-token alignment for the test sentence:
    #
    #   0       1           2          3            4         5        6         7        8      9     10        11        12         13      14
    #
    # <s> 'berlin</w>', 'and</w>', 'munich</w>', 'have</w>', 'a</w>', 'lot</w>', 'of</w>', 'pupp', 'ete', 'er</w>', 'to</w>', 'see</w>', '.</w>', '</s>
    #        |             |           |            |          |        |         |          \      |     /          |          |
    #      Berlin         and        Munich        have        a       lot       of           puppeteer              to        see
    #
    #        0             1           2            3          4        5         6               7                  8          9        10

    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        # Embed *sentence* with XLMEmbeddings under the given pooling/layer
        # configuration and return the embedded flair Sentence.
        embeddings = XLMEmbeddings(
            model=xlm_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)
        return flair_sentence

    # First subword embedding
    sentence_first_subword = embed_sentence(sentence=s, pooling_operation="first")

    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[0].embedding.tolist()

    # "puppeteer" (token 7) starts at subword index 8 ('pupp').
    puppeteer_first_subword_embedding_ref = first_layer[8].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_subword_embedding_ref == puppeteer_first_subword_embedding_actual)

    # Last subword embedding
    sentence_last_subword = embed_sentence(sentence=s, pooling_operation="last")

    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[0].embedding.tolist()

    # "puppeteer" ends at subword index 10 ('er</w>').
    puppeteer_last_subword_embedding_ref = first_layer[10].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_last_subword_embedding_ref == puppeteer_last_subword_embedding_actual)

    # First and last subword embedding
    sentence_first_last_subword = embed_sentence(
        sentence=s, pooling_operation="first_last"
    )

    first_token_embedding_ref = torch.cat([first_layer[1], first_layer[1]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[0].embedding.tolist()

    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[8], first_layer[10]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_last_subword_embedding_ref == puppeteer_first_last_subword_embedding_actual)

    # Mean of all subword embeddings
    sentence_mean_subword = embed_sentence(sentence=s, pooling_operation="mean")

    first_token_embedding_ref = calculate_mean_embedding([first_layer[1]]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[0].embedding.tolist()

    # Mean over the three "puppeteer" subwords (indices 8, 9, 10).
    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[8], first_layer[9], first_layer[10]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_mean_subword_embedding_ref == puppeteer_mean_subword_embedding_actual)

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(
        sentence="Munich", pooling_operation="first", layers="1,2,3,4"
    )

    # Four concatenated layers -> four times the base embedding width.
    ref_embedding_size = 4 * model.embeddings.embedding_dim
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(
        sentence="Berlin",
        pooling_operation="first",
        layers="1,2,3,4",
        use_scalar_mix=True,
    )

    # Scalar mix collapses the layers to a single embedding width.
    ref_embedding_size = 1 * model.embeddings.embedding_dim
    actual_embedding_size = len(sentence_mult_layers_scalar_mix.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size
def test_model_from_pretrained(self):
    """Smoke-test that the first pretrained XLM checkpoint can be loaded.

    Downloads the checkpoint into a temporary cache directory and asserts
    that a model object was constructed.
    """
    # Use a throwaway directory instead of the original hard-coded
    # "/tmp/pytorch_transformers_test/": the temp dir is always removed,
    # even when from_pretrained raises (the old shutil.rmtree was skipped
    # on failure, leaking the cache), and concurrent test runs no longer
    # collide on a shared path.
    import tempfile
    for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        with tempfile.TemporaryDirectory() as cache_dir:
            model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
        self.assertIsNotNone(model)
def test_xlm_embeddings():
    """De-sugared variant of the XLM embedding test: compares XLMEmbeddings
    token vectors against a raw XLMModel forward pass for each pooling
    operation, then checks embedding widths for multi-layer and scalar-mix
    configurations.

    NOTE(review): downloads 'xlm-mlm-en-2048' — integration test, needs
    network access.
    """
    xlm_model = 'xlm-mlm-en-2048'
    tokenizer = XLMTokenizer.from_pretrained(xlm_model)
    model = XLMModel.from_pretrained(
        pretrained_model_name_or_path=xlm_model, output_hidden_states=True
    )
    model.to(flair.device)
    model.eval()
    s = 'Berlin and Munich have a lot of puppeteer to see .'
    # Reference activations: first hidden layer (hidden_states[1]) of the
    # single batch element from the raw model.
    with torch.no_grad():
        tokens = tokenizer.tokenize((('<s>' + s) + '</s>'))
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)
        hidden_states = model(tokens_tensor)[(-1)]
        first_layer = hidden_states[1][0]
    assert (len(first_layer) == len(tokens))

    def embed_sentence(sentence: str, pooling_operation, layers: str = '1',
                       use_scalar_mix: bool = False) -> Sentence:
        # Embed *sentence* with the configured pooling/layer options.
        embeddings = XLMEmbeddings(
            pretrained_model_name_or_path=xlm_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)
        return flair_sentence

    # first-subword pooling; "puppeteer" (token 7) spans subwords 8..10,
    # so its first subword is index 8.
    sentence_first_subword = embed_sentence(sentence=s, pooling_operation='first')
    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[0].embedding.tolist()
    puppeteer_first_subword_embedding_ref = first_layer[8].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[7].embedding.tolist()
    assert (first_token_embedding_ref == first_token_embedding_actual)
    assert (puppeteer_first_subword_embedding_ref == puppeteer_first_subword_embedding_actual)

    # last-subword pooling (subword index 10 for "puppeteer")
    sentence_last_subword = embed_sentence(sentence=s, pooling_operation='last')
    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[0].embedding.tolist()
    puppeteer_last_subword_embedding_ref = first_layer[10].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[7].embedding.tolist()
    assert (first_token_embedding_ref == first_token_embedding_actual)
    assert (puppeteer_last_subword_embedding_ref == puppeteer_last_subword_embedding_actual)

    # first+last concatenation pooling
    sentence_first_last_subword = embed_sentence(
        sentence=s, pooling_operation='first_last'
    )
    first_token_embedding_ref = torch.cat([first_layer[1], first_layer[1]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[0].embedding.tolist()
    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[8], first_layer[10]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[7].embedding.tolist()
    assert (first_token_embedding_ref == first_token_embedding_actual)
    assert (puppeteer_first_last_subword_embedding_ref == puppeteer_first_last_subword_embedding_actual)

    # mean-of-subwords pooling (mean over indices 8, 9, 10 for "puppeteer")
    sentence_mean_subword = embed_sentence(sentence=s, pooling_operation='mean')
    first_token_embedding_ref = calculate_mean_embedding([first_layer[1]]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[0].embedding.tolist()
    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[8], first_layer[9], first_layer[10]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[7].embedding.tolist()
    assert (first_token_embedding_ref == first_token_embedding_actual)
    assert (puppeteer_mean_subword_embedding_ref == puppeteer_mean_subword_embedding_actual)

    # embedding width scales with the number of requested layers
    sentence_mult_layers = embed_sentence(sentence='Munich',
                                          pooling_operation='first',
                                          layers='1,2,3,4')
    ref_embedding_size = (4 * model.embeddings.embedding_dim)
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)
    assert (ref_embedding_size == actual_embedding_size)

    # scalar mix collapses the layers back to a single embedding width
    sentence_mult_layers_scalar_mix = embed_sentence(sentence='Berlin',
                                                     pooling_operation='first',
                                                     layers='1,2,3,4',
                                                     use_scalar_mix=True)
    ref_embedding_size = (1 * model.embeddings.embedding_dim)
    actual_embedding_size = len(
        sentence_mult_layers_scalar_mix.tokens[0].embedding)
    assert (ref_embedding_size == actual_embedding_size)