def __init__(self,
             vocab: str,
             pretrained_model: str,
             use_starting_offsets: bool = False,
             do_lowercase: bool = False,
             never_lowercase: List[str] = None,
             max_pieces: int = 512,
             truncate_long_sequences: bool = False) -> None:
    # Warn when the indexer's casing disagrees with the pretrained checkpoint name.
    if pretrained_model.endswith("-cased") and do_lowercase:
        logger.warning("Your XLM model appears to be cased, "
                       "but your indexer is lowercasing tokens.")
    elif pretrained_model.endswith("-uncased") and not do_lowercase:
        logger.warning("Your XLM model appears to be uncased, "
                       "but your indexer is not lowercasing tokens.")

    xlm_tokenizer = XLMTokenizer.from_pretrained(pretrained_model, do_lower_case=do_lowercase)

    super().__init__(vocab=vocab,
                     wordpiece_tokenizer=xlm_tokenizer,
                     namespace="xlm",
                     use_starting_offsets=use_starting_offsets,
                     max_pieces=max_pieces,
                     do_lowercase=do_lowercase,
                     never_lowercase=never_lowercase,
                     start_tokens=[xlm_tokenizer.cls_token],
                     end_tokens=[xlm_tokenizer.sep_token],
                     separator_token=xlm_tokenizer.sep_token,
                     truncate_long_sequences=truncate_long_sequences,
                     lang_dict=xlm_tokenizer.lang2id)
def get_tokenizer(tokenizer_name):
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    if tokenizer_name.startswith("bert-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        # TransformerXL is trained on data pretokenized with MosesTokenizer
        tokenizer = MosesTokenizer()
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name == "MosesTokenizer":
        tokenizer = MosesTokenizer()
    elif tokenizer_name == "SplitChars":
        tokenizer = SplitCharsTokenizer()
    elif tokenizer_name == "":
        tokenizer = SpaceTokenizer()
    else:
        tokenizer = None
    return tokenizer
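# --- Usage sketch (not from the original source) ---
# Illustrates how the dispatch above behaves for an XLM checkpoint name: the "xlm-"
# prefix routes to XLMTokenizer.from_pretrained. The checkpoint name and the sample
# sentence are arbitrary; the exact BPE pieces depend on the checkpoint's vocabulary.
xlm_tok = get_tokenizer("xlm-mlm-en-2048")
pieces = xlm_tok.tokenize("Berlin and Munich")
# Expect lowercased BPE pieces ending in '</w>', e.g. ['berlin</w>', 'and</w>', 'munich</w>'].
piece_ids = xlm_tok.convert_tokens_to_ids(pieces)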
def __init__(self,
             vocab: Vocabulary,
             tasks: List[str],
             pretrained_model: str,
             text_field_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             decoders: Dict[str, Model],
             post_encoder_embedder: TextFieldEmbedder = None,
             dropout: float = 0.0,
             word_dropout: float = 0.0,
             mix_embedding: int = None,
             layer_dropout: float = 0.0,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super(UdifyModel, self).__init__(vocab, regularizer)

    self.tasks = tasks
    self.vocab = vocab
    self.text_field_embedder = text_field_embedder
    self.post_encoder_embedder = post_encoder_embedder
    self.shared_encoder = encoder
    self.word_dropout = word_dropout
    self.dropout = torch.nn.Dropout(p=dropout)
    self.decoders = torch.nn.ModuleDict(decoders)

    # Pick the tokenizer that matches the pretrained encoder.
    if 'bert' in pretrained_model:
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model, do_lower_case=False)
    elif 'xlm' in pretrained_model:
        self.tokenizer = XLMTokenizer.from_pretrained(pretrained_model, do_lower_case=False)
    else:
        raise ConfigurationError(
            f"No corresponding tokenizer for pretrained model {pretrained_model}.")

    if mix_embedding:
        self.scalar_mix = torch.nn.ModuleDict({
            task: ScalarMixWithDropout(mix_embedding,
                                       do_layer_norm=False,
                                       dropout=layer_dropout)
            for task in self.decoders
        })
    else:
        self.scalar_mix = None

    self.metrics = {}

    for task in self.tasks:
        if task not in self.decoders:
            raise ConfigurationError(
                f"Task {task} has no corresponding decoder. Make sure their names match.")

    check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
                           "text field embedding dim", "encoder input dim")

    initializer(self)
    self._count_params()
def add_pytorch_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in pytorch_transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name

    if tokenizer_name.startswith("bert-"):
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transfo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place that can be simplified by the "model-before-preprocess"
    # reorganization; we can pass the tokenizer created in the model here, see issue <TBD>

    # Do not use tokenizer.vocab_size here: it does not include newly added tokens.
    vocab_size = len(tokenizer)
    if tokenizer_name.startswith("roberta-"):
        if tokenizer.convert_ids_to_tokens(vocab_size - 1) is None:
            vocab_size -= 1
        else:
            log.info("Time to delete vocab_size-1 in preprocess.py !!!")
    # Due to a quirk in huggingface's vocab file, the last token of RobertaTokenizer is None;
    # remove this workaround when they fix the problem.

    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added pytorch_transformers vocab (%s): %d tokens", tokenizer_name, len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(word, input_module_tokenizer_name(tokenizer_name))
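# --- Usage sketch (assumed setup, not from the original source) ---
# The helper above expects an AllenNLP-style Vocabulary with an add_token_to_namespace
# method; the target namespace comes from input_module_tokenizer_name, which is defined
# elsewhere in the codebase.
from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary()
add_pytorch_transformers_vocab(vocab, "xlm-mlm-en-2048")
# Afterwards the XLM BPE vocabulary lives in its own namespace, so pre-tokenized
# pieces can be indexed with the standard single-id indexers.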
def __init__(self, chunck_size=64, max_length=35, device=torch.device('cuda:0')):
    super(XLMClient, self).__init__()
    self.chunck_size = chunck_size
    self.tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
    self.max_length = max_length

    # load the model
    self.model = XLMModel.from_pretrained('xlm-mlm-en-2048')
    self.model.eval()

    self.device = device
    # move model to device
    self.model.to(self.device)
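# --- Illustrative sketch (assumptions: same checkpoint, single un-batched sentence) ---
# Shows how the tokenizer/model pair loaded above can turn one sentence into hidden
# states; XLMClient itself presumably wraps this in a batched, device-aware encode method.
import torch
from pytorch_transformers import XLMTokenizer, XLMModel  # or `transformers`, depending on the repo

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMModel.from_pretrained('xlm-mlm-en-2048')
model.eval()

tokens = tokenizer.tokenize('<s>' + 'Berlin and Munich' + '</s>')
ids = tokenizer.convert_tokens_to_ids(tokens)
with torch.no_grad():
    last_hidden = model(torch.tensor([ids]))[0]  # (1, num_pieces, hidden_dim); 2048 for this checkpoint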
def __init__(self):
    self.device = None
    self.n_gpu = None
    self.model = None
    self.processor = None
    self.evaluator = None

    self.parser = argparse.ArgumentParser()
    self.setup_args()
    self.args = self.parser.parse_args()

    self.logger = None
    self.setup_misc()

    if self.args.encoder == 'bert':
        self.tokenizer = BertTokenizer.from_pretrained(self.args.bert_mode)
    elif self.args.encoder == 'xlm':
        self.logger.warning('WE ASSUME THE TEXT IS PRETOKENIZED AND ONLY DO THE FOLLOWING:')
        self.logger.warning('1. Remove invalid characters and clean up white space')
        self.logger.warning('2. Lower case, renormalize unicode and strip accents')
        self.logger.warning('3. Split text by white space and do BPE on each "word"')
        if self.args.diff_bpe:
            if self.args.lang_prefix:
                self.tokenizer = LangPrefixXLMTokenizer.from_pretrained(
                    self.args.xlm_mode,
                    do_lowercase_and_remove_accent=self.args.do_lower_case)
            else:
                self.tokenizer = NewXLMTokenizer.from_pretrained(
                    self.args.xlm_mode,
                    do_lowercase_and_remove_accent=self.args.do_lower_case)
        else:
            self.tokenizer = XLMTokenizer.from_pretrained(
                self.args.xlm_mode,
                do_lowercase_and_remove_accent=self.args.do_lower_case)
    else:
        raise ValueError(self.args.encoder)

    self.param_optimizer, self.optimizer, self.scheduler = None, None, None
    self.global_step, self.total_step = 0, 0
    self.num_train_steps, self.warmup_linear = None, None
    self.last_eval, self.best_eval = 0, float('-inf')
def test_xlm_embeddings():
    xlm_model: str = "xlm-mlm-en-2048"

    tokenizer = XLMTokenizer.from_pretrained(xlm_model)
    model = XLMModel.from_pretrained(pretrained_model_name_or_path=xlm_model,
                                     output_hidden_states=True)
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    with torch.no_grad():
        tokens = tokenizer.tokenize("<s>" + s + "</s>")

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        hidden_states = model(tokens_tensor)[-1]

        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    #   0        1            2          3             4          5        6          7        8       9      10        11        12         13       14
    #
    # '<s>', 'berlin</w>', 'and</w>', 'munich</w>', 'have</w>', 'a</w>', 'lot</w>', 'of</w>', 'pupp', 'ete', 'er</w>', 'to</w>', 'see</w>', '.</w>', '</s>'
    #           |             |          |             |          |        |          |         \      |      /          |         |          |
    #         Berlin         and       Munich         have        a       lot         of           puppeteer             to       see         .
    #
    #           0             1          2              3         4        5          6                7                 8         9         10

    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        embeddings = XLMEmbeddings(
            model=xlm_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)

        return flair_sentence

    # First subword embedding
    sentence_first_subword = embed_sentence(sentence=s, pooling_operation="first")

    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[0].embedding.tolist()

    puppeteer_first_subword_embedding_ref = first_layer[8].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_subword_embedding_ref == puppeteer_first_subword_embedding_actual)

    # Last subword embedding
    sentence_last_subword = embed_sentence(sentence=s, pooling_operation="last")

    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[0].embedding.tolist()

    puppeteer_last_subword_embedding_ref = first_layer[10].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_last_subword_embedding_ref == puppeteer_last_subword_embedding_actual)

    # First and last subword embedding
    sentence_first_last_subword = embed_sentence(sentence=s, pooling_operation="first_last")

    first_token_embedding_ref = torch.cat([first_layer[1], first_layer[1]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[0].embedding.tolist()

    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[8], first_layer[10]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_last_subword_embedding_ref == puppeteer_first_last_subword_embedding_actual)

    # Mean of all subword embeddings
    sentence_mean_subword = embed_sentence(sentence=s, pooling_operation="mean")

    first_token_embedding_ref = calculate_mean_embedding([first_layer[1]]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[0].embedding.tolist()

    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[8], first_layer[9], first_layer[10]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_mean_subword_embedding_ref == puppeteer_mean_subword_embedding_actual)

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(sentence="Munich",
                                          pooling_operation="first",
                                          layers="1,2,3,4")

    ref_embedding_size = 4 * model.embeddings.embedding_dim
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(
        sentence="Berlin",
        pooling_operation="first",
        layers="1,2,3,4",
        use_scalar_mix=True,
    )

    ref_embedding_size = 1 * model.embeddings.embedding_dim
    actual_embedding_size = len(sentence_mult_layers_scalar_mix.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size
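# --- Helper sketch (assumption: the test imports calculate_mean_embedding from its
# surrounding test module; it is not defined in the snippet above) ---
# A minimal version consistent with how it is used: average a list of subword vectors.
import torch

def calculate_mean_embedding(subword_embeddings):
    # Stack the per-subword vectors into (num_subwords, hidden_dim) and average over subwords.
    return torch.mean(torch.stack(subword_embeddings), dim=0)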
def test_xlm_embeddings():
    xlm_model = 'xlm-mlm-en-2048'

    tokenizer = XLMTokenizer.from_pretrained(xlm_model)
    model = XLMModel.from_pretrained(pretrained_model_name_or_path=xlm_model,
                                     output_hidden_states=True)
    model.to(flair.device)
    model.eval()

    s = 'Berlin and Munich have a lot of puppeteer to see .'

    with torch.no_grad():
        tokens = tokenizer.tokenize('<s>' + s + '</s>')

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        hidden_states = model(tokens_tensor)[-1]

        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    def embed_sentence(sentence: str,
                       pooling_operation,
                       layers: str = '1',
                       use_scalar_mix: bool = False) -> Sentence:
        embeddings = XLMEmbeddings(pretrained_model_name_or_path=xlm_model,
                                   layers=layers,
                                   pooling_operation=pooling_operation,
                                   use_scalar_mix=use_scalar_mix)
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)

        return flair_sentence

    # First subword embedding
    sentence_first_subword = embed_sentence(sentence=s, pooling_operation='first')

    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[0].embedding.tolist()

    puppeteer_first_subword_embedding_ref = first_layer[8].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert puppeteer_first_subword_embedding_ref == puppeteer_first_subword_embedding_actual

    # Last subword embedding
    sentence_last_subword = embed_sentence(sentence=s, pooling_operation='last')

    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[0].embedding.tolist()

    puppeteer_last_subword_embedding_ref = first_layer[10].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert puppeteer_last_subword_embedding_ref == puppeteer_last_subword_embedding_actual

    # First and last subword embedding
    sentence_first_last_subword = embed_sentence(sentence=s, pooling_operation='first_last')

    first_token_embedding_ref = torch.cat([first_layer[1], first_layer[1]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[0].embedding.tolist()

    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[8], first_layer[10]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert puppeteer_first_last_subword_embedding_ref == puppeteer_first_last_subword_embedding_actual

    # Mean of all subword embeddings
    sentence_mean_subword = embed_sentence(sentence=s, pooling_operation='mean')

    first_token_embedding_ref = calculate_mean_embedding([first_layer[1]]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[0].embedding.tolist()

    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[8], first_layer[9], first_layer[10]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert puppeteer_mean_subword_embedding_ref == puppeteer_mean_subword_embedding_actual

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(sentence='Munich',
                                          pooling_operation='first',
                                          layers='1,2,3,4')

    ref_embedding_size = 4 * model.embeddings.embedding_dim
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(sentence='Berlin',
                                                     pooling_operation='first',
                                                     layers='1,2,3,4',
                                                     use_scalar_mix=True)

    ref_embedding_size = 1 * model.embeddings.embedding_dim
    actual_embedding_size = len(sentence_mult_layers_scalar_mix.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size