def __init__(self, auto_model: str, auto_path: str):
    """Instantiate a frozen pretrained transformer encoder and its tokenizer.

    :param auto_model: model family name, used to choose the HF classes
    :param auto_path: path or hub identifier passed to ``from_pretrained``
    """
    super().__init__()
    # Family checks are ordered so that e.g. "camembert" matches before the
    # generic "bert" substring test.
    if "camembert" in auto_model:
        from transformers import CamembertModel, CamembertTokenizer
        embeddings = CamembertModel.from_pretrained(auto_path)
        tokenizer = CamembertTokenizer.from_pretrained(auto_path)
    elif "flaubert" in auto_model:
        from transformers import XLMModel, XLMTokenizer
        embeddings = XLMModel.from_pretrained(auto_path)
        tokenizer = XLMTokenizer.from_pretrained(auto_path)
        # FlauBERT ships an XLM-style tokenizer that lowercases and strips
        # accents by default; keep the input text intact instead.
        tokenizer.do_lowercase_and_remove_accent = False
    elif "xlm" in auto_model:
        from transformers import XLMModel, XLMTokenizer
        embeddings = XLMModel.from_pretrained(auto_path)
        tokenizer = XLMTokenizer.from_pretrained(auto_path)
    elif "bert" in auto_model:
        from transformers import BertModel, BertTokenizer
        embeddings = BertModel.from_pretrained(auto_path)
        tokenizer = BertTokenizer.from_pretrained(auto_path)
    else:
        from transformers import AutoModel, AutoTokenizer, XLMTokenizer
        embeddings = AutoModel.from_pretrained(auto_path)
        tokenizer = AutoTokenizer.from_pretrained(auto_path)
        if isinstance(tokenizer, XLMTokenizer):
            tokenizer.do_lowercase_and_remove_accent = False
    self.auto_embeddings = embeddings
    self.auto_tokenizer = tokenizer
    # The backbone is used as a fixed feature extractor: freeze everything.
    for param in self.auto_embeddings.parameters():
        param.requires_grad = False
    self._is_fixed = True
    self._output_dim = self.auto_embeddings.config.hidden_size
    self._begin_special_token_count = self.get_begin_special_token_count()
    self._padding_id = self.auto_tokenizer.pad_token_id
def load_model_tokenizer(self, pretrained):
    """
    Load transformer model and tokenizer for given pre-trained name

    :param pretrained: pre-trained name
    :return: (model, tokenizer); both are None when ``self.method`` or the
        pre-trained name is not supported
    """
    model, tokenizer = None, None
    method = self.method
    if method == "T5" and pretrained in T5_PRETRAINED_MODELS:
        model = T5ForConditionalGeneration.from_pretrained(pretrained)
        tokenizer = T5Tokenizer.from_pretrained(pretrained)
    elif method == "BART" and pretrained in BART_PRETRAINED_MODELS:
        model = BartForConditionalGeneration.from_pretrained(pretrained)
        tokenizer = BartTokenizer.from_pretrained(pretrained)
    elif method == "GPT-2" and pretrained in GPT2_PRETRAINED_MODELS:
        model = GPT2LMHeadModel.from_pretrained(pretrained)
        # Cap the generation length for decoder-style models.
        model.config.max_length = self.max_length
        tokenizer = GPT2Tokenizer.from_pretrained(pretrained)
    elif method == "XLM" and pretrained in XLM_PRETRAINED_MODELS:
        model = XLMWithLMHeadModel.from_pretrained(pretrained)
        model.config.max_length = self.max_length
        tokenizer = XLMTokenizer.from_pretrained(pretrained)
    return model, tokenizer
def get_tokenizer(tokenizer_name):
    """Instantiate the tokenizer matching *tokenizer_name*.

    Unknown names yield ``None``; the empty string maps to a plain
    whitespace tokenizer.
    """
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    name = tokenizer_name
    if name.startswith("bert-"):
        # cased/uncased is encoded at the end of the model name
        return BertTokenizer.from_pretrained(
            name, do_lower_case=name.endswith("uncased"))
    if name.startswith("roberta-"):
        return RobertaTokenizer.from_pretrained(name)
    if name.startswith("albert-"):
        return AlbertTokenizer.from_pretrained(name)
    if name.startswith("xlnet-"):
        return XLNetTokenizer.from_pretrained(
            name, do_lower_case=name.endswith("uncased"))
    if name.startswith("openai-gpt"):
        return OpenAIGPTTokenizer.from_pretrained(name)
    if name.startswith("gpt2"):
        return GPT2Tokenizer.from_pretrained(name)
    if name.startswith("transfo-xl-"):
        # TransformerXL is trained on data pretokenized with MosesTokenizer
        return MosesTokenizer()
    if name.startswith("xlm-"):
        return XLMTokenizer.from_pretrained(name)
    if name == "MosesTokenizer":
        return MosesTokenizer()
    if name == "SplitChars":
        return SplitCharsTokenizer()
    if name == "":
        return SpaceTokenizer()
    return None
def get_attentions():
    """Web endpoint: run the requested model on a (source, target) sentence
    pair and return its attention maps plus the token list.

    Query parameters:
      * ``model``  -- 'XLM', 'GPT-2', or anything else for BERT (default)
      * ``source`` / ``target`` -- the two input sentences

    :return: dict with the formatted attention tensor and the tokens.
        NOTE(review): 'source' and 'target' are both set to the full joint
        token sequence of the encoded pair -- confirm this is intended.
    """
    model_name = request.args.get('model')
    source = request.args.get('source')
    target = request.args.get('target')
    if model_name == 'XLM':
        model_version = 'xlm-mlm-ende-1024'
        model = XLMModel.from_pretrained(model_version, output_attentions=True)
        tokenizer = XLMTokenizer.from_pretrained(model_version)
    elif model_name == 'GPT-2':
        model_version = 'gpt2'
        model = GPT2Model.from_pretrained(model_version, output_attentions=True)
        tokenizer = GPT2Tokenizer.from_pretrained(model_version)
    else:  # BERT fallback for any other value
        model_version = 'bert-base-uncased'
        model = BertModel.from_pretrained(model_version, output_attentions=True)
        tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=True)
    # Encode the pair as one sequence with special tokens and segment ids.
    inputs = tokenizer.encode_plus(source, target, return_tensors='pt',
                                   add_special_tokens=True)
    token_type_ids = inputs['token_type_ids']
    input_ids = inputs['input_ids']
    # The last element of the model output tuple holds the attention weights.
    attention = model(input_ids, token_type_ids=token_type_ids)[-1]
    input_id_list = input_ids[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(input_id_list)
    return {'attention': format_attention(attention)[0].tolist(),
            'source': tokens,
            'target': tokens}
def __init__(
    self,
    pretrained_embedding=None,
    architecture_function=None,
    text_input_column="clean_text",
    meta_input_list=("extension", "dayofweek", "hour", "min"),
    vocab_size=25000,
    seq_size=100,
    embedding_dim=200,
    loss="categorical_crossentropy",
    activation="softmax",
    batch_size=4096,
    n_epochs=15,
    bert_tokenizer="jplu/tf-camembert-base",
    bert_model="jplu/tf-camembert-base",
    **kwargs,
):
    """Configure the model and select a tokenizer.

    :param architecture_function: callable building the network; a function
        named ``bert_model`` switches to a HuggingFace tokenizer
    :param bert_tokenizer: HuggingFace id (camembert or flaubert family)
    :raises ImportError: when the ``transformers`` package is missing
    :raises NotImplementedError: for unsupported ``bert_tokenizer`` families
    """
    self.architecture_function = architecture_function
    self.pretrained_embedding = pretrained_embedding
    if self.architecture_function.__name__ != "bert_model":
        self.tokenizer = Tokenizer(input_column=text_input_column)
    elif "camembert" in bert_tokenizer.lower():
        # Prevent the HuggingFace dependency
        try:
            from transformers import CamembertTokenizer

            self.tokenizer = CamembertTokenizer.from_pretrained(bert_tokenizer)
        except ModuleNotFoundError:
            # BUG FIX: `raise <str>` is itself a TypeError at runtime
            # ("exceptions must derive from BaseException"); wrap the advice
            # in a real exception so the caller actually sees it.
            raise ImportError(
                """Please install transformers 3.4.0 (only version currently supported) pip install melusine[transformers]"""
            )
    elif "flaubert" in bert_tokenizer.lower():
        # Prevent the HuggingFace dependency
        try:
            from transformers import XLMTokenizer

            self.tokenizer = XLMTokenizer.from_pretrained(bert_tokenizer)
        except ModuleNotFoundError:
            # BUG FIX: same as above -- was raising a bare string.
            raise ImportError(
                """Please install transformers 3.4.0 (only version currently supported) pip install melusine[transformers]"""
            )
    else:
        raise NotImplementedError(
            "Bert tokenizer {} not implemented".format(bert_tokenizer))
    self.text_input_column = text_input_column
    self.meta_input_list = meta_input_list
    self.vocab_size = vocab_size
    self.seq_size = seq_size
    self.embedding_dim = embedding_dim
    self.loss = loss
    self.activation = activation
    self.batch_size = batch_size
    self.n_epochs = n_epochs
    self.bert_model = bert_model
    self.nb_labels = 0
    self.nb_meta_features = 0
    self.vocabulary = []
    self.vocabulary_dict = {}
def _test_TFXLM(self, size, large=False):
    """Round-trip a TFXLM checkpoint named *size* through the conversion
    harness, comparing only the last_hidden_state output."""
    from transformers import TFXLMModel, XLMTokenizer

    tokenizer = XLMTokenizer.from_pretrained(size)
    model = TFXLMModel.from_pretrained(size)
    encoded = tokenizer("Hello, my dog is cute", return_tensors="tf")
    spec, encoded = self.spec_and_pad(encoded)
    self.run_test(
        model,
        encoded,
        input_signature=spec,
        outputs=["last_hidden_state"],
        large=large,
        atol=0.005,
    )
def test_model(modelname):
    """Load an XLM checkpoint and print its weight-loading report, which
    should contain only empty lists (no missing/unexpected keys)."""
    model, loading_info = XLMModel.from_pretrained(modelname,
                                                   output_loading_info=True)
    tokenizer = XLMTokenizer.from_pretrained(modelname, do_lower_case=False)
    # Important: by default XLMTokenizer removes diacritics even when
    # do_lower_case=False, so switch that behaviour off explicitly.
    tokenizer.do_lowercase_and_remove_accent = False
    print("Dictionary values must be empty lists:")
    print(loading_info)
def test_space_tokenization_and_xlm_uncased_tokenization_normalization():
    """Normalized space- and XLM-tokenizations must agree character-wise."""
    sentence = "Jeff Immelt chose to focus on the incomprehensibility of accounting rules ."
    by_space = sentence.split(" ")
    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    by_model = tokenizer.tokenize(sentence)
    normed_space, normed_model = tn.normalize_tokenizations(
        by_space, by_model, tokenizer)
    assert "".join(normed_space) == "".join(normed_model)
def create_tokenizer(bert_pretrained):
    """ Wrapper function returning a tokenizer for BERT: XLMTokenizer for
    "xlm*" checkpoints, BertTokenizer otherwise. """
    cls = XLMTokenizer if bert_pretrained.startswith("xlm") else BertTokenizer
    return cls.from_pretrained(bert_pretrained)
def get_model_and_tokenizer(model_name, device, random_weights=False):
    """Load a pretrained encoder and matching tokenizer.

    :param model_name: HuggingFace model id; family inferred from its prefix
    :param device: torch device the model is moved to
    :param random_weights: when True, re-initialize weights after loading
    :return: (model, tokenizer, sep, emb_dim) where *sep* is the tokenizer's
        subword/word-boundary marker and *emb_dim* the hidden size
    """
    if model_name.startswith('xlnet'):
        model = XLNetModel.from_pretrained(
            model_name, output_hidden_states=True).to(device)
        tokenizer = XLNetTokenizer.from_pretrained(model_name)
        sep = u'▁'
        emb_dim = 1024 if "large" in model_name else 768
    elif model_name.startswith('gpt2'):
        model = GPT2Model.from_pretrained(model_name,
                                          output_hidden_states=True).to(device)
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        sep = 'Ġ'
        sizes = {
            "gpt2": 768,
            "gpt2-medium": 1024,
            "gpt2-large": 1280,
            "gpt2-xl": 1600
        }
        emb_dim = sizes[model_name]
    elif model_name.startswith('xlm'):
        model = XLMModel.from_pretrained(model_name,
                                         output_hidden_states=True).to(device)
        tokenizer = XLMTokenizer.from_pretrained(model_name)
        sep = '</w>'
        # BUG FIX: emb_dim was never assigned on this branch, so returning
        # raised UnboundLocalError for XLM models; read it from the config.
        emb_dim = model.config.emb_dim
    elif model_name.startswith('bert'):
        model = BertModel.from_pretrained(model_name,
                                          output_hidden_states=True).to(device)
        tokenizer = BertTokenizer.from_pretrained(model_name)
        sep = '##'
        emb_dim = 1024 if "large" in model_name else 768
    elif model_name.startswith('distilbert'):
        model = DistilBertModel.from_pretrained(
            model_name, output_hidden_states=True).to(device)
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        sep = '##'
        emb_dim = 768
    elif model_name.startswith('roberta'):
        model = RobertaModel.from_pretrained(
            model_name, output_hidden_states=True).to(device)
        tokenizer = RobertaTokenizer.from_pretrained(model_name)
        sep = 'Ġ'
        emb_dim = 1024 if "large" in model_name else 768
    else:
        print('Unrecognized model name:', model_name)
        sys.exit()
    if random_weights:
        print('Randomizing weights')
        model.init_weights()
    return model, tokenizer, sep, emb_dim
def init(args):
    """Populate BERTTool with the multilingual XLM model, its tokenizer,
    and the ids of the pad/sep/cls special tokens."""
    location = args.multi_bert.location
    BERTTool.multi_bert = XLMModel.from_pretrained(location)
    tokener = XLMTokenizer.from_pretrained(location)
    BERTTool.multi_tokener = tokener
    # Resolve the special-token ids once up front.
    BERTTool.multi_pad = tokener.convert_tokens_to_ids(["<pad>"])[0]
    BERTTool.multi_sep = tokener.convert_tokens_to_ids(["</s>"])[0]
    BERTTool.multi_cls = tokener.convert_tokens_to_ids(["<s>"])[0]
def __init__(self):
    """Load the cased HerBERT tokenizer/encoder pair, placing the encoder
    on GPU when one is available."""
    use_cuda = torch.cuda.is_available()
    self.device = torch.device("cuda:0" if use_cuda else "cpu")
    self.tokenizer = XLMTokenizer.from_pretrained(
        'allegro/herbert-klej-cased-tokenizer-v1'
    )
    encoder = RobertaModel.from_pretrained('allegro/herbert-klej-cased-v1')
    self.model = encoder.to(self.device)
def add_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.

    :param vocab: vocabulary object with an ``add_token_to_namespace`` method
    :param tokenizer_name: HuggingFace model name selecting the tokenizer
    :raises ValueError: when no known tokenizer family matches the name
    """
    do_lower_case = "uncased" in tokenizer_name
    log.info('In add_transformers_vocab')
    log.info(tokenizer_name)
    if (tokenizer_name.startswith("bert-") or 'rubert' in tokenizer_name
            or '/bert-' in tokenizer_name):
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name,
                                                  do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):  # or 'roberta' in tokenizer_name:
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name,
                                                   do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2") or 'gpt' in tokenizer_name:
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-roberta"):
        tokenizer = XLMRobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    else:
        # BUG FIX: previously fell through with `tokenizer` unbound and
        # crashed below with an UnboundLocalError; fail loudly instead.
        raise ValueError("Unsupported tokenizer name: %s" % tokenizer_name)
    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            # BUG FIX: was "transo-xl-" (typo), so TransformerXL tokenizers
            # never received their special tokens.
            or tokenizer_name.startswith("transfo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place can be simplified by "model-before-preprocess" reorganization
    # we can pass tokenizer created in model here, see issue <TBD>
    vocab_size = len(tokenizer)
    # do not use tokenizer.vocab_size, it does not include newly added token
    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added transformers vocab (%s): %d tokens", tokenizer_name,
             len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(
            word, input_module_tokenizer_name(tokenizer_name))
def test_TFXLMForQuestionAnsweringSimple(self):
    """Convert TFXLMForQuestionAnsweringSimple to ONNX and compare outputs
    against the Keras predictions."""
    from transformers import XLMTokenizer, TFXLMForQuestionAnsweringSimple
    checkpoint = 'xlm-mlm-enfr-1024'
    tokenizer = XLMTokenizer.from_pretrained(checkpoint)
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    model = TFXLMForQuestionAnsweringSimple.from_pretrained(checkpoint)
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    ok = run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx,
                          predictions, self.model_files)
    self.assertTrue(ok)
def get_model_and_tokenizer(model_name, device="cpu", random_weights=False,
                            model_path=None):
    """
    Load a pretrained encoder and matching tokenizer.

    model_path: if given, initialize from path instead of official repo

    :return: (model, tokenizer, sep) where *sep* is the tokenizer's
        subword/word-boundary marker.
    """
    init_model = model_name
    if model_path:
        print("Initializing model from local path:", model_path)
        init_model = model_path
    if model_name.startswith("xlnet"):
        model = XLNetModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = XLNetTokenizer.from_pretrained(init_model)
        sep = u"▁"
    elif model_name.startswith("gpt2"):
        model = GPT2Model.from_pretrained(init_model,
                                          output_hidden_states=True).to(device)
        tokenizer = GPT2Tokenizer.from_pretrained(init_model)
        sep = "Ġ"
    elif model_name.startswith("xlm"):
        model = XLMModel.from_pretrained(init_model,
                                         output_hidden_states=True).to(device)
        tokenizer = XLMTokenizer.from_pretrained(init_model)
        sep = "</w>"
    elif model_name.startswith("bert"):
        model = BertModel.from_pretrained(init_model,
                                          output_hidden_states=True).to(device)
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("distilbert"):
        model = DistilBertModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = DistilBertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("roberta"):
        # BUG FIX: this branch previously loaded from `model_name`,
        # silently ignoring a user-supplied `model_path`.
        model = RobertaModel.from_pretrained(
            init_model, output_hidden_states=True).to(device)
        tokenizer = RobertaTokenizer.from_pretrained(init_model)
        sep = "Ġ"
    else:
        print("Unrecognized model name:", model_name)
        sys.exit()
    if random_weights:
        print("Randomizing weights")
        model.init_weights()
    return model, tokenizer, sep
def __init__(self, model_type):
    """Constructor

    :param model_type: which model is used, 'xlm' or 'bert'
    :raises ValueError: for any other model type
    """
    if model_type == 'xlm':
        self.tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
        model = XLMModel.from_pretrained('xlm-mlm-100-1280')
        self.embeddings = model.embeddings.weight
    elif model_type == 'bert':
        self.tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-uncased')
        model = BertModel.from_pretrained('bert-base-multilingual-uncased')
        self.embeddings = model.embeddings.word_embeddings.weight
    else:
        # BUG FIX: without this branch an unknown model_type crashed later
        # with AttributeError on self.embeddings; fail fast instead,
        # matching the sibling class.
        raise ValueError('Unrecognized model type. Only bert and xlm supported')
    self.emb_dim = self.embeddings.shape[1]
def test_TFXLMWithLMHeadModel(self):
    """Convert TFXLMWithLMHeadModel to ONNX and compare outputs against
    the Keras predictions (looser tolerances for the LM head)."""
    from transformers import XLMTokenizer, TFXLMWithLMHeadModel
    checkpoint = 'xlm-mlm-enfr-1024'
    tokenizer = XLMTokenizer.from_pretrained(checkpoint)
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    model = TFXLMWithLMHeadModel.from_pretrained(checkpoint)
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    ok = run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx,
                          predictions, self.model_files,
                          rtol=1.e-2, atol=1.e-4)
    self.assertTrue(ok)
def __init__(self, model_type):
    """Load a pretrained multilingual tokenizer/encoder pair.

    :param model_type: 'xlm' or 'bert'
    :raises ValueError: for any other model type
    """
    # Instantiate model and tokenizers from pre-trained multilingual versions
    if model_type == 'xlm':
        checkpoint = 'xlm-mlm-100-1280'
        self.tokenizer = XLMTokenizer.from_pretrained(checkpoint)
        self.model = XLMModel.from_pretrained(checkpoint,
                                              output_hidden_states=True)
    elif model_type == 'bert':
        checkpoint = 'bert-base-multilingual-uncased'
        self.tokenizer = BertTokenizer.from_pretrained(checkpoint)
        self.model = BertModel.from_pretrained(checkpoint,
                                               output_hidden_states=True)
    else:
        raise ValueError(
            'Unrecognized model type. Only bert and xlm supported')
def get_embedding_for_text(text: str) -> (torch.tensor, torch.tensor):
    """
    For a given sentence the function return embedding generated by BERT

    :param text: sentence to embed
    :return: (tensor of embeddings for each token in the sentence,
        pooled embedding of the whole sentence)
    """
    # Tokenizer and encoder live in local directories next to this file.
    models_dir = join(dirname(realpath(__file__)), "models")
    tokenizer = XLMTokenizer.from_pretrained(join(models_dir, "tokenizer"))
    bert_model = RobertaModel.from_pretrained(join(models_dir, "bert"))
    token_ids = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
    outputs = bert_model(token_ids)
    # outputs[0]: per-token hidden states; outputs[1]: pooled sentence vector.
    per_token_embedding = outputs[0].squeeze(dim=0)
    pooled_embedding = outputs[1].squeeze(dim=0)
    return per_token_embedding, pooled_embedding
def convert_id_to_token(indexed_tokens, model_name):
    """Map sequences of token ids back to token strings.

    :param indexed_tokens: iterable of id sequences
    :param model_name: one of "bert", "xlnet", "xlm", "electra", "albert"
    :return: list of token lists, one per input sequence
    :raises ValueError: when model_name is not recognized
    """
    if model_name == "bert":
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    elif model_name == "xlnet":
        tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    elif model_name == "xlm":
        tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    elif model_name == "electra":
        tokenizer = ElectraTokenizer.from_pretrained(
            "google/electra-small-discriminator")
    elif model_name == "albert":
        tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
    else:
        # BUG FIX: an unknown name previously crashed below with an
        # UnboundLocalError on `tokenizer`; raise a clear error instead.
        raise ValueError("Unrecognized model name: {}".format(model_name))
    return [
        tokenizer.convert_ids_to_tokens(indexed_token)
        for indexed_token in indexed_tokens
    ]
def __init__(
    self,
    pretrained_embedding=None,
    architecture_function=None,
    text_input_column="clean_text",
    meta_input_list=("extension", "dayofweek", "hour", "min"),
    vocab_size=25000,
    seq_size=100,
    embedding_dim=200,
    loss="categorical_crossentropy",
    activation="softmax",
    batch_size=4096,
    n_epochs=15,
    bert_tokenizer="jplu/tf-camembert-base",
    bert_model="jplu/tf-camembert-base",
    **kwargs,
):
    """Configure the classifier and select a tokenizer.

    :param architecture_function: callable building the network; a function
        named ``bert_model`` switches to a HuggingFace tokenizer
    :param meta_input_list: names of the metadata features to feed the model.
        BUG FIX: was a mutable default list (shared across calls); now a
        tuple, consistent with the other constructor in this file.
    :param bert_tokenizer: HuggingFace id (camembert or flaubert family)
    :raises NotImplementedError: for unsupported ``bert_tokenizer`` families
    """
    self.architecture_function = architecture_function
    self.pretrained_embedding = pretrained_embedding
    if self.architecture_function.__name__ != "bert_model":
        self.tokenizer = Tokenizer(input_column=text_input_column)
    elif "camembert" in bert_tokenizer.lower():
        self.tokenizer = CamembertTokenizer.from_pretrained(bert_tokenizer)
    elif "flaubert" in bert_tokenizer.lower():
        # FlauBERT ships an XLM-style BPE tokenizer.
        self.tokenizer = XLMTokenizer.from_pretrained(bert_tokenizer)
    else:
        raise NotImplementedError(
            "Bert tokenizer {} not implemented".format(bert_tokenizer))
    self.text_input_column = text_input_column
    self.meta_input_list = meta_input_list
    self.vocab_size = vocab_size
    self.seq_size = seq_size
    self.embedding_dim = embedding_dim
    self.loss = loss
    self.activation = activation
    self.batch_size = batch_size
    self.n_epochs = n_epochs
    self.bert_model = bert_model
    self.nb_labels = 0
    self.nb_meta_features = 0
    self.vocabulary = []
    self.vocabulary_dict = {}
def __init__(self, from_pretrained=None,
             tokenizer="allegro/herbert-klej-cased-tokenizer-v1",
             embed_model="allegro/herbert-klej-cased-v1"):
    """HerBERT-based regressor: frozen-interface encoder plus a small MLP
    head mapping the 768-d embedding to a scalar in [-1, 1].

    :param from_pretrained: optional name of bundled head weights to load;
        loading also switches the module to eval mode
    """
    super().__init__()
    self.tokenizer = XLMTokenizer.from_pretrained(tokenizer)
    self.embed_model = RobertaModel.from_pretrained(embed_model,
                                                    return_dict=True)
    head_layers = [
        nn.Dropout(0.5),
        nn.Linear(768, 256),
        nn.LeakyReLU(),
        nn.Linear(256, 16),
        nn.LeakyReLU(),
        nn.Linear(16, 1),
        nn.Tanh(),
    ]
    self.fc = nn.Sequential(*head_layers)
    if from_pretrained is not None:
        # Head weights ship inside the package as binary resources.
        raw = importlib.resources.read_binary(trained_models,
                                              f'{from_pretrained}.pth')
        self.fc.load_state_dict(torch.load(io.BytesIO(raw)))
        self.eval()
def xlm_convert_to_huggingface(args):
    """
    Given a FaceBook's XLM model checkpoint, a BPE merges file, create and save
    a HuggingFace XLMTokenizer and a XLMModel.
    """
    # Load the raw XLM checkpoint on CPU; no GPU is needed for conversion.
    xlm_pth = torch.load(args.checkpoint, map_location=torch.device('cpu'))
    # XLMTokenizer requires a vocab file path at construction time; hand it
    # an empty JSON placeholder and overwrite the encoder below from the
    # checkpoint's own dictionary.
    with NamedTemporaryFile() as tfile:
        tfile.write(b'{}')
        tfile.flush()
        tokenizer = XLMTokenizer(
            tfile.name, args.merges, do_lowercase_and_remove_accent=False)
    tokenizer.encoder = convert_vocab(xlm_pth['dico_word2id'])
    # NOTE(review): vocab_size is computed but never used below -- confirm.
    vocab_size = len(tokenizer)
    params = xlm_pth['params']
    # Mirror the architecture hyper-parameters stored in the checkpoint.
    xlm_config = XLMConfig(
        emb_dim=params['emb_dim'],
        vocab_size=params['n_words'],
        n_layers=params['n_layers'],
        n_heads=params['n_heads'],
        n_langs=params['n_langs'],
        sinusoidal_embeddings=params['sinusoidal_embeddings'],
        use_lang_emb=params['use_lang_emb'],
        is_encoder=params['encoder_only'],
        output_hidden_states=True,
        n_words = params['n_words'],
    )
    # Provide both config and state dict to model init
    model = XLMModel.from_pretrained(
        None, config=xlm_config, state_dict=xlm_pth['model'])
    # Save
    save_directory = Path(args.output_dir)
    if not save_directory.exists():
        save_directory.mkdir(parents=True, exist_ok=True)
    model.save_pretrained(str(save_directory))
    tokenizer.save_pretrained(str(save_directory))
    tokenizer.save_vocabulary(str(save_directory))
def get_embedding_for_list_of_texts(
    list_of_texts: List[str],
) -> (torch.tensor, torch.tensor):
    """
    For a given list of sentences the function return embedding generated by BERT

    :param list_of_texts: sentences for which you want embeddings
    :return: (tensor of embeddings for each token in the sentences,
        tensor of pooled embeddings, one per sentence)
    """
    # Tokenizer and encoder are loaded from local directories next to this file.
    tokenizer = XLMTokenizer.from_pretrained(
        join(dirname(realpath(__file__)), "models", "tokenizer"))
    bert_model = RobertaModel.from_pretrained(
        join(dirname(realpath(__file__)), "models", "bert"))
    # Replace emoticons with their textual descriptions before encoding.
    emote_to_text = {}
    with open(join(dirname(realpath(__file__)), "emote_to_text.json"),
              encoding='utf8') as file:
        emote_to_text = json.load(file)
    list_of_texts = starmap(
        _replace_emotes_with_text,
        zip(list_of_texts, [emote_to_text] * len(list_of_texts)))
    list_of_texts = map(_remove_urls_from_text, list_of_texts)
    list_of_sentence_embeddings = []
    list_of_sequence_embeddings = []
    for text in list_of_texts:
        encoded_input = tokenizer.encode(text, return_tensors="pt").to(DEVICE)
        outputs = bert_model(encoded_input)
        # outputs[0]: per-token hidden states; outputs[1]: pooled vector.
        sequence_tokens_embedding = outputs[0].squeeze(dim=0)
        sentence_embedding = outputs[1].squeeze(dim=0)
        list_of_sequence_embeddings.append(sequence_tokens_embedding)
        list_of_sentence_embeddings.append(sentence_embedding)
    # `merge` combines the variable-length per-token embeddings (project
    # helper) -- NOTE(review): confirm its padding/stacking semantics.
    seq_embeddings_tensor = merge(list_of_sequence_embeddings)
    sentence_embeddings_tensor = torch.stack(list_of_sentence_embeddings,
                                             dim=0)
    return seq_embeddings_tensor, sentence_embeddings_tensor
def build_tokenizer(model, add_cap_sign, textify_emoji, segment_hashtag,
                    preprocess):
    """Build the tokenizer for *model* ('mbert' or 'xlm') and register the
    extra marker tokens requested by the flags.

    :param preprocess: optional callable applied to the text before the
        tokenizer's own tokenize step
    """
    if model == 'mbert':
        tokenizer = BertTokenizer.from_pretrained(
            'bert-base-multilingual-uncased')
    elif model == 'xlm':
        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
    # Shared across all Transformers models.
    tokenizer.add_tokens(['@USER'])
    if add_cap_sign:
        tokenizer.add_tokens(['<has_cap>', '<all_cap>'])
    if textify_emoji:
        tokenizer.add_tokens(['<emoji>', '</emoji>'])
    if segment_hashtag:
        tokenizer.add_tokens(['<hashtag>', '</hashtag>'])
    # TODO: this monkey-patched preprocessing is not saved when calling
    # `save_pretrained`.
    if preprocess is not None:
        tokenizer.tokenize = compose(preprocess, tokenizer.tokenize)
    return tokenizer
def __init__(self, device):
    """Classification head on top of a fine-tuned HerBERT encoder.

    :param device: torch device used by callers for inference
    """
    super().__init__()
    # Small MLP classifier over the 768-d pooled encoder output.
    self.net = nn.Sequential(
        nn.Dropout(0.3),
        nn.Linear(768, 256),
        nn.ReLU(),
        nn.Dropout(0.3),
        nn.Linear(256, 2),
        nn.Tanh(),
    )
    self.device = device
    # Tokenizer comes from the public HerBERT release; the encoder weights
    # are the locally fine-tuned "political" variant.
    self.tokenizer = XLMTokenizer.from_pretrained(
        "allegro/herbert-klej-cased-tokenizer-v1")
    self.model = RobertaModel.from_pretrained(
        "models/politicalHerBERT",
        return_dict=True)
rw_vocab = get_vocab(filename, 10000) filename2 = "SUBTLEX-US frequency list with PoS information text version.txt" pos_dict = get_pos_dict(filename2) GPT2 = ModelInfo(GPT2LMHeadModel.from_pretrained('gpt2', return_dict=True), GPT2Tokenizer.from_pretrained('gpt2'), "Ġ", vocab, "GTP2") Roberta = ModelInfo( RobertaForCausalLM.from_pretrained('roberta-base', return_dict=True), RobertaTokenizer.from_pretrained('roberta-base'), "_", vocab, "Roberta") XLM = ModelInfo( XLMWithLMHeadModel.from_pretrained('xlm-mlm-xnli15-1024', return_dict=True), XLMTokenizer.from_pretrained('xlm-mlm-xnli15-1024'), "_", vocab, "XLM") T5 = ModelInfo( T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True), T5Tokenizer.from_pretrained("t5-base"), "_", vocab, "T5") Albert = ModelInfo( AlbertForMaskedLM.from_pretrained('albert-base-v2', return_dict=True), AlbertTokenizer.from_pretrained('albert-base-v2'), "_", vocab, "Albert") TXL = ModelInfo(TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103'), TransfoXLTokenizer.from_pretrained('transfo-xl-wt103'), "_", vocab, "TXL") if __name__ == "__main__":
def main(): parser = argparse.ArgumentParser() # parser.add_argument('--dataset', default='txt', type=str, help='txt -> self-customized') # parser.add_argument('--src_lang', default='en', type=str, help='') # parser.add_argument('--tgt_lang', default='zh', type=str, help='') parser.add_argument( '--max_len_en', default=25, type=int, help='maximum length of English in **bilingual** corpus') parser.add_argument( '--max_len_zh', default=25, type=int, help='maximum length of Chinese in **bilingual** corpus') parser.add_argument("--src_file", default='./.pkl', type=str, help="The input data file name.") # General parser.add_argument("--config_path", default=None, type=str, help="Bert config file path.") parser.add_argument( "--bert_model", default="bert-base-cased", type=str, help= "Bert pre-trained model selected in the list: bert-base-cased, bert-large-cased." ) parser.add_argument("--xml_vocab", type=str, default='./download_models/xml_vocab.json') parser.add_argument("--xml_merge", type=str, default='./download_models/xml_merges.txt') parser.add_argument("--model_recover_path", default=None, type=str, help="The file of fine-tuned pretraining model.") parser.add_argument('--max_position_embeddings', type=int, default=512, help="max position embeddings") # For decoding #parser.add_argument('--fp16', action='store_true', # help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--amp', action='store_true', help="Whether to use amp for fp16") parser.add_argument('--seed', type=int, default=123, help="random seed for initialization") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument('--new_segment_ids', action='store_true', help="Use new segment ids for bi-uni-directional LM.") parser.add_argument('--batch_size', type=int, default=4, help="Batch size for decoding.") parser.add_argument('--beam_size', type=int, default=1, help="Beam size for searching") 
parser.add_argument('--length_penalty', type=float, default=0, help="Length penalty for beam search") parser.add_argument('--forbid_duplicate_ngrams', action='store_true') parser.add_argument('--forbid_ignore_word', type=str, default=None, help="Forbid the word during forbid_duplicate_ngrams") parser.add_argument("--min_len", default=None, type=int) parser.add_argument('--ngram_size', type=int, default=3) parser.add_argument('--drop_prob', default=0.1, type=float) parser.add_argument('--enable_butd', action='store_true', help='set to take in region features') parser.add_argument('--output_dir', default='./result', type=str) #useless parser.add_argument('--split', type=str, default='val') #wmt? parser.add_argument('--len_vis_input', type=int, default=1, help="The length of visual token input region 1") with open( '/data/private/chenyutong/dataset/concept_count/word_concept_count.pkl', 'rb') as f: word_fre = pickle.load(f) word_fre = defaultdict(int, word_fre) args = parser.parse_args() assert args.batch_size == 1, 'only support batch_size=1' args.max_tgt_length = max(args.max_len_en, args.max_len_zh) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() # fix random seed random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) tokenizer_en = BertTokenizer.from_pretrained( args.bert_model, do_lower_case=args.do_lower_case, cache_dir=args.output_dir + '/.pretrained_model') if args.max_position_embeddings: tokenizer_en.max_len = args.max_position_embeddings #tokenizer_en= WhitespaceTokenizer() if args.tokenized_input else tokenizer_en tokenizer_zh = XLMTokenizer(args.xml_vocab, args.xml_merge) tokenizer_zh.tokenize = lambda x: tokenizer_zh._tokenize( x, lang='zh', bypass_tokenizer=False) with open(args.xml_vocab, 'r') as f: tokenizer_zh.vocab = json.load(f) indexer = Indexer( 
# NOTE(review): this chunk was flattened onto a few physical lines; it has been
# re-indented for readability. The first line below is the tail of a call that
# begins before this chunk (presumably constructing `indexer` from the BERT and
# XLM vocab files) — confirm against the full file.
    [os.path.join(args.bert_model, 'vocab.txt'), args.xml_vocab])
# Dump the merged id->token map so the decoding run can be inspected offline.
with open('full_vocab.json', 'w') as f:
    json.dump(indexer.ids_to_tokens, f)
tokenizers = {'en': tokenizer_en, 'zh': tokenizer_zh}
print('tokenizer created')
assert '.pkl' in args.src_file
with open(args.src_file, 'rb') as f:
    src_data = pickle.load(f)
# list [pred_id, vocab, vis, pos, distribution]
# dict {'vgid':{'en':,'zh':,'region_features':[img, conf, fea[i], pos[i],dist]}}
amp_handle = None
if args.amp:
    from apex import amp

# Prepare model
cls_num_labels = 2
# NOTE(review): both branches of this conditional yield 12, so it is a no-op;
# the non-new-segment value was probably meant to differ — confirm intent.
type_vocab_size = 12 if args.new_segment_ids else 12
mask_word_id, eos_word_ids = indexer(["[MASK]", "[SEP]"])
forbid_ignore_set = None  # default None
relax_projection, task_idx_proj = 0, 3
if args.forbid_ignore_word:
    # Bracketed entries are special tokens and must be upper-cased to match
    # the vocab (e.g. "[sep]" -> "[SEP]"); plain words pass through unchanged.
    w_list = []
    for w in args.forbid_ignore_word.split('|'):
        if w.startswith('[') and w.endswith(']'):
            w_list.append(w.upper())
        else:
            w_list.append(w)
    forbid_ignore_set = set(indexer(w_list))
print(args.model_recover_path)
# Recover fine-tuned weights for each matching checkpoint path.
# NOTE(review): in the flattened original it is ambiguous whether everything
# below stays inside this loop (as in the upstream VLP decode script) or only
# the model construction does; here only construction is nested — confirm.
for model_recover_path in glob.glob(args.model_recover_path.strip()):
    # logger.info("***** Recover model: %s *****", model_recover_path)
    model_recover = torch.load(model_recover_path)
    model = BertForSeq2SeqDecoder.from_pretrained(
        args.bert_model,
        max_position_embeddings=args.max_position_embeddings,
        config_path=args.config_path,
        state_dict=model_recover,
        num_labels=cls_num_labels,
        vocab_size=len(indexer),
        type_vocab_size=type_vocab_size,
        task_idx=3,
        mask_word_id=mask_word_id,  # img2txt
        search_beam_size=args.beam_size,
        length_penalty=args.length_penalty,
        eos_id=eos_word_ids,
        forbid_duplicate_ngrams=args.forbid_duplicate_ngrams,
        forbid_ignore_set=forbid_ignore_set,
        ngram_size=args.ngram_size,
        min_len=args.min_len,
        enable_butd=args.enable_butd,
        len_vis_input=args.len_vis_input)
    # Free the raw state dict once it has been loaded into the model.
    del model_recover
model.to(device)
if args.amp:
    model = amp.initialize(model, opt_level='O2')  # '02')
torch.cuda.empty_cache()
model.eval()
fout = open(os.path.join(args.output_dir, 'region2txt_output.txt'), 'w')
output_lines = []
# NOTE(review): select_ids is never read in this chunk — possibly leftover
# debugging; confirm before removing.
select_ids = [87, 120, 179, 297, 721, 852, 1025]
for step_val, sd in enumerate(src_data.items()):
    # if step_val>=1:
    #     break
    vgid, input_item = sd
    en, zh = input_item['en'], input_item['zh']
    # Per-image header in the output file: id plus word/visual frequencies
    # of the English (COCO) and Chinese (AIC) captions.
    fout.writelines('\n' + '#' * 10 + '\n')
    fout.writelines('{}\n'.format(vgid))
    fout.writelines('{} coco: word_fre {} vis_fre {} \n'.format(
        en, input_item['coco_fre']['word'], input_item['coco_fre']['vis']))
    fout.writelines('{} aic: word_fre {} vis_fre {} \n'.format(
        zh, input_item['aic_fre']['word'], input_item['aic_fre']['vis']))
    print('step_val {} Process {}'.format(step_val, en))
    for rf in tqdm(input_item['region_features']):
        # rf unpacks to (filename, detection confidence, region feature,
        # positional encoding, class-label distribution) — see pickle layout
        # comment near the top of this chunk.
        filename, conf, vis_feats, vis_pe, cls_label = rf
        vis_feats = torch.from_numpy(vis_feats).to(device)
        vis_feats = vis_feats.unsqueeze(0)
        vis_pe = torch.from_numpy(vis_pe).to(device)
        vis_pe = vis_pe.unsqueeze(0)
        cls_label = torch.from_numpy(cls_label).to(device)
        cls_label = cls_label.unsqueeze(0)
        # # lazy normalization of the coordinates... copy from seq2seq
        # Scale x (cols 0,2) and y (cols 1,3) box coordinates into [0,1];
        # the +1e-5 guards against division by zero.
        w_est = torch.max(vis_pe[:, [0, 2]]) * 1. + 1e-5
        h_est = torch.max(vis_pe[:, [1, 3]]) * 1. + 1e-5
        vis_pe[:, [0, 2]] /= w_est
        vis_pe[:, [1, 3]] /= h_est
        assert h_est > 0, 'should greater than 0! {}'.format(h_est)
        assert w_est > 0, 'should greater than 0! {}'.format(w_est)
        # Relative box area from the normalized coordinates, clamped at 0.
        rel_area = (vis_pe[:, 3] - vis_pe[:, 1]) * (vis_pe[:, 2] - vis_pe[:, 0])
        rel_area.clamp_(0)
        vis_pe = torch.cat(
            (vis_pe[:, :4], rel_area.view(-1, 1), vis_pe[:, 5:]), -1)  # confident score
        # NOTE(review): normalized_coord is computed but never used below —
        # likely dead code; confirm before removing.
        normalized_coord = F.normalize(vis_pe.data[:, :5] - 0.5, dim=-1)
        vis_pe = torch.cat((F.layer_norm(vis_pe, [6]),
                            F.layer_norm(cls_label, [1601])), dim=-1)  # 1601 hard coded...
        # BL,H
        vis_feats = vis_feats.unsqueeze(0)
        vis_pe = vis_pe.unsqueeze(0)
        # print('input shape', vis_feats.shape, vis_pe.shape)
        assert args.new_segment_ids == False, 'only support 0 1 6 now'
        # Minimal text prompt: the model fills in tokens after [SEP].
        tokens = ['[CLS]', '[UNK]', '[SEP]']
        input_ids = indexer(tokens)
        input_ids = np.expand_dims(np.array(input_ids), axis=0)
        input_ids = torch.tensor(input_ids, dtype=torch.long, device=device)
        max_len_in_batch = len(tokens) + args.max_tgt_length
        # Attention mask: all positions may attend to the prompt; generated
        # positions attend causally (lower-triangular) to each other.
        _tril_matrix = torch.tril(
            torch.ones((max_len_in_batch, max_len_in_batch), dtype=torch.long))
        input_mask = torch.zeros(max_len_in_batch, max_len_in_batch,
                                 dtype=torch.long, device=device)
        input_mask[:, :len(tokens)].fill_(1)
        second_st, second_end = len(tokens), max_len_in_batch
        input_mask[second_st:second_end, second_st:second_end].copy_(
            _tril_matrix[:second_end - second_st, :second_end - second_st])  # L,L
        input_mask = input_mask.unsqueeze(0)
        position_ids = torch.arange(max_len_in_batch, dtype=torch.long,
                                    device=device)  # L
        position_ids = position_ids.unsqueeze(0)  # B,L
        predictions = {
            'en': None,
            'zh': None,
            'en2zh': None,
            'zh2en': None
        }
        # Stage 1: image -> text in each language. Segment ids 1 and 6 select
        # English vs Chinese generation (per the assert above).
        for tgt_lang, lang_id in zip(['en', 'zh'], [1, 6]):
            token_type_ids = [0] * len(tokens) + [lang_id] * args.max_tgt_length
            token_type_ids = np.expand_dims(np.array(token_type_ids), axis=0)
            token_type_ids = torch.tensor(token_type_ids, dtype=torch.long,
                                          device=device)
            with torch.no_grad():
                # print(token_type_ids[0])
                # print(position_ids[0])
                # print(input_ids[0])
                # print(input_mask[0])
                # input()
                traces = model(
                    vis_feats=vis_feats,
                    vis_pe=vis_pe,
                    input_ids=input_ids,
                    token_type_ids=token_type_ids,
                    position_ids=position_ids,
                    attention_mask=input_mask,
                    search_beam_size=args.beam_size,
                    task_idx=3,
                    mode='img2txt',
                    sample_mode='greedy')  # validation greedy
            output_sequence = postprocess(traces, args.beam_size, tgt_lang, indexer)
            predictions[tgt_lang] = output_sequence
        # truncate
        # Stage 2: translate each stage-1 prediction into the other language
        # (text -> text, no visual inputs).
        for langs, lang_ids in zip(['en2zh', 'zh2en'], [[1, 6], [6, 1]]):
            src_lang = langs[:2]  # en,zh
            tgt_lang = langs[-2:]
            w = predictions[src_lang]
            # predictions['en']/ predictions['zh']
            w_t = tokenizers[src_lang].tokenize(w)
            tokens = ['[CLS]'] + w_t + ['[SEP]']
            input_ids = indexer(tokens)
            token_type_ids = [lang_ids[0]] * len(
                input_ids) + [lang_ids[1]] * args.max_tgt_length
            input_ids = np.expand_dims(np.array(input_ids), axis=0)
            token_type_ids = np.expand_dims(np.array(token_type_ids), axis=0)
            input_ids = torch.tensor(input_ids, dtype=torch.long, device=device)
            token_type_ids = torch.tensor(token_type_ids, dtype=torch.long,
                                          device=device)
            max_len_in_batch = len(tokens) + args.max_tgt_length  # 2+64 = 66
            position_ids = torch.arange(max_len_in_batch, dtype=torch.long,
                                        device=device)  # L
            position_ids = position_ids.unsqueeze(0)  # B,L
            # Same prompt-visible / causal-generated mask layout as stage 1.
            _tril_matrix = torch.tril(
                torch.ones((max_len_in_batch, max_len_in_batch), dtype=torch.long))
            input_mask = torch.zeros(max_len_in_batch, max_len_in_batch,
                                     dtype=torch.long, device=device)
            input_mask[:, :len(tokens)].fill_(1)
            second_st, second_end = len(tokens), max_len_in_batch
            input_mask[second_st:second_end, second_st:second_end].copy_(
                _tril_matrix[:second_end - second_st, :second_end - second_st])  # L,L
            input_mask = input_mask.unsqueeze(0)
            with torch.no_grad():
                traces = model(
                    vis_feats=None,
                    vis_pe=None,
                    input_ids=input_ids,
                    token_type_ids=token_type_ids,
                    position_ids=position_ids,
                    attention_mask=input_mask,
                    search_beam_size=args.beam_size,
                    task_idx=3,
                    mode='txt2txt',
                    sample_mode='greedy')  # validation greedy
            output_sequence = postprocess(traces, args.beam_size, tgt_lang, indexer)
            predictions[langs] = output_sequence
        # print(predictions)
        # NOTE(review): word_fre is not defined anywhere in this chunk —
        # presumably a module-level frequency table; verify it exists.
        fout.writelines(
            'conf:{:.2f} en:{: <10} fre:{:<5d} en2zh:{: <10} zh:{: <10} fre:{:<5d} zh2en:{: <10} \n'
            .format(conf, predictions['en'],
                    word_fre['coco'][predictions['en']], predictions['en2zh'],
                    predictions['zh'], word_fre['aic'][predictions['zh']],
                    predictions['zh2en']))
fout.close()
) tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', do_lower_case=False) model = RobertaForMaskedLM.from_pretrained('./multi-label_LM/multi-label_RoBerta_e10_b16', config=config) # 12-layer, 768-hidden, 12-heads, 125M parameters, roberta-base using the bert-base architecture elif args.LM == 'XLM': from transformers import XLMConfig, XLMTokenizer, XLMWithLMHeadModel config = XLMConfig(vocab_size=64139, emb_dim=1024, max_position_embeddings=512, n_heads=8, n_layers=6, ) tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024', do_lower_case=False) model = XLMWithLMHeadModel.from_pretrained('./multi-label_LM/multi-label_XLM_e10_b16', config=config) # 6-layer, 1024-hidden, 8-heads # XLM English-French model trained on the concatenation of English and French wikipedia else: print('need to define LM from Bert,RoBerta,XLM') print(model) def freeze_layer_fun(freeze_layer): for name, param in model.named_parameters(): if freeze_layer in name: print(name) param.requires_grad = False else:
def get_model_and_tokenizer(
    model_name, device="cpu", random_weights=False, model_path=None
):
    """
    Load a pretrained transformer model and its tokenizer.

    :param model_name: model family identifier (e.g. "bert-base-uncased",
        "gpt2", "xlm-mlm-en-2048", "qarib/bert-base-qarib", ...)
    :param device: device string the model is moved to (default "cpu")
    :param random_weights: if True, re-initialize the weights after loading
    :param model_path: if given, initialize from this local path instead of
        the official repo (models typically cached in
        ~/.cache/torch/transformers/)
    :return: (model, tokenizer, sep) where `sep` is the tokenizer's subword
        marker string
    """
    init_model = model_name
    if model_path:
        print("Initializing model from local path:", model_path)
        init_model = model_path

    if model_name.startswith("xlnet"):
        model = XLNetModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = XLNetTokenizer.from_pretrained(init_model)
        sep = u"▁"
    elif model_name.startswith("gpt2"):
        model = GPT2Model.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = GPT2Tokenizer.from_pretrained(init_model)
        sep = "Ġ"
    elif model_name.startswith("xlm"):
        # Checked after "xlnet" so XLNet names are not captured here.
        model = XLMModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = XLMTokenizer.from_pretrained(init_model)
        sep = "</w>"
    # BERT-family checkpoints all load with BertModel/BertTokenizer and use
    # the "##" subword marker: plain BERT (including bert-base-multilingual,
    # whose dedicated branch in the previous version was unreachable dead
    # code because startswith("bert") matched it first), QARiB
    # (https://huggingface.co/qarib/bert-base-qarib), AraBERT
    # (https://huggingface.co/aubmindlab/bert-base-arabert), ArabicBERT
    # (https://huggingface.co/asafaya/bert-base-arabic) and MARBERT
    # (https://huggingface.co/UBC-NLP/MARBERT).
    elif model_name.startswith(("bert", "qarib", "aubmindlab", "asafaya", "UBC-NLP")):
        model = BertModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("distilbert"):
        model = DistilBertModel.from_pretrained(
            init_model, output_hidden_states=True
        ).to(device)
        tokenizer = DistilBertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("roberta"):
        # BUG FIX: this branch previously loaded from `model_name`, silently
        # ignoring a supplied `model_path`; use `init_model` like every other
        # branch.
        model = RobertaModel.from_pretrained(init_model, output_hidden_states=True).to(
            device
        )
        tokenizer = RobertaTokenizer.from_pretrained(init_model)
        sep = "Ġ"
    else:
        print("Unrecognized model name:", model_name)
        sys.exit()

    if random_weights:
        print("Randomizing weights")
        model.init_weights()

    return model, tokenizer, sep