def __init__(self, auto_model: str, auto_path: str):
    super().__init__()
    if "camembert" in auto_model:
        from transformers import CamembertModel, CamembertTokenizer
        self.auto_embeddings = CamembertModel.from_pretrained(auto_path)
        self.auto_tokenizer = CamembertTokenizer.from_pretrained(auto_path)
    elif "flaubert" in auto_model:
        from transformers import XLMModel, XLMTokenizer
        self.auto_embeddings = XLMModel.from_pretrained(auto_path)
        self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
        # By default, XLMTokenizer lowercases and strips accents; FlauBERT is cased.
        self.auto_tokenizer.do_lowercase_and_remove_accent = False
    elif "xlm" in auto_model:
        from transformers import XLMModel, XLMTokenizer
        self.auto_embeddings = XLMModel.from_pretrained(auto_path)
        self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
    elif "bert" in auto_model:
        from transformers import BertModel, BertTokenizer
        self.auto_embeddings = BertModel.from_pretrained(auto_path)
        self.auto_tokenizer = BertTokenizer.from_pretrained(auto_path)
    else:
        from transformers import AutoModel, AutoTokenizer, XLMTokenizer
        self.auto_embeddings = AutoModel.from_pretrained(auto_path)
        self.auto_tokenizer = AutoTokenizer.from_pretrained(auto_path)
        if isinstance(self.auto_tokenizer, XLMTokenizer):
            self.auto_tokenizer.do_lowercase_and_remove_accent = False
    # Freeze the pretrained encoder; it is used as a fixed feature extractor.
    for param in self.auto_embeddings.parameters():
        param.requires_grad = False
    self._is_fixed = True
    self._output_dim = self.auto_embeddings.config.hidden_size
    self._begin_special_token_count = self.get_begin_special_token_count()
    self._padding_id = self.auto_tokenizer.pad_token_id
def get_attentions():
    model_name = request.args.get('model')
    source = request.args.get('source')
    target = request.args.get('target')
    if model_name == 'XLM':
        model_version = 'xlm-mlm-ende-1024'
        model = XLMModel.from_pretrained(model_version, output_attentions=True)
        tokenizer = XLMTokenizer.from_pretrained(model_version)
    elif model_name == 'GPT-2':
        model_version = 'gpt2'
        model = GPT2Model.from_pretrained(model_version, output_attentions=True)
        tokenizer = GPT2Tokenizer.from_pretrained(model_version)
    else:  # BERT
        model_version = 'bert-base-uncased'
        model = BertModel.from_pretrained(model_version, output_attentions=True)
        tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=True)
    inputs = tokenizer.encode_plus(source, target, return_tensors='pt',
                                   add_special_tokens=True)
    token_type_ids = inputs['token_type_ids']
    input_ids = inputs['input_ids']
    # With output_attentions=True, attentions are the last element of the output tuple.
    attention = model(input_ids, token_type_ids=token_type_ids)[-1]
    input_id_list = input_ids[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(input_id_list)
    return {'attention': format_attention(attention)[0].tolist(),
            'source': tokens,
            'target': tokens}
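
# format_attention is not defined in this snippet. Below is a minimal sketch of
# what it plausibly does, following the bertviz convention of stacking per-layer
# attention into one tensor -- an assumption, not the original helper. The
# handler above indexes [0] to pull out the first layer.
import torch

def format_attention(attention):
    # attention: tuple of per-layer tensors, each of shape (batch, heads, seq, seq)
    squeezed = [layer_attention.squeeze(0) for layer_attention in attention]
    # -> (layers, heads, seq, seq)
    return torch.stack(squeezed)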
def test_model(modelname):
    model, log = XLMModel.from_pretrained(modelname, output_loading_info=True)
    tokenizer = XLMTokenizer.from_pretrained(modelname, do_lower_case=False)
    # This line is important: by default, XLMTokenizer removes diacritics,
    # even with the do_lower_case=False flag.
    tokenizer.do_lowercase_and_remove_accent = False
    print("Dictionary values must be empty lists:")
    print(log)
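
# Example invocation; any XLM MLM checkpoint from the Hugging Face hub should
# behave the same way (the name below is illustrative):
test_model("xlm-mlm-en-2048")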
def get_model_and_tokenizer(model_name, device, random_weights=False):
    if model_name.startswith('xlnet'):
        model = XLNetModel.from_pretrained(model_name, output_hidden_states=True).to(device)
        tokenizer = XLNetTokenizer.from_pretrained(model_name)
        sep = u'▁'
        emb_dim = 1024 if "large" in model_name else 768
    elif model_name.startswith('gpt2'):
        model = GPT2Model.from_pretrained(model_name, output_hidden_states=True).to(device)
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        sep = 'Ġ'
        sizes = {
            "gpt2": 768,
            "gpt2-medium": 1024,
            "gpt2-large": 1280,
            "gpt2-xl": 1600,
        }
        emb_dim = sizes[model_name]
    elif model_name.startswith('xlm'):
        model = XLMModel.from_pretrained(model_name, output_hidden_states=True).to(device)
        tokenizer = XLMTokenizer.from_pretrained(model_name)
        sep = '</w>'
        # The original never set emb_dim on this branch, which raised NameError
        # at the return statement; read it from the model config instead.
        emb_dim = model.config.emb_dim
    elif model_name.startswith('bert'):
        model = BertModel.from_pretrained(model_name, output_hidden_states=True).to(device)
        tokenizer = BertTokenizer.from_pretrained(model_name)
        sep = '##'
        emb_dim = 1024 if "large" in model_name else 768
    elif model_name.startswith('distilbert'):
        model = DistilBertModel.from_pretrained(model_name, output_hidden_states=True).to(device)
        tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        sep = '##'
        emb_dim = 768
    elif model_name.startswith('roberta'):
        model = RobertaModel.from_pretrained(model_name, output_hidden_states=True).to(device)
        tokenizer = RobertaTokenizer.from_pretrained(model_name)
        sep = 'Ġ'
        emb_dim = 1024 if "large" in model_name else 768
    else:
        print('Unrecognized model name:', model_name)
        sys.exit()
    if random_weights:
        print('Randomizing weights')
        model.init_weights()
    return model, tokenizer, sep, emb_dim
def init(args):
    BERTTool.multi_bert = XLMModel.from_pretrained(args.multi_bert.location)
    BERTTool.multi_tokener = XLMTokenizer.from_pretrained(args.multi_bert.location)
    BERTTool.multi_pad = BERTTool.multi_tokener.convert_tokens_to_ids(["<pad>"])[0]
    BERTTool.multi_sep = BERTTool.multi_tokener.convert_tokens_to_ids(["</s>"])[0]
    BERTTool.multi_cls = BERTTool.multi_tokener.convert_tokens_to_ids(["<s>"])[0]
def __init__(self,
             vocabs: Dict[str, Vocabulary],
             config: Config,
             pre_load_model: bool = True):
    super().__init__(config=config)

    if pre_load_model:
        self.xlm = XLMModel.from_pretrained(self.config.model_name,
                                            output_hidden_states=True)
    else:
        xlm_config = XLMConfig.from_pretrained(self.config.model_name,
                                               output_hidden_states=True)
        self.xlm = XLMModel(xlm_config)

    self.source_lang_id = self.xlm.config.lang2id.get(self.config.source_language)
    self.target_lang_id = self.xlm.config.lang2id.get(self.config.target_language)
    if None in (self.source_lang_id, self.target_lang_id):
        raise ValueError(
            f'Invalid lang_id for XLM model.'
            f' Valid ids are: {self.xlm.config.lang2id.keys()}')

    self.mlp = None
    if self.config.use_mlp:
        self.mlp = nn.Sequential(
            nn.Linear(self.xlm.config.hidden_size, self.config.hidden_size),
            nn.Tanh(),
        )
        output_size = self.config.hidden_size
    else:
        output_size = self.xlm.config.hidden_size

    self._sizes = {
        const.TARGET: output_size,
        const.TARGET_LOGITS: output_size,
        const.TARGET_SENTENCE: 2 * output_size,
        const.SOURCE: output_size,
        const.SOURCE_LOGITS: output_size,
    }
    self.vocabs = {
        const.TARGET: vocabs[const.TARGET],
        const.SOURCE: vocabs[const.SOURCE],
    }
    self.output_embeddings = self.xlm.embeddings
    if self.config.freeze:
        for param in self.xlm.parameters():
            param.requires_grad = False
def get_model_and_tokenizer(model_name, device="cpu", random_weights=False, model_path=None): """ model_path: if given, initialize from path instead of official repo """ init_model = model_name if model_path: print("Initializing model from local path:", model_path) init_model = model_path if model_name.startswith("xlnet"): model = XLNetModel.from_pretrained( init_model, output_hidden_states=True).to(device) tokenizer = XLNetTokenizer.from_pretrained(init_model) sep = u"▁" elif model_name.startswith("gpt2"): model = GPT2Model.from_pretrained(init_model, output_hidden_states=True).to(device) tokenizer = GPT2Tokenizer.from_pretrained(init_model) sep = "Ġ" elif model_name.startswith("xlm"): model = XLMModel.from_pretrained(init_model, output_hidden_states=True).to(device) tokenizer = XLMTokenizer.from_pretrained(init_model) sep = "</w>" elif model_name.startswith("bert"): model = BertModel.from_pretrained(init_model, output_hidden_states=True).to(device) tokenizer = BertTokenizer.from_pretrained(init_model) sep = "##" elif model_name.startswith("distilbert"): model = DistilBertModel.from_pretrained( init_model, output_hidden_states=True).to(device) tokenizer = DistilBertTokenizer.from_pretrained(init_model) sep = "##" elif model_name.startswith("roberta"): model = RobertaModel.from_pretrained( model_name, output_hidden_states=True).to(device) tokenizer = RobertaTokenizer.from_pretrained(model_name) sep = "Ġ" else: print("Unrecognized model name:", model_name) sys.exit() if random_weights: print("Randomizing weights") model.init_weights() return model, tokenizer, sep
def build_model(model, time_pooling, layer_pooling, layer, new_num_tokens, device, **kwargs):
    n_class = 2
    if model == 'mbert':
        base_model = BertModel.from_pretrained('bert-base-multilingual-uncased',
                                               output_hidden_states=True, **kwargs)
    elif model == 'xlm':
        base_model = XLMModel.from_pretrained('xlm-mlm-100-1280',
                                              output_hidden_states=True, **kwargs)
    else:
        # Guard added: without it, an unknown name raises NameError below.
        raise ValueError('Unsupported model: {}'.format(model))
    base_model.resize_token_embeddings(new_num_tokens)  # All transformers models
    model = PoolClassifier(base_model, n_class, time_pooling, layer_pooling, layer)
    return model.to(device)
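
# Hedged usage sketch: PoolClassifier is defined elsewhere in the original
# project, and the pooling argument values below are assumptions, not
# documented options.
from transformers import XLMTokenizer

tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
classifier = build_model(model='xlm', time_pooling='mean', layer_pooling='mean',
                         layer=-1, new_num_tokens=len(tokenizer), device='cpu')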
def __init__(self, model_type):
    """Constructor

    :param model_type: which model is used, xlm or mbert
    """
    if model_type == 'xlm':
        self.tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
        model = XLMModel.from_pretrained('xlm-mlm-100-1280')
        self.embeddings = model.embeddings.weight
    elif model_type == 'bert':
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
        model = BertModel.from_pretrained('bert-base-multilingual-uncased')
        self.embeddings = model.embeddings.word_embeddings.weight
    else:
        # Guard added: otherwise self.embeddings is undefined below.
        raise ValueError('Unrecognized model type. Only bert and xlm supported')
    self.emb_dim = self.embeddings.shape[1]
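
# Hedged usage sketch: the constructor above belongs to some embedding-lookup
# class (the name StaticEmbeddingLookup is hypothetical). Static input
# embeddings are indexed directly by token id.
lookup = StaticEmbeddingLookup('xlm')
ids = lookup.tokenizer.encode('hello', add_special_tokens=False)
vectors = lookup.embeddings[ids]   # shape: (len(ids), lookup.emb_dim)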
def __init__(self, model_type):
    """Constructor

    :param model_type: whether an xlm or bert model is used
    """
    # Instantiate model and tokenizer from pre-trained multilingual versions
    if model_type == 'xlm':
        self.tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-100-1280')
        self.model = XLMModel.from_pretrained('xlm-mlm-100-1280',
                                              output_hidden_states=True)
    elif model_type == 'bert':
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
        self.model = BertModel.from_pretrained('bert-base-multilingual-uncased',
                                               output_hidden_states=True)
    else:
        raise ValueError('Unrecognized model type. Only bert and xlm supported')
def xlm_convert_to_huggingface(args):
    """
    Given a Facebook XLM model checkpoint and a BPE merges file, create and
    save a Hugging Face XLMTokenizer and XLMModel.
    """
    xlm_pth = torch.load(args.checkpoint, map_location=torch.device('cpu'))

    # XLMTokenizer requires a vocab file on disk; hand it an empty JSON file
    # and overwrite the encoder with the checkpoint's dictionary afterwards.
    with NamedTemporaryFile() as tfile:
        tfile.write(b'{}')
        tfile.flush()
        tokenizer = XLMTokenizer(
            tfile.name, args.merges, do_lowercase_and_remove_accent=False)
    tokenizer.encoder = convert_vocab(xlm_pth['dico_word2id'])
    vocab_size = len(tokenizer)

    params = xlm_pth['params']
    xlm_config = XLMConfig(
        emb_dim=params['emb_dim'],
        vocab_size=params['n_words'],
        n_layers=params['n_layers'],
        n_heads=params['n_heads'],
        n_langs=params['n_langs'],
        sinusoidal_embeddings=params['sinusoidal_embeddings'],
        use_lang_emb=params['use_lang_emb'],
        is_encoder=params['encoder_only'],
        output_hidden_states=True,
        n_words=params['n_words'],
    )

    # Provide both config and state dict to model init
    model = XLMModel.from_pretrained(
        None, config=xlm_config, state_dict=xlm_pth['model'])

    # Save
    save_directory = Path(args.output_dir)
    if not save_directory.exists():
        save_directory.mkdir(parents=True, exist_ok=True)
    model.save_pretrained(str(save_directory))
    tokenizer.save_pretrained(str(save_directory))
    tokenizer.save_vocabulary(str(save_directory))
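
# Hedged usage sketch: the function expects an argparse-style namespace with
# checkpoint, merges, and output_dir attributes (names inferred from the body):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', required=True)  # Facebook XLM .pth checkpoint
parser.add_argument('--merges', required=True)      # BPE merges file
parser.add_argument('--output_dir', required=True)
xlm_convert_to_huggingface(parser.parse_args())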
def test_model_from_pretrained(self):
    for model_name in XLM_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
        model = XLMModel.from_pretrained(model_name)
        self.assertIsNotNone(model)
def get_model_and_tokenizer(
    model_name, device="cpu", random_weights=False, model_path=None
):
    """
    model_path: if given, initialize from path instead of official repo

    models typically cached in ~/.cache/torch/transformers/
    """
    init_model = model_name
    if model_path:
        print("Initializing model from local path:", model_path)
        init_model = model_path
    if model_name.startswith("xlnet"):
        model = XLNetModel.from_pretrained(init_model, output_hidden_states=True).to(device)
        tokenizer = XLNetTokenizer.from_pretrained(init_model)
        sep = u"▁"
    elif model_name.startswith("gpt2"):
        model = GPT2Model.from_pretrained(init_model, output_hidden_states=True).to(device)
        tokenizer = GPT2Tokenizer.from_pretrained(init_model)
        sep = "Ġ"
    elif model_name.startswith("xlm"):
        model = XLMModel.from_pretrained(init_model, output_hidden_states=True).to(device)
        tokenizer = XLMTokenizer.from_pretrained(init_model)
        sep = "</w>"
    elif model_name.startswith((
        "bert",        # also covers bert-base-multilingual, whose separate
                       # branch in the original was unreachable and identical
        "qarib",       # QARiB https://huggingface.co/qarib/bert-base-qarib
        "aubmindlab",  # AraBERT https://huggingface.co/aubmindlab/bert-base-arabert
        "asafaya",     # ArabicBERT https://huggingface.co/asafaya/bert-base-arabic
        "UBC-NLP",     # MARBERT https://huggingface.co/UBC-NLP/MARBERT
    )):
        model = BertModel.from_pretrained(init_model, output_hidden_states=True).to(device)
        tokenizer = BertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("distilbert"):
        model = DistilBertModel.from_pretrained(init_model, output_hidden_states=True).to(device)
        tokenizer = DistilBertTokenizer.from_pretrained(init_model)
        sep = "##"
    elif model_name.startswith("roberta"):
        # Fixed: the original used model_name here, silently ignoring model_path.
        model = RobertaModel.from_pretrained(init_model, output_hidden_states=True).to(device)
        tokenizer = RobertaTokenizer.from_pretrained(init_model)
        sep = "Ġ"
    else:
        print("Unrecognized model name:", model_name)
        sys.exit()
    if random_weights:
        print("Randomizing weights")
        model.init_weights()
    return model, tokenizer, sep
def test_model_from_pretrained(self):
    for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        model = XLMModel.from_pretrained(model_name)
        self.assertIsNotNone(model)
def test_xlm_embeddings():
    xlm_model: str = "xlm-mlm-en-2048"

    tokenizer = XLMTokenizer.from_pretrained(xlm_model)
    model = XLMModel.from_pretrained(pretrained_model_name_or_path=xlm_model,
                                     output_hidden_states=True)
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    with torch.no_grad():
        tokens = tokenizer.tokenize("<s>" + s + "</s>")

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        hidden_states = model(tokens_tensor)[-1]

        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    #   0        1            2          3             4          5        6         7         8       9      10        11        12         13       14
    #
    # '<s>' 'berlin</w>'  'and</w>'  'munich</w>'  'have</w>'  'a</w>'  'lot</w>'  'of</w>'  'pupp'  'ete'  'er</w>'  'to</w>'  'see</w>'  '.</w>'  '</s>'
    #          |             |           |             |          |        |          |          \      |      /         |          |         |
    #        Berlin         and        Munich         have        a       lot        of           puppeteer              to         see       .
    #
    #          0             1           2             3          4        5          6               7                  8          9         10

    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        embeddings = XLMEmbeddings(
            pretrained_model_name_or_path=xlm_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)

        return flair_sentence

    # First subword embedding
    sentence_first_subword = embed_sentence(sentence=s, pooling_operation="first")

    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[0].embedding.tolist()

    puppeteer_first_subword_embedding_ref = first_layer[8].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_subword_embedding_ref
            == puppeteer_first_subword_embedding_actual)

    # Last subword embedding
    sentence_last_subword = embed_sentence(sentence=s, pooling_operation="last")

    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[0].embedding.tolist()

    puppeteer_last_subword_embedding_ref = first_layer[10].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_last_subword_embedding_ref
            == puppeteer_last_subword_embedding_actual)

    # First and last subword embedding
    sentence_first_last_subword = embed_sentence(sentence=s, pooling_operation="first_last")

    first_token_embedding_ref = torch.cat([first_layer[1], first_layer[1]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[0].embedding.tolist()

    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[8], first_layer[10]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_last_subword_embedding_ref
            == puppeteer_first_last_subword_embedding_actual)

    # Mean of all subword embeddings
    sentence_mean_subword = embed_sentence(sentence=s, pooling_operation="mean")

    first_token_embedding_ref = calculate_mean_embedding([first_layer[1]]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[0].embedding.tolist()

    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[8], first_layer[9], first_layer[10]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_mean_subword_embedding_ref
            == puppeteer_mean_subword_embedding_actual)

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(sentence="Munich",
                                          pooling_operation="first",
                                          layers="1,2,3,4")

    ref_embedding_size = 4 * model.embeddings.embedding_dim
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(
        sentence="Berlin",
        pooling_operation="first",
        layers="1,2,3,4",
        use_scalar_mix=True,
    )

    ref_embedding_size = 1 * model.embeddings.embedding_dim
    actual_embedding_size = len(sentence_mult_layers_scalar_mix.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size
def test_model_from_pretrained(self):
    cache_dir = "/tmp/transformers_test/"
    for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
        model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir)
        shutil.rmtree(cache_dir)
        self.assertIsNotNone(model)
def generate_embedding(objectives, model_name, batch_size=100, output_attention=False):
    """
    Takes in a sequence of pre-tokenized texts and generates embeddings
    using the Hugging Face implemented models.

    - Inputs:
        objectives (sequence): equal-length token-id sequences for all texts
        model_name (str): name of the model to be used for generating embeddings
        batch_size (int): batch size to use when generating embeddings for sentences
    - Output:
        sentence_embedding (tensor): tensor of shape n by 1024 where n is the
        number of sentences
    """
    if model_name == "bert":
        # Load pre-trained bert model (weights)
        model = BertModel.from_pretrained("bert-base-uncased",
                                          output_attentions=output_attention)
    elif model_name == "xlnet":
        # Load pre-trained xlnet model (weights)
        model = XLNetModel.from_pretrained("xlnet-base-cased",
                                           output_attentions=output_attention)
    elif model_name == "xlm":
        # Load pre-trained xlm model (weights)
        model = XLMModel.from_pretrained("xlm-mlm-en-2048",
                                         output_attentions=output_attention)
    elif model_name == "electra":
        # Load pre-trained electra model (weights)
        model = ElectraModel.from_pretrained(
            "google/electra-small-discriminator",
            output_attentions=output_attention)
    elif model_name == "albert":
        # Load pre-trained albert model (weights)
        # NB: AlbertForMaskedLM returns prediction logits, not hidden states,
        # as its first output, so the [CLS] slice below is over logits.
        model = AlbertForMaskedLM.from_pretrained(
            "albert-base-v2", output_attentions=output_attention)
    else:
        print("Please select an implemented model name. {} doesn't exist".format(model_name))
        return

    sentences_per_batch = batch_size

    # Setting up the device
    dev = "cuda:0" if torch.cuda.is_available() else "cpu"
    device = torch.device(dev)
    print("using ", device)

    # Put the model in "evaluation" mode, meaning feed-forward operation.
    model.eval()
    model.to(device)

    num_sentences = len(objectives)
    sentence_embedding = []
    attention_layers = None

    if num_sentences > sentences_per_batch:
        num_batches = num_sentences // sentences_per_batch
        for i in range(num_batches):
            start = i * sentences_per_batch
            end = (i + 1) * sentences_per_batch
            if i == num_batches - 1:
                end = num_sentences
            mini_objective = list(objectives[start:end])

            # Convert inputs to PyTorch tensors
            tokens_tensor = torch.tensor([mini_objective]).squeeze()
            tokens_tensor = tokens_tensor.to(device)

            # Predict hidden states features for each layer
            with torch.no_grad():
                encoded_layers = model(tokens_tensor)

            # Take embeddings of the last layer.
            # token_vecs is a tensor with shape [n x k x 1024]
            token_vecs = encoded_layers[0]

            # Take the vector corresponding to the [CLS] token if the model has one.
            if model_name in ["bert", "albert", "electra"]:
                sentence_embedding += token_vecs[:, 0, :].tolist()
            # For models without a [CLS] token, average all k token vectors.
            else:
                sentence_embedding += torch.mean(token_vecs, dim=1).tolist()

            if output_attention is True:
                attention_layer = [al.tolist() for al in encoded_layers[-1]]
                attention_layer = np.array(attention_layer)
                # The original checked len(attention_layers) == 0, which raises
                # TypeError on the initial None value; test for None instead.
                if attention_layers is None:
                    attention_layers = attention_layer
                else:
                    attention_layers = np.concatenate(
                        (attention_layers, attention_layer), axis=1)
            print("Embedding for batch {} out of {} batches Completed.".format(i, num_batches))
    else:
        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor([objectives]).squeeze()
        tokens_tensor = tokens_tensor.to(device)

        # Predict hidden states features for each layer
        with torch.no_grad():
            encoded_layers = model(tokens_tensor)

        # Take embeddings of the last layer.
        # token_vecs is a tensor with shape [n x k x 1024]
        token_vecs = encoded_layers[0]

        # Take the vector corresponding to the [CLS] token if the model has one.
        if model_name in ["bert", "albert", "electra"]:
            sentence_embedding = token_vecs[:, 0, :].tolist()
        # For models without a [CLS] token, average all k token vectors.
        else:
            sentence_embedding = torch.mean(token_vecs, dim=1).tolist()

        if output_attention is True:
            attention_layers = [al.tolist() for al in encoded_layers[-1]]
            attention_layers = np.array(attention_layers)

    print(
        "Our final sentence embedding vector of shape:",
        len(sentence_embedding),
        len(sentence_embedding[0]),
    )
    if output_attention:
        print("And the corresponding attention vector of shape:", attention_layers.shape)
    return sentence_embedding, attention_layers
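
# Hedged usage sketch: generate_embedding expects equal-length token-id
# sequences (it builds one rectangular tensor), so texts are tokenized and
# padded up front. pad_to_max_length matches the older transformers releases
# (2.x/3.x) whose tuple-style outputs these snippets use; max_length here is
# illustrative.
from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained("bert-base-uncased")
texts = ["granola bars", "chocolate chip cookies"]
objectives = [
    tok.encode(t, add_special_tokens=True, max_length=16, pad_to_max_length=True)
    for t in texts
]
embeddings, attentions = generate_embedding(objectives, "bert")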
def get_embedding(type_embedding, data):
    if type_embedding.split('_')[0] == 'BERT' or type_embedding.split('_')[0] == 'bert':
        if type_embedding == 'BERT_portuguese_large_neural_mind':
            path = '/home/jeanfranco/Movile_project/Semi_supervised_learning/data/Brazilian_Bert/BERT_large_portuguese/'
        elif type_embedding == 'BERT_portuguese_base_neural_mind':
            path = '/home/jeanfranco/Movile_project/Semi_supervised_learning/data/Brazilian_Bert/BERT_base_portuguese/'
        elif type_embedding == 'bert_base_multilingual_cased':
            path = 'bert-base-multilingual-cased'
        elif type_embedding == 'bert_base_multilingual_uncased':
            data = [x.lower() for x in data]
            path = 'bert-base-multilingual-uncased'
        # load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding.split('_')[0] == 'xlmroberta':
        if type_embedding == 'xlmroberta_base':
            path = 'xlm-roberta-base'
        elif type_embedding == 'xlmroberta_large':
            path = 'xlm-roberta-large'
        # load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'xlm':
        path = 'xlm-mlm-100-1280'
        # load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path, output_hidden_states=True)
    #########
    # ENGLISH
    #########
    elif type_embedding == 'en_bert_base_uncased':
        path = 'bert-base-uncased'
        # load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'en_xlm_mlm_enfr_1024':
        path = 'xlm-mlm-enfr-1024'
        # load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'en_xlm_roberta_base':
        path = 'xlm-roberta-base'
        # load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'distilbert_base_cased':
        path = 'distilbert-base-cased'
        # load tokenizer and model
        tokenizer = DistilBertTokenizer.from_pretrained(path)
        model = DistilBertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'Mobile_Bert':
        path = 'google/mobilebert-uncased'
        # load tokenizer and model
        tokenizer = MobileBertTokenizer.from_pretrained(path)
        model = MobileBertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'Electra':
        path = 'google/electra-small-discriminator'
        # load tokenizer and model
        tokenizer = ElectraTokenizer.from_pretrained(path)
        model = ElectraModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'BART':
        path = 'facebook/bart-large'
        # load tokenizer and model
        tokenizer = BartTokenizer.from_pretrained(path)
        model = BartModel.from_pretrained(path, output_hidden_states=True)

    # Set the device to GPU (cuda) if available, otherwise stick with CPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Hoisted out of the loop (behavior-identical: .to() and .eval() are idempotent).
    model = model.to(device)
    model.eval()

    list_of_four_last_embeddings = []
    list_of_mean = []

    for l in data:
        # Convert the string (e.g. "granola bars") to tokenized vocabulary IDs
        input_ids = tokenizer.encode(l)
        # Convert the list of IDs to a tensor of IDs
        input_ids = torch.LongTensor(input_ids)
        input_ids = input_ids.to(device)
        # unsqueeze IDs to get batch size of 1 as added dimension
        input_ids = input_ids.unsqueeze(0)
        with torch.no_grad():
            out = model(input_ids=input_ids)

        # We only want the hidden_states. The index assumes a BERT-style tuple
        # (last_hidden_state, pooler_output, hidden_states); models without a
        # pooler output (e.g. XLM) expose hidden_states at index 1 instead.
        if type_embedding == 'xlm':
            hidden_states = out[1]
        else:
            hidden_states = out[2]

        # mean of the last layer
        sentence_embedding = torch.mean(hidden_states[-1], dim=1).squeeze()
        list_of_mean.append(sentence_embedding.tolist())

        # get last four layers
        last_four_layers = [hidden_states[i] for i in (-1, -2, -3, -4)]
        # cast layers to a tuple and concatenate over the last dimension
        cat_hidden_states = torch.cat(tuple(last_four_layers), dim=-1)
        # take the mean of the concatenated vector over the token dimension
        cat_sentence_embedding = torch.mean(cat_hidden_states, dim=1).squeeze()
        list_of_four_last_embeddings.append(cat_sentence_embedding.tolist())

    return list_of_mean, list_of_four_last_embeddings
def tokenizer_and_model(type_embedding):
    #########
    # PORTUGUESE
    #########
    if type_embedding.split('_')[0] == 'BERT' or type_embedding.split('_')[0] == 'bert':
        if type_embedding == 'BERT_portuguese_large_neural_mind':
            path = '/home/jeanfarfan/bin/Semi_supervised_learning/data/Brazilian_Bert/BERT_large_portuguese/'
        elif type_embedding == 'BERT_portuguese_base_neural_mind':
            path = '/home/jeanfarfan/bin/Semi_supervised_learning/data/Brazilian_Bert/BERT_base_portuguese/'
        elif type_embedding == 'bert_base_multilingual_cased':
            path = 'bert-base-multilingual-cased'
        elif type_embedding == 'bert_base_multilingual_uncased':
            path = 'bert-base-multilingual-uncased'
        # load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
        # Register dialogue role markers as special tokens and grow the
        # embedding matrix to match.
        special_tokens_dict = {'additional_special_tokens': ['[USER]', '[SYSTEM]']}
        orig_num_tokens = len(tokenizer)
        num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
        total_num_tokens = orig_num_tokens + num_added_tokens
        model.resize_token_embeddings(total_num_tokens)
    elif type_embedding.split('_')[0] == 'xlmroberta':
        if type_embedding == 'xlmroberta_base':
            path = 'xlm-roberta-base'
        elif type_embedding == 'xlmroberta_large':
            path = 'xlm-roberta-large'
        # load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    elif type_embedding == 'xlm':
        path = 'xlm-mlm-100-1280'
        # load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    #########
    # ENGLISH
    #########
    elif type_embedding == 'en_bert_base_uncased':
        path = 'bert-base-uncased'
        # load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    elif type_embedding == 'en_xlm_mlm_enfr_1024':
        path = 'xlm-mlm-enfr-1024'
        # load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    elif type_embedding == 'en_xlm_roberta_base':
        path = 'xlm-roberta-base'
        # load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    elif type_embedding == 'distilbert_base_cased':
        path = 'distilbert-base-cased'
        # load tokenizer and model
        tokenizer = DistilBertTokenizer.from_pretrained(path)
        model = DistilBertModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    elif type_embedding == 'Mobile_Bert':
        path = 'google/mobilebert-uncased'
        # load tokenizer and model
        tokenizer = MobileBertTokenizer.from_pretrained(path)
        model = MobileBertModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    elif type_embedding == 'Electra':
        path = 'google/electra-small-discriminator'
        # load tokenizer and model
        tokenizer = ElectraTokenizer.from_pretrained(path)
        model = ElectraModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    elif type_embedding == 'BART':
        path = 'facebook/bart-large'
        # load tokenizer and model
        tokenizer = BartTokenizer.from_pretrained(path)
        model = BartModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    else:
        # Guard added: the original fell through to the return and raised
        # UnboundLocalError for unknown names.
        raise ValueError('Unrecognized type_embedding: {}'.format(type_embedding))
    return tokenizer, model
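
# Hedged usage sketch: with return_dict=True (transformers >= 3.x) outputs are
# accessed by attribute rather than tuple index. The embedding key name and
# the example sentence are illustrative.
import torch

tokenizer, model = tokenizer_and_model('en_bert_base_uncased')
enc = tokenizer('granola bars', return_tensors='pt')
with torch.no_grad():
    out = model(**enc)
layers = out.hidden_states  # tuple: embedding layer + one tensor per block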