def get_embedding(type_embedding, data):
    if type_embedding.split('_')[0] == 'BERT' or type_embedding.split('_')[0] == 'bert':
        if type_embedding == 'BERT_portuguese_large_neural_mind':
            path = '/home/jeanfranco/Movile_project/Semi_supervised_learning/data/Brazilian_Bert/BERT_large_portuguese/'
        elif type_embedding == 'BERT_portuguese_base_neural_mind':
            path = '/home/jeanfranco/Movile_project/Semi_supervised_learning/data/Brazilian_Bert/BERT_base_portuguese/'
        elif type_embedding == 'bert_base_multilingual_cased':
            path = 'bert-base-multilingual-cased'
        elif type_embedding == 'bert_base_multilingual_uncased':
            data = [x.lower() for x in data]
            path = 'bert-base-multilingual-uncased'
        # load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding.split('_')[0] == 'xlmroberta':
        if type_embedding == 'xlmroberta_base':
            path = 'xlm-roberta-base'
        elif type_embedding == 'xlmroberta_large':
            path = 'xlm-roberta-large'
        # load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'xlm':
        path = 'xlm-mlm-100-1280'
        # load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path, output_hidden_states=True)
    #########
    # ENGLISH
    #########
    elif type_embedding == 'en_bert_base_uncased':
        path = 'bert-base-uncased'
        # load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'en_xlm_mlm_enfr_1024':
        path = 'xlm-mlm-enfr-1024'
        # load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'en_xlm_roberta_base':
        path = 'xlm-roberta-base'
        # load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'distilbert_base_cased':
        path = 'distilbert-base-cased'
        # load tokenizer and model
        tokenizer = DistilBertTokenizer.from_pretrained(path)
        model = DistilBertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'Mobile_Bert':
        path = 'google/mobilebert-uncased'
        # load tokenizer and model
        tokenizer = MobileBertTokenizer.from_pretrained(path)
        model = MobileBertModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'Electra':
        path = 'google/electra-small-discriminator'
        # load tokenizer and model
        tokenizer = ElectraTokenizer.from_pretrained(path)
        model = ElectraModel.from_pretrained(path, output_hidden_states=True)
    elif type_embedding == 'BART':
        path = 'facebook/bart-large'
        # load tokenizer and model
        tokenizer = BartTokenizer.from_pretrained(path)
        model = BartModel.from_pretrained(path, output_hidden_states=True)

    # Set the device to GPU (cuda) if available, otherwise stick with CPU
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)
    model.eval()

    list_of_four_last_embeddings = []
    list_of_mean = []

    for l in data:
        # Convert the sentence to tokenized vocabulary IDs
        input_ids = tokenizer.encode(l)
        # Convert the list of IDs to a tensor of IDs
        input_ids = torch.LongTensor(input_ids).to(device)
        # unsqueeze IDs to get batch size of 1 as added dimension
        input_ids = input_ids.unsqueeze(0)
        with torch.no_grad():
            out = model(input_ids=input_ids)
        # we only want the hidden_states (their position in the output tuple differs per model)
        if type_embedding == 'xlm':
            hidden_states = out[1]
        else:
            hidden_states = out[2]
        # mean of the last layer over the token dimension
        sentence_embedding = torch.mean(hidden_states[-1], dim=1).squeeze()
        list_of_mean.append(sentence_embedding.tolist())
        # get last four layers
        last_four_layers = [hidden_states[i] for i in (-1, -2, -3, -4)]
        # cast layers to a tuple and concatenate over the last dimension
        cat_hidden_states = torch.cat(tuple(last_four_layers), dim=-1)
        # take the mean of the concatenated vector over the token dimension
        cat_sentence_embedding = torch.mean(cat_hidden_states, dim=1).squeeze()
        list_of_four_last_embeddings.append(cat_sentence_embedding.tolist())

    return list_of_mean, list_of_four_last_embeddings
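
# Hypothetical usage sketch (not from the original source): assumes the torch and
# transformers imports used by get_embedding are already in scope and the listed
# Hugging Face checkpoints can be downloaded.
example_sentences = ['The quick brown fox jumps over the lazy dog.',
                     'Sentence embeddings are averaged over tokens.']
mean_embs, last_four_embs = get_embedding('en_bert_base_uncased', example_sentences)
# mean_embs holds one 768-dim vector per sentence; last_four_embs concatenates the
# last four hidden layers, giving 4 * 768 = 3072 dims per sentence.
print(len(mean_embs), len(mean_embs[0]), len(last_four_embs[0]))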
def generate_embeddings(sentences, method='special_tokens',
                        target_dir_path='/ukp-storage-1/nbeck/evidence/data/',
                        batch_size=500, logging_level=logging.INFO):
    """
    Base method for generating embeddings. Handles the batch-wise calculation of them, thus enabling more
    efficient calculations in case errors bring a given batch to a halt. Resulting embeddings are stored in
    a file in the given directory.

    :param logging_level: desired level of logging (from the logging library)
    :param batch_size: number of sentences to be processed in one run/batch
    :param sentences: list or array of strings representing the DIRTY sentences, i.e. with markers for the lemmas
    :param method: how to calculate fixed-size word embeddings for the lemma (which might be tokenized as
        multiple tokens); can be 'special_tokens' or 'mean'
    :param target_dir_path: path of the directory where embeddings are supposed to be stored
    """
    # set up the logger
    logging.basicConfig(level=logging_level, format='%(asctime)s - %(message)s')
    logging.info('Beginning the calculation of embeddings.')

    # create an empty file
    file_name = 'word_plus_sen_embeddings_' + method + '_' + strftime("%Y-%m-%d %H:%M:%S", gmtime()) + '.npy'
    numpy.save(target_dir_path + file_name, [])

    # load sentence embeddings model (just) once rather than in every batch loop
    sen_embd_model = SentenceTransformer('distiluse-base-multilingual-cased')

    # load lemma embeddings model (just) once rather than every time the respective sub-function is called
    lemma_embd_model = XLMRobertaModel.from_pretrained('xlm-roberta-base')
    lemma_embd_model.eval()

    # load lemma embeddings tokenizer (just) once - you get it
    lemma_embd_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
    if method == 'special_tokens':
        lemma_embd_tokenizer = XLMRobertaTokenizer.from_pretrained(
            'xlm-roberta-base',
            additional_special_tokens=_extract_lemmas_from_sentences(sentences))

    # create a LongSentenceCounter
    lsc = LongSentenceCounter()

    # iterate over batches
    for i in range(int(numpy.ceil(len(sentences) / batch_size))):
        # determine batch length
        batch_len = numpy.min([batch_size, len(sentences) - i * batch_size])
        # determine current batch
        current_batch = sentences[i * batch_size:i * batch_size + batch_len]

        # generate contextualized word embedding for each lemma
        if method == 'special_tokens':
            lemma_embeddings = _generate_lemma_embs_with_special_tokens(
                current_batch, lemma_embd_model, lemma_embd_tokenizer, lsc)
        elif method == 'mean':
            lemma_embeddings = _generate_lemma_embs_with_mean(
                current_batch, lemma_embd_model, lemma_embd_tokenizer, lsc)
        else:
            raise Exception('Specified method of dealing with multi-token lemmas not found')

        # generate sentence embeddings for sentences
        sen_embeddings = sen_embd_model.encode(current_batch)

        # concatenate sentence and word embeddings
        current_embds = []
        for a, b in zip(sen_embeddings, lemma_embeddings):
            current_embds.append(numpy.append(a, b))

        # store/append embeddings to file
        arr = numpy.load(target_dir_path + file_name, allow_pickle=True)
        new_embds = numpy.append(arr, current_embds)
        numpy.save(target_dir_path + file_name, new_embds)

        # log that the current batch was stored successfully
        logging.info('Batch #' + str(i) + ' of embeddings successfully stored in '
                     + target_dir_path + file_name + '.')

    logging.info('ALL EMBEDDINGS CALCULATED SUCCESSFULLY!')
    lsc.show()
    lsc.reset()

    # reshape embeddings so they can be used more intuitively (up to this point they are one flat list)
    arr = numpy.load(target_dir_path + file_name, allow_pickle=True)
    arr = arr.reshape(int(len(arr) / 1280), 1280)
    numpy.save(target_dir_path + file_name, arr)
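
# Hypothetical usage sketch (not part of the original module): the target directory,
# the sentences, and the <b>...</b> lemma-marker syntax below are made-up placeholders;
# use whatever marker convention the "dirty" sentences actually carry.
example_sentences = [
    'The committee <b>approved</b> the proposal .',
    'She <b>approves</b> of the decision .',
]
generate_embeddings(example_sentences, method='mean', target_dir_path='/tmp/', batch_size=2)
# The resulting .npy file holds one 1280-dim vector per sentence, matching the reshape
# above: 512 dims from the SentenceTransformer model plus 768 dims from XLM-R base.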
def __init__(self):
    super(TranslationModel, self).__init__()
    self.encoder = XLMRobertaModel.from_pretrained('xlm-roberta-base')
    self.linear = nn.Linear(768, 768)
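
# Hypothetical forward pass (the original snippet only shows __init__): a minimal
# sketch assuming the model projects the encoder's mean-pooled token states through
# the linear layer; the actual TranslationModel may differ.
def forward(self, input_ids, attention_mask=None):
    encoder_out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
    token_states = encoder_out[0]         # (batch, seq_len, 768)
    pooled = token_states.mean(dim=1)     # (batch, 768)
    return self.linear(pooled)            # (batch, 768)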
import codecs
import json
import logging
import os

import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from torch.utils.data.dataloader import default_collate
from tqdm import tqdm
from transformers import Trainer, TrainingArguments, get_linear_schedule_with_warmup
from transformers import XLMRobertaTokenizer, XLMRobertaModel
from nltk.tokenize import word_tokenize
from torchsummary import summary

import utils

bert_tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base', is_split_into_words=True)
bert_model = XLMRobertaModel.from_pretrained('xlm-roberta-base')
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained("./model")


def load_json(json_file):
    with codecs.open(json_file, 'r', encoding='utf-8') as f:
        data_list = json.load(f)
    data_dict = {}
    for data in data_list:
        data_dict[data['id']] = data
    return data_dict


def extrace_range(string):
    ranges = []
def tokenizer_and_model(type_embedding):
    #########
    # PORTUGUESE
    #########
    if type_embedding.split('_')[0] == 'BERT' or type_embedding.split('_')[0] == 'bert':
        if type_embedding == 'BERT_portuguese_large_neural_mind':
            path = '/home/jeanfarfan/bin/Semi_supervised_learning/data/Brazilian_Bert/BERT_large_portuguese/'
        elif type_embedding == 'BERT_portuguese_base_neural_mind':
            path = '/home/jeanfarfan/bin/Semi_supervised_learning/data/Brazilian_Bert/BERT_base_portuguese/'
        elif type_embedding == 'bert_base_multilingual_cased':
            path = 'bert-base-multilingual-cased'
        elif type_embedding == 'bert_base_multilingual_uncased':
            path = 'bert-base-multilingual-uncased'
        # load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
        special_tokens_dict = {'additional_special_tokens': ['[USER]', '[SYSTEM]']}
        orig_num_tokens = len(tokenizer)
        num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)
        total_num_tokens = orig_num_tokens + num_added_tokens
        model.resize_token_embeddings(total_num_tokens)
    elif type_embedding.split('_')[0] == 'xlmroberta':
        if type_embedding == 'xlmroberta_base':
            path = 'xlm-roberta-base'
        elif type_embedding == 'xlmroberta_large':
            path = 'xlm-roberta-large'
        # load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    elif type_embedding == 'xlm':
        path = 'xlm-mlm-100-1280'
        # load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    #########
    # ENGLISH
    #########
    elif type_embedding == 'en_bert_base_uncased':
        path = 'bert-base-uncased'
        # load tokenizer and model
        tokenizer = BertTokenizer.from_pretrained(path)
        model = BertModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    elif type_embedding == 'en_xlm_mlm_enfr_1024':
        path = 'xlm-mlm-enfr-1024'
        # load tokenizer and model
        tokenizer = XLMTokenizer.from_pretrained(path)
        model = XLMModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    elif type_embedding == 'en_xlm_roberta_base':
        path = 'xlm-roberta-base'
        # load tokenizer and model
        tokenizer = XLMRobertaTokenizer.from_pretrained(path)
        model = XLMRobertaModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    elif type_embedding == 'distilbert_base_cased':
        path = 'distilbert-base-cased'
        # load tokenizer and model
        tokenizer = DistilBertTokenizer.from_pretrained(path)
        model = DistilBertModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    elif type_embedding == 'Mobile_Bert':
        path = 'google/mobilebert-uncased'
        # load tokenizer and model
        tokenizer = MobileBertTokenizer.from_pretrained(path)
        model = MobileBertModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    elif type_embedding == 'Electra':
        path = 'google/electra-small-discriminator'
        # load tokenizer and model
        tokenizer = ElectraTokenizer.from_pretrained(path)
        model = ElectraModel.from_pretrained(path, output_hidden_states=True, return_dict=True)
    elif type_embedding == 'BART':
        path = 'facebook/bart-large'
        # load tokenizer and model
        tokenizer = BartTokenizer.from_pretrained(path)
        model = BartModel.from_pretrained(path, output_hidden_states=True, return_dict=True)

    return tokenizer, model
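
# Hypothetical usage sketch (not from the original source): assumes torch and the
# transformers classes used above are in scope. It encodes a short dialogue turn with
# the [USER]/[SYSTEM] special tokens added in the BERT branch and reads the hidden
# states from the return_dict output.
tokenizer, model = tokenizer_and_model('bert_base_multilingual_cased')
inputs = tokenizer('[USER] hello [SYSTEM] how can I help?', return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)
# outputs.hidden_states is a tuple with one tensor per layer plus the embedding layer,
# because output_hidden_states=True was passed to from_pretrained above.
sentence_vector = outputs.hidden_states[-1].mean(dim=1)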
def __init__(self):
    super(xlmr_Classifier, self).__init__()
    self.xlmr = XLMRobertaModel.from_pretrained("xlm-roberta-large")
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.xlmr.config.hidden_size, 2)
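
# Hypothetical forward pass (the original snippet only shows __init__): a common
# pattern is to mean-pool the last hidden state, apply dropout, and project to the
# two output classes; the original classifier's pooling choice may differ.
def forward(self, input_ids, attention_mask=None):
    outputs = self.xlmr(input_ids=input_ids, attention_mask=attention_mask)
    pooled = outputs[0].mean(dim=1)        # (batch, hidden_size)
    return self.out(self.drop(pooled))     # (batch, 2)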
    args.model_name[:args.model_name.index("-")],
    args.model_name[args.model_name.index("-") + 1:],
)
if model_type == "roberta":
    embed = RobertaWordPieceEncoder(model_dir_or_name=args.model_name,
                                    requires_grad=True,
                                    num_aspect=1)
elif model_type == "bert":
    embed = BertWordPieceEncoder(model_dir_or_name=args.model_name,
                                 requires_grad=True)
elif model_type == "xlnet":
    embed = XLNetModel.from_pretrained(pretrained_model_name_or_path=args.model_name)
elif model_type == "xlmroberta":
    embed = XLMRobertaModel.from_pretrained(pretrained_model_name_or_path=args.model_name)


class AspectModel(nn.Module):
    def __init__(self, embed, dropout, num_classes, pool="max"):
        super().__init__()
        assert pool in ("max", "mean")
        self.embed = embed
        self.embed_dropout = nn.Dropout(dropout)
        if hasattr(embed, "embedding_dim"):
            embed_size = embed.embedding_dim
        else:
            embed_size = embed.config.hidden_size
        self.ffn = nn.Sequential(
            nn.Linear(embed_size, embed_size),
            nn.ReLU(),
from transformers import GPT2Model, GPT2Tokenizer, XLMRobertaModel, XLMRobertaTokenizer

# The Gulordava LSTM model can be found here:
# https://drive.google.com/open?id=1w47WsZcZzPyBKDn83cMNd0Hb336e-_Sy

# Initializing the LSTM
lstm, lstm_vocab = make_pretrained_lstm_and_tokenizer()

# Initializing the Transformer
trans_model_type: str = config.feature_model_type if config.feature_model_type != 'LSTM' else config.default_trans_model_type

if 'distilgpt2' in trans_model_type:
    trans_model = GPT2Model.from_pretrained(trans_model_type)
    trans_tokenizer = GPT2Tokenizer.from_pretrained(trans_model_type)
elif 'xlm-roberta-base' in trans_model_type:
    trans_model = XLMRobertaModel.from_pretrained(trans_model_type)
    trans_tokenizer = XLMRobertaTokenizer.from_pretrained(trans_model_type)

# %% [markdown]
# # Data
#
# For this assignment you will train your probes on __treebank__ corpora. A treebank is a corpus that has been *parsed* and stored in a representation that allows the parse tree to be recovered. Next to a parse tree, treebanks also often contain information about part-of-speech tags, which is exactly what we are after now.
#
# The treebank you will use for now is part of the Universal Dependencies project. I provide a sample of this treebank as well, so you can test your setup on that before moving on to larger amounts of data.
#
# Make sure you accustom yourself to the format that is created by the `conllu` library that parses the treebank files before moving on. For example, make sure you understand how you can access the POS tag of a token, or how to cope with the tree structure that is formed using the `to_tree()` functionality.

# %%
from typing import Callable, Dict, List, Optional, Union

from conllu import parse_incr, TokenList
from data_tools.data_inits import parse_corpus
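
# %% [markdown]
# A quick, illustrative example (not part of the original assignment text) of the `conllu`
# format mentioned above: parsing a tiny hard-coded sentence, reading a token's POS tag,
# and recovering the tree with `to_tree()`.

# %%
from conllu import parse

sample_conllu = """# text = The dog barks .
1\tThe\tthe\tDET\tDT\t_\t2\tdet\t_\t_
2\tdog\tdog\tNOUN\tNN\t_\t3\tnsubj\t_\t_
3\tbarks\tbark\tVERB\tVBZ\t_\t0\troot\t_\t_
4\t.\t.\tPUNCT\t.\t_\t3\tpunct\t_\t_
"""
token_list = parse(sample_conllu)[0]
print(token_list[1]['form'], token_list[1]['upos'])   # dog NOUN
tree = token_list.to_tree()                           # TokenTree rooted at "barks"
print(tree.token['form'], [child.token['form'] for child in tree.children])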
def __init__(
    self, vocabs: Dict[str, Vocabulary], config: Config, pre_load_model: bool = True
):
    super().__init__(config=config)

    if pre_load_model:
        self.xlm_roberta = XLMRobertaModel.from_pretrained(
            self.config.model_name, output_hidden_states=True
        )
    else:
        xlm_roberta_config = XLMRobertaConfig.from_pretrained(
            self.config.model_name, output_hidden_states=True
        )
        self.xlm_roberta = XLMRobertaModel(xlm_roberta_config)

    self.vocabs = {
        const.TARGET: vocabs[const.TARGET],
        const.SOURCE: vocabs[const.SOURCE],
    }

    self.mlp = None
    if self.config.use_mlp:
        self.mlp = nn.Sequential(
            nn.Linear(self.xlm_roberta.config.hidden_size, self.config.hidden_size),
            nn.Tanh(),
        )
        output_size = self.config.hidden_size
    else:
        output_size = self.xlm_roberta.config.hidden_size

    sentence_size = output_size
    if config.pooling == 'mixed':
        sentence_size *= 2

    self.scalar_mix = ScalarMixWithDropout(
        mixture_size=self.xlm_roberta.config.num_hidden_layers + 1,  # +1 for embeddings
        do_layer_norm=self.config.scalar_mix_layer_norm,
        dropout=self.config.scalar_mix_dropout,
    )

    self._sizes = {
        const.TARGET: output_size,
        const.TARGET_LOGITS: output_size,
        const.TARGET_SENTENCE: sentence_size,
        const.SOURCE: output_size,
    }

    self.output_embeddings = self.xlm_roberta.embeddings.word_embeddings

    self._training_steps_ran = 0
    self._is_frozen = False
    if self.config.freeze:
        logger.info(
            'Freezing XLMRoberta encoder weights; training will not update them'
        )
        for param in self.xlm_roberta.parameters():
            param.requires_grad = False
        self._is_frozen = True
    if self.config.freeze_for_number_of_steps > 0:
        # Done inside `forward()` to guarantee we can unfreeze (if optimizer is
        # built after this, we cannot unfreeze without calling
        # `optimizer.add_param_group({'params': self.xlm.parameters()})`)
        pass
def create(cls,
           model_type='camem',
           model_name="camembert-base",
           embedding_size=768,
           hidden_dim=512,
           rnn_layers=1,
           lstm_dropout=0.5,
           device="cuda",
           mode="weighted",
           key_dim=64,
           val_dim=64,
           num_heads=3,
           attn_dropout=0.3,
           self_attention=False,
           is_require_grad=False):
    configuration = {
        'model_type': model_type,
        "model_name": model_name,
        "device": device,
        "mode": mode,
        "self_attention": self_attention,
        "is_freeze": is_require_grad
    }

    if 'camem' in model_type:
        config_bert = CamembertConfig.from_pretrained(model_name, output_hidden_states=True)
        model = CamembertModel.from_pretrained(model_name, config=config_bert)
        model.to(device)
    elif 'flaubert' in model_type:
        config_bert = FlaubertConfig.from_pretrained(model_name, output_hidden_states=True)
        model = FlaubertModel.from_pretrained(model_name, config=config_bert)
        model.to(device)
    elif 'XLMRoberta' in model_type:
        config_bert = XLMRobertaConfig.from_pretrained(model_name, output_hidden_states=True)
        model = XLMRobertaModel.from_pretrained(model_name, config=config_bert)
        model.to(device)
    elif 'M-Bert' in model_type:
        config_bert = BertConfig.from_pretrained(model_name, output_hidden_states=True)
        model = BertModel.from_pretrained(model_name, config=config_bert)
        model.to(device)

    lstm = BiLSTM.create(embedding_size=embedding_size,
                         hidden_dim=hidden_dim,
                         rnn_layers=rnn_layers,
                         dropout=lstm_dropout)
    attn = MultiHeadAttention(key_dim, val_dim, hidden_dim, num_heads, attn_dropout)

    model.train()
    self = cls(model=model, config=configuration, lstm=lstm, attn=attn)
    # if is_freeze: self.freeze()
    return self
def __init__(self, model_dir_or_name: str, vocab: Vocabulary, layers: str = '-1',
             pool_method: str = 'first', include_cls_sep: bool = False,
             pooled_cls: bool = False, auto_truncate: bool = True, min_freq=1,
             only_use_pretrain_bpe=False, truncate_embed=True):
    super().__init__()

    self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_dir_or_name)
    self.encoder = XLMRobertaModel.from_pretrained(model_dir_or_name)
    self.encoder.resize_token_embeddings(len(self.tokenizer))
    self._max_position_embeddings = self.encoder.config.max_position_embeddings - 2
    encoder_layer_number = len(self.encoder.encoder.layer)

    if isinstance(layers, list):
        self.layers = [int(l) for l in layers]
    elif isinstance(layers, str):
        self.layers = list(map(int, layers.split(',')))
    else:
        raise TypeError("`layers` only supports str or list[int]")

    for layer in self.layers:
        if layer < 0:
            assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                                                   f"a bert model with {encoder_layer_number} layers."
        else:
            assert layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                                                  f"a bert model with {encoder_layer_number} layers."

    assert pool_method in ('avg', 'max', 'first', 'last')
    self.pool_method = pool_method
    self.include_cls_sep = include_cls_sep
    self.pooled_cls = pooled_cls
    self.auto_truncate = auto_truncate

    # logger.info("Start to generate word pieces for word.")
    # word_piece_dict = {'<s>': 1, '</s>': 1}
    # found_count = 0
    # new_add_to_bpe_vocab = 0
    # unsegment_count = 0
    # if "<s>" in vocab:
    #     warnings.warn("<s> detected in your vocabulary. RobertaEmbedding will add <s> and </s> to the begin "
    #                   "and end of the input automatically, make sure you don't add <s> and </s> at the begin"
    #                   " and end.")
    # unique = []
    # for word, index in vocab:
    #     word_pieces = []
    #     word_pieces.extend(self.tokenizer.tokenize(word))  # , add_prefix_space=True))
    #     if len(word_pieces) > 0:
    #         word_token_ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
    #         if 3 in word_token_ids:
    #             if word_pieces[word_token_ids.index(3)] not in unique:
    #                 unique.append(word_pieces[word_token_ids.index(3)])
    #                 unsegment_count += 1
    #         if not vocab._is_word_no_create_entry(word):
    #             if index != vocab.unknown_idx and word_pieces[0] == '<unk>':
    #                 if vocab.word_count[word] >= min_freq and not vocab._is_word_no_create_entry(
    #                         word) and not only_use_pretrain_bpe:
    #                     word_piece_dict[word] = 1
    #                     new_add_to_bpe_vocab += 1
    #                 unsegment_count += 1
    #                 continue
    #     found_count += 1
    #     for word_piece in word_pieces:
    #         word_piece_dict[word_piece] = 1
    # if unsegment_count > 0:
    #     logger.info(f"{unsegment_count} words are unsegmented.")

    word_to_wordpieces = []
    word_pieces_lengths = []
    for word, index in vocab:
        if index == vocab.padding_idx:
            word = '<pad>'
        elif index == vocab.unknown_idx:
            word = '<unk>'
        word_pieces = self.tokenizer.tokenize(word)
        word_pieces = self.tokenizer.convert_tokens_to_ids(word_pieces)
        word_to_wordpieces.append(word_pieces)
        word_pieces_lengths.append(len(word_pieces))

    self._cls_index = self.tokenizer.convert_tokens_to_ids('<s>')
    self._sep_index = self.tokenizer.convert_tokens_to_ids('</s>')
    self._word_pad_index = vocab.padding_idx
    self._wordpiece_pad_index = self.tokenizer.convert_tokens_to_ids('<pad>')
    self.word_to_wordpieces = np.array(word_to_wordpieces)
    self.register_buffer('word_pieces_lengths', torch.LongTensor(word_pieces_lengths))
    self.encoder.resize_token_embeddings(len(self.tokenizer))
    logger.debug("Successfully generate word pieces.")