def __init__(self):
    super(TextRank, self).__init__()  # the call parentheses were missing, so __init__ was never invoked
    self.bert_embedding = Make_Embedding(
        tok=CamembertTokenizer(
            'C:/Users/theo.roudil-valentin/Documents/Resume/MLSUM/MLSUM_tokenizer.model'
        ),
        cpu=psutil.cpu_count())
    self.camem = CamembertModel(CamembertConfig())
def __init__(self):  # ,num_labels=2): we have no labels here, so no need for them
    super(CustomCamembert, self).__init__()
    self.camembert = CamembertModel.from_pretrained("camembert-base")
    self.dropout = nn.Dropout(.05)
    self.classifier = nn.Linear(768, )  # out_features is missing here and still needs to be chosen
    self.MHA = nn.MultiheadAttention(embed_dim=768, num_heads=8)
def __init__(self, auto_model: str, auto_path: str):
    super().__init__()
    if "camembert" in auto_model:
        from transformers import CamembertModel, CamembertTokenizer
        self.auto_embeddings = CamembertModel.from_pretrained(auto_path)
        self.auto_tokenizer = CamembertTokenizer.from_pretrained(auto_path)
    elif "flaubert" in auto_model:
        from transformers import XLMModel, XLMTokenizer
        self.auto_embeddings = XLMModel.from_pretrained(auto_path)
        self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
        self.auto_tokenizer.do_lowercase_and_remove_accent = False
    elif "xlm" in auto_model:
        from transformers import XLMModel, XLMTokenizer
        self.auto_embeddings = XLMModel.from_pretrained(auto_path)
        self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
    elif "bert" in auto_model:
        from transformers import BertModel, BertTokenizer
        self.auto_embeddings = BertModel.from_pretrained(auto_path)
        self.auto_tokenizer = BertTokenizer.from_pretrained(auto_path)
    else:
        from transformers import AutoModel, AutoTokenizer, XLMTokenizer
        self.auto_embeddings = AutoModel.from_pretrained(auto_path)
        self.auto_tokenizer = AutoTokenizer.from_pretrained(auto_path)
        if isinstance(self.auto_tokenizer, XLMTokenizer):
            self.auto_tokenizer.do_lowercase_and_remove_accent = False

    # freeze the pretrained encoder
    for param in self.auto_embeddings.parameters():
        param.requires_grad = False
    self._is_fixed = True
    self._output_dim = self.auto_embeddings.config.hidden_size
    self._begin_special_token_count = self.get_begin_special_token_count()
    self._padding_id = self.auto_tokenizer.pad_token_id
def __init__(self, bert_model, num_classes):
    super(UmbertoCustom, self).__init__()
    self.encoder = CamembertModel.from_pretrained(bert_model)
    self.classifier = nn.Sequential(
        nn.Dropout(p=0.1),
        nn.Linear(768, 768),
        nn.Tanh(),
        nn.Dropout(p=0.1),
        nn.Linear(768, num_classes))
    self.loss = nn.CrossEntropyLoss()
def __init__( self, lang: str = "en", ): try: from transformers import (AlbertModel, AlbertTokenizer, BertConfig, BertJapaneseTokenizer, BertModel, CamembertModel, CamembertTokenizer) except ImportError: msg = "importing bert dep failed." msg += "\n try to install sister by `pip install sister[bert]`." raise ImportError(msg) if lang == "en": model_name = "albert-base-v2" tokenizer = AlbertTokenizer.from_pretrained(model_name) config = BertConfig.from_pretrained(model_name, output_hidden_states=True) model = AlbertModel.from_pretrained(model_name, config=config) elif lang == "fr": model_name = "camembert-base" tokenizer = CamembertTokenizer.from_pretrained(model_name) config = BertConfig.from_pretrained(model_name, output_hidden_states=True) model = CamembertModel.from_pretrained(model_name, config=config) elif lang == "ja": model_name = "cl-tohoku/bert-base-japanese-whole-word-masking" tokenizer = BertJapaneseTokenizer.from_pretrained(model_name) config = BertConfig.from_pretrained(model_name, output_hidden_states=True) model = BertModel.from_pretrained(model_name, config=config) self.tokenizer = tokenizer self.model = model
def __init__(
    self,
    lang: str = 'en',
):
    try:
        from transformers import BertJapaneseTokenizer, AlbertTokenizer, CamembertTokenizer, AutoTokenizer
        from transformers import AlbertModel, BertModel, CamembertModel, AutoModel  # BertModel is needed by the "ja" branch
    except ImportError:
        msg = "importing bert dep failed."
        msg += "\n try to install sister by `pip install sister[bert]`."
        raise ImportError(msg)

    if lang == "en":
        tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
        model = AlbertModel.from_pretrained("albert-base-v2")
    elif lang == "fr":
        tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
        model = CamembertModel.from_pretrained("camembert-base")
    elif lang == "es":
        tokenizer = AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")
        model = AutoModel.from_pretrained("dccuchile/bert-base-spanish-wwm-uncased")
    elif lang == "ja":
        tokenizer = BertJapaneseTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
        model = BertModel.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")

    self.tokenizer = tokenizer
    self.model = model
def test_output_embeds_base_model(self):
    model = CamembertModel.from_pretrained("camembert-base")
    model.to(torch_device)

    input_ids = torch.tensor(
        [[5, 121, 11, 660, 16, 730, 25543, 110, 83, 6]],
        device=torch_device,
        dtype=torch.long,
    )  # J'aime le camembert !
    output = model(input_ids)[0]
    expected_shape = torch.Size((1, 10, 768))
    self.assertEqual(output.shape, expected_shape)
    # compare the actual values for a slice.
    expected_slice = torch.tensor(
        [[[-0.0254, 0.0235, 0.1027],
          [0.0606, -0.1811, -0.0418],
          [-0.1561, -0.1127, 0.2687]]],
        device=torch_device,
        dtype=torch.float,
    )
    # camembert = torch.hub.load('pytorch/fairseq', 'camembert.v0')
    # camembert.eval()
    # expected_slice = roberta.model.forward(input_ids)[0][:, :3, :3].detach()
    self.assertTrue(
        torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
def load(self):
    """
    Load Camembert model from FAIR repo
    :return:
    """
    self.tokenizer = CamembertTokenizer.from_pretrained(MODEL_NAME)
    self.model = CamembertModel.from_pretrained(MODEL_NAME)
    self.model.eval()
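# A minimal usage sketch (not part of the original class): once load() has been called,
# mean-pooling the last hidden state over the attention mask is one common way to turn
# the CamembertModel output into a single sentence vector. The name `loader`, the helper
# name and the pooling choice are assumptions for illustration only.
import torch

def embed_sentence(loader, sentence: str) -> torch.Tensor:
    # tokenize and run the frozen model without tracking gradients
    inputs = loader.tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        last_hidden = loader.model(**inputs)[0]               # (1, seq_len, 768)
    mask = inputs["attention_mask"].unsqueeze(-1)             # (1, seq_len, 1)
    return (last_hidden * mask).sum(dim=1) / mask.sum(dim=1)  # (1, 768)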
def __init__(
        self,
        camem=CamembertModel(CamembertConfig()),
        cosim=torch.nn.CosineSimilarity(dim=-1)
) -> None:
    super(BERTScore, self).__init__()  # the call parentheses were missing, so __init__ was never invoked
    self.make_embedding = TextRank().make_embedding_bert
    self.camem = camem
    self.cosim = cosim
def sentence_embeddings(self):
    if not self.finetuned_bert:
        tokenizer = CamembertTokenizer.from_pretrained(stg.REGULAR_CAMEMBERT)
        model = CamembertModel.from_pretrained(stg.REGULAR_CAMEMBERT)
    else:
        tokenizer = AutoTokenizer.from_pretrained(stg.FINED_TUNED_CAMEMBERT)
        model = CamembertModel.from_pretrained(stg.FINED_TUNED_CAMEMBERT)

    if torch.cuda.is_available():
        print('====== Cuda is Available, GPU will be used for this task ======')
        torch.cuda.empty_cache()
        model.cuda()
        device = torch.device("cuda")

    embedding_all_text = []
    number_sentences = len(self.sentences)
    for i in tqdm(range(0, number_sentences, self.batch_size)):
        # Build the next batch; the last, possibly shorter, batch takes the remaining sentences.
        if (i + self.batch_size) < number_sentences:
            batch = self.sentences[i:i + self.batch_size]
        else:
            batch = self.sentences[i:]
        encoded_input = self.get_batch_sentence_tokens(batch, tokenizer)

        if torch.cuda.is_available():
            encoded_input.to(device)
        with torch.no_grad():
            model_output = model(**encoded_input)
        sentence_embeddings_tensor = self.mean_pooling(
            model_output, encoded_input['attention_mask'])
        embedding_all_text.append(sentence_embeddings_tensor)

        if torch.cuda.is_available():
            del encoded_input
            del sentence_embeddings_tensor
            torch.cuda.empty_cache()

    sentence_embeddings = self.torch_to_array(embedding_all_text)
    return sentence_embeddings
def __init__(self, device):  # args, , load_pretrained_bert=False, bert_config=None):
    super(Summarizer, self).__init__()
    self.device = device
    self.bert = CamembertModel.from_pretrained("camembert-base")
    # BertModel.from_pretrained('bert-base-uncased')
    # Bert(args.temp_dir, load_pretrained_bert, bert_config)
    self.encoder = Classifier(self.bert.config.hidden_size)
    self.select_sent = select_sent
    # self.score=confusion_output
    self.to(device)
def __init__(self, model_name_or_path: str, max_seq_length: int = 128,
             do_lower_case: Optional[bool] = None, model_args: Dict = {},
             tokenizer_args: Dict = {}):
    super(CamemBERT, self).__init__()
    self.config_keys = ['max_seq_length', 'do_lower_case']
    self.do_lower_case = do_lower_case

    if max_seq_length > 511:
        logging.warning("CamemBERT only allows a max_seq_length of 511 (514 with special tokens). Value will be set to 511")
        max_seq_length = 511
    self.max_seq_length = max_seq_length

    if self.do_lower_case is not None:
        tokenizer_args['do_lower_case'] = do_lower_case

    self.camembert = CamembertModel.from_pretrained(model_name_or_path, **model_args)
    self.tokenizer = CamembertTokenizer.from_pretrained(model_name_or_path, **tokenizer_args)
def load_initial_model():
    configuration = get_configuration()
    config = CamembertConfig(
        vocab_size=configuration["vocab_size"],
        hidden_size=configuration["hidden_size"],
        num_hidden_layers=configuration["num_hidden_layers"],
        intermediate_size=configuration["intermediate_size"],
        hidden_act=configuration["hidden_act"],
        hidden_dropout_prob=configuration["hidden_dropout_prob"],
        attention_probs_dropout_prob=configuration["attention_probs_dropout_prob"],
        max_position_embeddings=configuration["max_position_embeddings"],
        type_vocab_size=configuration["type_vocab_size"])
    cam_model = CamembertModel(config)
    return cam_model
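# Hypothetical sketch of the dictionary get_configuration() is expected to return
# (the original implementation is not shown). The values below simply mirror the
# published camembert-base hyperparameters and are illustrative, not the project's own settings.
def get_configuration():
    return {
        "vocab_size": 32005,
        "hidden_size": 768,
        "num_hidden_layers": 12,
        "intermediate_size": 3072,
        "hidden_act": "gelu",
        "hidden_dropout_prob": 0.1,
        "attention_probs_dropout_prob": 0.1,
        "max_position_embeddings": 514,
        "type_vocab_size": 1,
    }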
def __init__(
        self,
        file_service: FileService,
        device: str,
        pretrained_representations_options: PretrainedRepresentationsOptions):
    super().__init__()

    self._device = device

    self.do_not_save: bool = (
        not pretrained_representations_options.fine_tune_pretrained and
        not pretrained_representations_options.fine_tune_after_convergence)

    self._include_pretrained = pretrained_representations_options.include_pretrained_model
    self._pretrained_model_size = pretrained_representations_options.pretrained_model_size
    self._pretrained_weights = pretrained_representations_options.pretrained_weights
    self._pretrained_max_length = pretrained_representations_options.pretrained_max_length
    self._pretrained_model: PreTrainedModel = None

    self._fine_tune_pretrained = pretrained_representations_options.fine_tune_pretrained
    self._fine_tune_after_convergence = pretrained_representations_options.fine_tune_after_convergence

    self._include_fasttext_model = pretrained_representations_options.include_fasttext_model

    if self._include_pretrained and self._pretrained_model_size and self._pretrained_weights:
        if pretrained_representations_options.pretrained_model == PretrainedModel.BERT:
            self._pretrained_model = BertModel.from_pretrained(
                pretrained_representations_options.pretrained_weights)
        elif pretrained_representations_options.pretrained_model == PretrainedModel.CamemBERT:
            self._pretrained_model = CamembertModel.from_pretrained(
                pretrained_representations_options.pretrained_weights)

        if pretrained_representations_options.fine_tune_pretrained:
            self._pretrained_model.train()
        else:
            self._pretrained_model.eval()

    if self._include_fasttext_model:
        assert pretrained_representations_options.fasttext_model is not None, 'fast text model is not supplied when include-fasttext-model is set to true'

        data_path = file_service.get_initial_data_path()
        fasttext_path = os.path.join(
            data_path, 'fasttext',
            pretrained_representations_options.fasttext_model)
        assert os.path.exists(
            fasttext_path), f'fast text model not found in {fasttext_path}'

        self._fasttext_dimension = pretrained_representations_options.fasttext_model_size
        self._fasttext_model = fasttext.load_model(fasttext_path)
def __init__(self, model_name_or_path: str, max_seq_length: int = 128,
             do_lower_case: bool = True):
    super(CamemBERT, self).__init__()
    self.config_keys = ['max_seq_length', 'do_lower_case']
    self.do_lower_case = do_lower_case

    if max_seq_length > 511:
        logging.warning(
            "CamemBERT only allows a max_seq_length of 511 (514 with special tokens). Value will be set to 511"
        )
        max_seq_length = 511
    self.max_seq_length = max_seq_length

    self.camembert = CamembertModel.from_pretrained(model_name_or_path)
    self.tokenizer = CamembertTokenizer.from_pretrained(
        model_name_or_path, do_lower_case=do_lower_case)
    self.cls_token_id = self.tokenizer.convert_tokens_to_ids(
        [self.tokenizer.cls_token])[0]
    self.sep_token_id = self.tokenizer.convert_tokens_to_ids(
        [self.tokenizer.sep_token])[0]
def create(cls,
           model_type='camem',
           model_name="camembert-base",
           embedding_size=768,
           hidden_dim=512,
           rnn_layers=1,
           lstm_dropout=0.5,
           device="cuda",
           mode="weighted",
           key_dim=64,
           val_dim=64,
           num_heads=3,
           attn_dropout=0.3,
           self_attention=False,
           is_require_grad=False):
    configuration = {
        'model_type': model_type,
        "model_name": model_name,
        "device": device,
        "mode": mode,
        "self_attention": self_attention,
        "is_freeze": is_require_grad
    }
    if 'camem' in model_type:
        config_bert = CamembertConfig.from_pretrained(
            model_name, output_hidden_states=True)
        model = CamembertModel.from_pretrained(model_name, config=config_bert)
        model.to(device)
    elif 'flaubert' in model_type:
        config_bert = FlaubertConfig.from_pretrained(
            model_name, output_hidden_states=True)
        model = FlaubertModel.from_pretrained(model_name, config=config_bert)
        model.to(device)
    elif 'XLMRoberta' in model_type:
        config_bert = XLMRobertaConfig.from_pretrained(
            model_name, output_hidden_states=True)
        model = XLMRobertaModel.from_pretrained(model_name, config=config_bert)
        model.to(device)
    elif 'M-Bert' in model_type:
        config_bert = BertConfig.from_pretrained(model_name,
                                                 output_hidden_states=True)
        model = BertModel.from_pretrained(model_name, config=config_bert)
        model.to(device)

    lstm = BiLSTM.create(embedding_size=embedding_size,
                         hidden_dim=hidden_dim,
                         rnn_layers=rnn_layers,
                         dropout=lstm_dropout)
    attn = MultiHeadAttention(key_dim, val_dim, hidden_dim, num_heads,
                              attn_dropout)
    model.train()
    self = cls(model=model, config=configuration, lstm=lstm, attn=attn)
    # if is_freeze: self.freeze()
    return self
# ####
# bc = BertConfig(output_hidden_states=True)
# bert = BertModel(bc)
# bert = bert.from_pretrained("bert-base-uncased")
# bert = bert.from_pretrained("bert-base-multilingual-cased")
# bert = bert.eval()
# bert = bert.to(device)
# bert_tok = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
# ####
# FR #
# ####
cbc = CamembertConfig(output_hidden_states=True)
bert = CamembertModel(cbc)
bert = bert.from_pretrained("camembert-base")  # assign the result, otherwise `bert` keeps its random initialization
bert = bert.eval()
bert = bert.to(device)
bert_tok = CamembertTokenizer.from_pretrained("camembert-base")

app = Flask(__name__)

# create a StackedEmbedding object that combines glove and forward/backward flair embeddings
SIZE_EMBED = -1
adaptive_pool = nn.AdaptiveAvgPool1d(SIZE_EMBED) if SIZE_EMBED > 0 else None
embedder = ELMoEmbeddings("small")
# bert_model_or_path="distilbert-base-uncased",
# pooling_operation="mean", use_scalar_mix=True)
def __init__(self, model_dir_or_name: str, vocab: Vocabulary, layers: str = '-1',
             pool_method: str = 'first', include_cls_sep: bool = False,
             pooled_cls: bool = False, auto_truncate: bool = True, min_freq=1,
             only_use_pretrain_bpe=False, truncate_embed=True):
    super().__init__()

    self.tokenizer = CamembertTokenizer.from_pretrained(model_dir_or_name)
    self.encoder = CamembertModel.from_pretrained(model_dir_or_name)
    self.encoder.resize_token_embeddings(len(self.tokenizer))
    self._max_position_embeddings = self.encoder.config.max_position_embeddings - 2

    encoder_layer_number = len(self.encoder.encoder.layer)
    if isinstance(layers, list):
        self.layers = [int(l) for l in layers]
    elif isinstance(layers, str):
        self.layers = list(map(int, layers.split(',')))
    else:
        raise TypeError("`layers` only supports str or list[int]")
    for layer in self.layers:
        if layer < 0:
            assert -layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                f"a bert model with {encoder_layer_number} layers."
        else:
            assert layer <= encoder_layer_number, f"The layer index:{layer} is out of scope for " \
                f"a bert model with {encoder_layer_number} layers."

    assert pool_method in ('avg', 'max', 'first', 'last')
    self.pool_method = pool_method
    self.include_cls_sep = include_cls_sep
    self.pooled_cls = pooled_cls
    self.auto_truncate = auto_truncate

    logger.info("Start to generate word pieces for word.")
    word_piece_dict = {'<s>': 1, '</s>': 1}
    found_count = 0
    new_add_to_bpe_vocab = 0
    unsegment_count = 0
    if "<s>" in vocab:
        warnings.warn("<s> detected in your vocabulary. RobertaEmbedding will add <s> and </s> to the begin "
                      "and end of the input automatically, make sure you don't add <s> and </s> at the begin"
                      " and end.")
    unique = []
    for word, index in vocab:
        word_pieces = []
        word_pieces.extend(self.tokenizer.tokenize(word))  # , add_prefix_space=True))
        word_token_ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        if 3 in word_token_ids:
            if word_pieces[word_token_ids.index(3)] not in unique:
                unique.append(word_pieces[word_token_ids.index(3)])
                unsegment_count += 1
        if not vocab._is_word_no_create_entry(word):
            if index != vocab.unknown_idx and word_pieces[0] == '<unk>':
                if vocab.word_count[word] >= min_freq and not vocab._is_word_no_create_entry(
                        word) and not only_use_pretrain_bpe:
                    word_piece_dict[word] = 1
                    new_add_to_bpe_vocab += 1
                unsegment_count += 1
                continue
            found_count += 1
            for word_piece in word_pieces:
                word_piece_dict[word_piece] = 1
    if unsegment_count > 0:
        logger.info(f"{unsegment_count} words are unsegmented.")

    word_to_wordpieces = []
    word_pieces_lengths = []
    for word, index in vocab:
        if index == vocab.padding_idx:
            word = '<pad>'
        elif index == vocab.unknown_idx:
            word = '<unk>'
        word_pieces = self.tokenizer.tokenize(word)
        word_pieces = self.tokenizer.convert_tokens_to_ids(word_pieces)
        word_to_wordpieces.append(word_pieces)
        word_pieces_lengths.append(len(word_pieces))
    self._cls_index = self.tokenizer.convert_tokens_to_ids('<s>')
    self._sep_index = self.tokenizer.convert_tokens_to_ids('</s>')
    self._word_pad_index = vocab.padding_idx
    self._wordpiece_pad_index = self.tokenizer.convert_tokens_to_ids('<pad>')
    self.word_to_wordpieces = np.array(word_to_wordpieces)
    self.register_buffer('word_pieces_lengths', torch.LongTensor(word_pieces_lengths))
    self.encoder.resize_token_embeddings(len(self.tokenizer))
    logger.debug("Successfully generate word pieces.")
def __init__(self, text_ptm_dir, text_ft_dim, num_classes):
    super(TextOnly, self).__init__()
    self.camembert = CamembertModel.from_pretrained(text_ptm_dir)
    self.classifier = nn.Linear(text_ft_dim, num_classes, bias=False)
def main():
    if args.input_file:
        input_corpus = readfile(args.input_file)
    else:
        print("Could not find the input file !")
        exit()
    if args.output_file:
        output_file = open(args.output_file, "w")
    else:
        print("Could not find the output file !")
        exit()

    tokenizer = CamembertTokenizer.from_pretrained('camembert/camembert-base-ccnet')
    # load model
    model = CamembertModel.from_pretrained('camembert/camembert-base-ccnet')

    # read data
    for line in input_corpus:
        line = line.rstrip()
        sent = line.split(' ')
        # encode() automatically adds the classification token <s>
        token_ids = tokenizer.encode(line)
        tokens = [tokenizer._convert_id_to_token(idx) for idx in token_ids]
        # unsqueeze token_ids because batch_size=1
        token_ids = torch.tensor(token_ids).unsqueeze(0)
        # forward method returns a tuple (we only want the logits : last output layer)
        # squeeze() because batch_size=1
        output = model(token_ids)[0].squeeze()
        # output[0] is the output of the CLS token (<s>), which is the first token and can be considered as the sentence embedding
        # map each word to its embedding, taking into account words split into several subword pieces
        embeddings = len(sent) * [[]]
        words = len(sent) * [""]
        l = -1
        for i in range(1, len(output) - 1):
            if re.match("^▁", tokens[i]):
                # a new word starts: write the previous word and its vector to file
                if l != -1:
                    if len(embeddings[l]) > 1:
                        em = numpy.array(embeddings[l])
                        word_vec = list(numpy.sum(em, axis=0))
                    else:
                        word_vec = embeddings[l][0]
                    output_file.write(words[l] + " " + " ".join([str(x) for x in word_vec]) + "\n")
                l = l + 1
                print(i, l, tokens[i])
                words[l] = re.sub("▁", "", tokens[i])
                embeddings[l] = [output[i].detach().numpy()]
            else:
                # continuation of the current word: accumulate subword pieces
                print(i, l, tokens[i])
                words[l] = words[l] + tokens[i]
                print(words[l])
                embeddings[l].append(output[i].detach().numpy())
        # flush the last word of the line
        if len(embeddings[l]) > 1:
            em = numpy.array(embeddings[l])
            word_vec = list(numpy.sum(em, axis=0))
        else:
            word_vec = embeddings[l][0]
        output_file.write(words[l] + " " + " ".join([str(x) for x in word_vec]) + "\n")
        output_file.write("\n")
    pred = classifier.predict(cls_embedding)
    top_emoji = emoji_list[np.argmax(pred)]
    top3_emojis = [emoji_list[k] for k in np.argpartition(pred[0], -3)[-3:].tolist()]
    top3_emojis = ' '.join(top3_emojis)
    top_proba = max(pred[0])
    top3_proba = [pred[0][k] for k in np.argpartition(pred[0], -3)[-3:].tolist()]
    print('\nPrediction successfully run.\n')
    return render_template('index.html', top1=top_emoji, top3=top3_emojis,
                           top1proba=top_proba, top3proba=top3_proba)


if __name__ == "__main__":
    # load model
    print('Loading model...')
    tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
    embedder = CamembertModel.from_pretrained('camembert-base')
    classifier = keras.models.load_model("../models/100k_sentences_averaged_embedding.h5")
    print('\nModel successfully loaded!\n')
    # load emoji list
    with open('top100.txt') as f:
        emoji_list = ast.literal_eval(f.read())
    app.run(host="0.0.0.0", port=5000, debug=True)
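# Sketch of how `cls_embedding` (used by the view above but computed elsewhere) might be
# derived from the loaded CamembertModel. Taking the <s> token's last hidden state matches
# the variable name, although the saved model's filename hints the training embedding may
# have been averaged instead; the function name and this choice are assumptions.
import torch

def compute_cls_embedding(text: str):
    encoded = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        last_hidden = embedder(**encoded)[0]   # (1, seq_len, 768)
    return last_hidden[:, 0, :].numpy()        # (1, 768) vector fed to the Keras classifier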
# We will test two different embeddings: CamemBERT and Word2Vec.
# Others are obviously possible.

# We will train a tokenizer:
ME = Make_Extractive(cpu=cpu)
tok = ME.make_tokenizer(Art, 12000, 'MLSUM_tokenizer', name='MLSUM_tokenizer')

# We already have a trained tokenizer, so we will reuse it:
# We feed our SentencePiece model into the CamemBERT tokenizer
#%%
from transformers import CamembertTokenizer
tok = CamembertTokenizer('MLSUM_tokenizer.model')
# %%
# We load the BERT model
from transformers import CamembertModel, BertModel, RobertaModel
camem = CamembertModel.from_pretrained("camembert-base")
camem.config.hidden_size
#%%
camem.eval()
encod = tok(articles[0][0:1])
input_id = torch.tensor(encod['input_ids'])
att_mask = torch.tensor(encod['attention_mask'])
# %%
embedding = camem(input_id, att_mask)
# %%


class Make_Embedding():
    def __init__(self, tok=None, cpu=None) -> None:
        super(Make_Embedding, self).__init__()  # the call parentheses were missing
        self.tokenizer = tok
# This env variable will disable SSL verification
# for requests (because of Milliman SSL)
# https://stackoverflow.com/questions/48391750/disable-python-requests-ssl-validation-for-an-imported-module
os.environ["CURL_CA_BUNDLE"] = ""

# disable warnings
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

BASE_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..")
DATA_PATH = os.path.join(BASE_PATH, "input/tweets.csv")
MODEL_PATH = os.path.join(BASE_PATH, "output/camembert")  # MODEL SAVE PATH

# Set Pytorch device
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

EPOCHS = 10
MAX_LEN = 128
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 4
WEIGHT_DECAY = 0.01

TOKENIZER = CamembertTokenizer.from_pretrained(
    "camembert-base",
    do_lower_case=True,
    verify=False,
)
CAMEMBERT_MODEL = CamembertModel.from_pretrained("camembert-base")

# ? valid batch size different from train batch size
# ? BCEWithLogit more than BCE
    b, _ = self.make_embedding(article, self.camem)
    b = torch.stack(b)
    VSA = b.mean(dim=0)
    score = self.cosim(VSA, b)
    return score

def make_summary(self, article, k=3):
    score = self.make_score(article)
    score = score.topk(k=k)[1]
    resume = [article[i] for i in score]
    return resume


#%%
from transformers import CamembertModel, CamembertConfig
camem = CamembertModel(CamembertConfig())  # .from_pretrained("camembert-base")
#%%
i = 1
TR = TextRank()
b, d = TR.make_embedding_bert(Art[i], camem)
b
#%%
mb = TR.mat_sim(b)
sb = TR.scores(mb)
sb = [s[0] for s in sb]
#%%
w = TR.make_embedding_W2V(Art[0], W2V)
print(len(w), w[0].shape)
mw = TR.mat_sim(w)
sw = TR.scores(mw)
sw = [s[0] for s in sw]
def test_camembert_embeddings():
    camembert_model: str = "camembert-base"

    tokenizer = CamembertTokenizer.from_pretrained(camembert_model)
    model = CamembertModel.from_pretrained(
        pretrained_model_name_or_path=camembert_model, output_hidden_states=True
    )
    model.to(flair.device)
    model.eval()

    s: str = "J'aime le camembert !"

    with torch.no_grad():
        tokens = tokenizer.tokenize("<s>" + s + "</s>")

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        hidden_states = model(tokens_tensor)[-1]

        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    #   0      1     2     3       4      5       6       7     8      9
    #
    # '<s>', '▁J', "'", 'aime', '▁le', '▁ca', 'member', 't', '▁!', '</s>'
    #           \    |    /       |      \       |      /     |
    #              J'aime         le         camembert        !
    #
    #                0            1              2            3

    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        embeddings = CamembertEmbeddings(
            pretrained_model_name_or_path=camembert_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)

        return flair_sentence

    # First subword embedding
    sentence_first_subword = embed_sentence(sentence=s, pooling_operation="first")

    camembert_first_subword_embedding_ref = first_layer[5].tolist()
    camembert_first_subword_embedding_actual = sentence_first_subword.tokens[2].embedding.tolist()

    assert camembert_first_subword_embedding_ref == camembert_first_subword_embedding_actual

    # Last subword embedding
    sentence_last_subword = embed_sentence(sentence=s, pooling_operation="last")

    camembert_last_subword_embedding_ref = first_layer[7].tolist()
    camembert_last_subword_embedding_actual = sentence_last_subword.tokens[2].embedding.tolist()

    assert camembert_last_subword_embedding_ref == camembert_last_subword_embedding_actual

    # First and last subword embedding
    sentence_first_last_subword = embed_sentence(sentence=s, pooling_operation="first_last")

    camembert_first_last_subword_embedding_ref = torch.cat(
        [first_layer[5], first_layer[7]]).tolist()
    camembert_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[2].embedding.tolist()

    assert camembert_first_last_subword_embedding_ref == camembert_first_last_subword_embedding_actual

    # Mean of all subword embeddings
    sentence_mean_subword = embed_sentence(sentence=s, pooling_operation="mean")

    camembert_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[5], first_layer[6], first_layer[7]]).tolist()
    camembert_mean_subword_embedding_actual = sentence_mean_subword.tokens[2].embedding.tolist()

    assert camembert_mean_subword_embedding_ref == camembert_mean_subword_embedding_actual

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(
        sentence="Paris", pooling_operation="first", layers="1,2,3,4"
    )

    ref_embedding_size = 4 * 768
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(
        sentence="TGW",
        pooling_operation="first",
        layers="1,2,3,4",
        use_scalar_mix=True,
    )

    ref_embedding_size = 1 * 768
    actual_embedding_size = len(sentence_mult_layers_scalar_mix.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size
def extractHiddenState():
    transformer = CamembertModel.from_pretrained("camembert-base")
    return ExtractHiddenState(transformer)
def __init__(self, pretrained_model_name='camembert-base'):
    super(CamembertClassifier, self).__init__()
    self.encoder = CamembertModel.from_pretrained(pretrained_model_name, output_attentions=True)
    self.cls_layer = nn.Linear(self.encoder.pooler.dense.out_features, 5)