def __init__(
        self,
        lang: str = 'en',
):
    try:
        from transformers import (BertJapaneseTokenizer, AlbertTokenizer,
                                  CamembertTokenizer, AutoTokenizer)
        from transformers import (AlbertModel, BertModel, CamembertModel,
                                  AutoModel)
    except ImportError:
        msg = "Importing the BERT dependencies failed."
        msg += "\nTry installing sister with `pip install sister[bert]`."
        raise ImportError(msg)

    if lang == "en":
        tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
        model = AlbertModel.from_pretrained("albert-base-v2")
    elif lang == "fr":
        tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
        model = CamembertModel.from_pretrained("camembert-base")
    elif lang == "es":
        tokenizer = AutoTokenizer.from_pretrained(
            "dccuchile/bert-base-spanish-wwm-uncased")
        model = AutoModel.from_pretrained(
            "dccuchile/bert-base-spanish-wwm-uncased")
    elif lang == "ja":
        tokenizer = BertJapaneseTokenizer.from_pretrained(
            "cl-tohoku/bert-base-japanese-whole-word-masking")
        # BertModel was used here but missing from the imports in the
        # original snippet; it is imported above now.
        model = BertModel.from_pretrained(
            "cl-tohoku/bert-base-japanese-whole-word-masking")
    else:
        # Without this, tokenizer/model would be unbound for other languages.
        raise ValueError("Unsupported language: {}".format(lang))

    self.tokenizer = tokenizer
    self.model = model
def load_tokenizer(self):
    # Load the tokenizer matching self.model_name. The tokenizer classes are
    # assumed to be imported at module level from `transformers`.
    if self.verbose:
        print('Loading {} tokenizer...'.format(self.model_name))
    if self.model_name == 'bert':
        self.tokenizer = BertTokenizer.from_pretrained(self.model_type,
                                                       do_lower_case=True)
    elif self.model_name == 'distilbert':
        self.tokenizer = DistilBertTokenizer.from_pretrained(
            self.model_type, do_lower_case=True)
    elif self.model_name == 'albert':
        self.tokenizer = AlbertTokenizer.from_pretrained(
            self.model_type, do_lower_case=True)
    elif self.model_name == 'bart':
        self.tokenizer = BartTokenizer.from_pretrained(self.model_type,
                                                       do_lower_case=True)
    elif self.model_name == 'xlnet':
        self.tokenizer = XLNetTokenizer.from_pretrained(self.model_type,
                                                        do_lower_case=True)
    elif self.model_name == 'roberta':
        self.tokenizer = RobertaTokenizer.from_pretrained(
            self.model_type, do_lower_case=True)
    elif self.model_name == 'camenbert':  # key kept as spelled by the caller
        self.tokenizer = CamembertTokenizer.from_pretrained(
            self.model_type, do_lower_case=True)
    elif self.model_name == 'flaubert':
        self.tokenizer = FlaubertTokenizer.from_pretrained(
            self.model_type, do_lower_case=True)
    elif self.model_name == 'gpt2':
        self.tokenizer = GPT2Tokenizer.from_pretrained(self.model_type)
def run_generation(text):
    tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
    tokenized_text = tokenizer.encode(text, add_special_tokens=True)
    print('\nTokenized text: %s (%s)' %
          (tokenized_text, tokenizer.decode(tokenized_text)))
    input_ids = torch.tensor(tokenized_text).unsqueeze(0)

    model = CamembertForMaskedLM.from_pretrained("camembert-base",
                                                 resume_download=True)
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids)

    masked_index = (input_ids.squeeze() ==
                    tokenizer.mask_token_id).nonzero().item()
    print('Masked index: %d (%s)\n' % (masked_index, tokenizer.mask_token))

    last_hidden_states = outputs[0]
    logits = last_hidden_states[0, masked_index, :]
    prob = logits.softmax(dim=0)
    values, indices = prob.topk(k=5, dim=0)
    return list(zip(
        [tokenizer.decode([x]) for x in indices],
        [round(v.item(), 2) for v in values],
    ))
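# Hedged usage sketch for run_generation above: CamemBERT's mask token is
# literally "<mask>", so passing a French sentence containing it returns the
# top-5 candidate fills with rounded probabilities.
if __name__ == "__main__":
    print(run_generation("Le camembert est <mask> :)"))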
def __init__(self):
    super(TextRank, self).__init__()  # the original was missing the call parentheses
    self.bert_embedding = Make_Embedding(
        tok=CamembertTokenizer(
            'C:/Users/theo.roudil-valentin/Documents/Resume/MLSUM/MLSUM_tokenizer.model'
        ),
        cpu=psutil.cpu_count())
    self.camem = CamembertModel(CamembertConfig())
def __init__(self, auto_model: str, auto_path: str):
    super().__init__()
    if "camembert" in auto_model:
        from transformers import CamembertModel, CamembertTokenizer
        self.auto_embeddings = CamembertModel.from_pretrained(auto_path)
        self.auto_tokenizer = CamembertTokenizer.from_pretrained(auto_path)
    elif "flaubert" in auto_model:
        from transformers import XLMModel, XLMTokenizer
        self.auto_embeddings = XLMModel.from_pretrained(auto_path)
        self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
        self.auto_tokenizer.do_lowercase_and_remove_accent = False
    elif "xlm" in auto_model:
        from transformers import XLMModel, XLMTokenizer
        self.auto_embeddings = XLMModel.from_pretrained(auto_path)
        self.auto_tokenizer = XLMTokenizer.from_pretrained(auto_path)
    elif "bert" in auto_model:
        from transformers import BertModel, BertTokenizer
        self.auto_embeddings = BertModel.from_pretrained(auto_path)
        self.auto_tokenizer = BertTokenizer.from_pretrained(auto_path)
    else:
        from transformers import AutoModel, AutoTokenizer, XLMTokenizer
        self.auto_embeddings = AutoModel.from_pretrained(auto_path)
        self.auto_tokenizer = AutoTokenizer.from_pretrained(auto_path)
        if isinstance(self.auto_tokenizer, XLMTokenizer):
            self.auto_tokenizer.do_lowercase_and_remove_accent = False

    for param in self.auto_embeddings.parameters():
        param.requires_grad = False
    self._is_fixed = True
    self._output_dim = self.auto_embeddings.config.hidden_size
    self._begin_special_token_count = self.get_begin_special_token_count()
    self._padding_id = self.auto_tokenizer.pad_token_id
def get_features(input_text, extractor):
    # REMINDER: uncomment dependencies in requirements.txt needed for the
    # feature extractor.
    if extractor == 'wangchanberta':
        from transformers import CamembertTokenizer, pipeline

        # Create tokenizer & feature extractor.
        tokenizer = CamembertTokenizer.from_pretrained(
            'airesearch/wangchanberta-base-att-spm-uncased', revision='main')
        tokenizer.additional_special_tokens = [
            '<s>NOTUSED', '</s>NOTUSED', '<_>'
        ]
        feature_extractor = pipeline(
            task='feature-extraction',
            tokenizer=tokenizer,
            model='airesearch/wangchanberta-base-att-spm-uncased',
            revision='main')

        # Get features from the last 4 states.
        input_text = input_text[:415]
        last_k = 4
        hidden_states = feature_extractor(input_text)[0]
        last_k_layers = [hidden_states[-i] for i in range(1, last_k + 1)]
        cat_hidden_states = np.array(sum(last_k_layers, []))
        return cat_hidden_states[None, :]
    else:
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer('paraphrase-xlm-r-multilingual-v1')
        # The original passed the builtin `input` here instead of the argument.
        return model.encode(input_text)
def __init__(
        self,
        lang: str = "en",
):
    try:
        from transformers import (AlbertModel, AlbertTokenizer, BertConfig,
                                  BertJapaneseTokenizer, BertModel,
                                  CamembertModel, CamembertTokenizer)
    except ImportError:
        msg = "Importing the BERT dependencies failed."
        msg += "\nTry installing sister with `pip install sister[bert]`."
        raise ImportError(msg)

    if lang == "en":
        model_name = "albert-base-v2"
        tokenizer = AlbertTokenizer.from_pretrained(model_name)
        config = BertConfig.from_pretrained(model_name,
                                            output_hidden_states=True)
        model = AlbertModel.from_pretrained(model_name, config=config)
    elif lang == "fr":
        model_name = "camembert-base"
        tokenizer = CamembertTokenizer.from_pretrained(model_name)
        config = BertConfig.from_pretrained(model_name,
                                            output_hidden_states=True)
        model = CamembertModel.from_pretrained(model_name, config=config)
    elif lang == "ja":
        model_name = "cl-tohoku/bert-base-japanese-whole-word-masking"
        tokenizer = BertJapaneseTokenizer.from_pretrained(model_name)
        config = BertConfig.from_pretrained(model_name,
                                            output_hidden_states=True)
        model = BertModel.from_pretrained(model_name, config=config)
    else:
        # Without this, tokenizer/model would be unbound for other languages.
        raise ValueError("Unsupported language: {}".format(lang))

    self.tokenizer = tokenizer
    self.model = model
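# Hedged usage sketch for the French branch above (not the author's API; the
# wrapper class name is not visible in this snippet, so the tokenizer/model
# pair is exercised directly). Mean-pooling the last hidden state gives a
# simple 768-dimensional sentence vector.
import torch
from transformers import CamembertModel, CamembertTokenizer

_tok = CamembertTokenizer.from_pretrained("camembert-base")
_mdl = CamembertModel.from_pretrained("camembert-base")
_inputs = _tok("Bonjour tout le monde", return_tensors="pt")
with torch.no_grad():
    _sentence_vec = _mdl(**_inputs)[0].mean(dim=1)  # shape (1, 768)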
def predict_camembert(df: pd.DataFrame) -> pd.DataFrame:
    """
    Predict the sentiment of reviews.

    :param df: dataframe with reviews
    :return: dataframe with a sentiment prediction per review
    """
    # Concatenate title and comment into a single text column.
    df['space'] = ' '
    df['comments'] = df[['titre', 'space', 'comment']].fillna('').sum(axis=1)
    df = df.dropna(subset=['comments'], axis="rows")
    comments = df['comments'].to_list()

    # Load the fine-tuned camemBERT weights (MAX_LEN and device are
    # module-level globals in the original script).
    state_dict = torch.load("camemBERT_38000_state_dict.pt",
                            map_location=torch.device('cpu'))
    model = CamembertForSequenceClassification.from_pretrained(
        'camembert-base', num_labels=2, state_dict=state_dict)

    # Initialize CamemBERT tokenizer
    tokenizer = CamembertTokenizer.from_pretrained('camembert-base',
                                                   do_lower_case=True)

    # Encode the comments
    tokenized_comments_ids = [
        tokenizer.encode(comment, add_special_tokens=True, max_length=MAX_LEN)
        for comment in comments
    ]
    # Pad the resulting encoded comments
    tokenized_comments_ids = pad_sequences(tokenized_comments_ids,
                                           maxlen=MAX_LEN,
                                           dtype="long",
                                           truncating="post",
                                           padding="post")

    # Create attention masks
    attention_masks = []
    for seq in tokenized_comments_ids:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)

    prediction_inputs = torch.tensor(tokenized_comments_ids)
    prediction_masks = torch.tensor(attention_masks)

    predictions = []
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        outputs = model(prediction_inputs.to(device),
                        token_type_ids=None,
                        attention_mask=prediction_masks.to(device))
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        predictions.extend(np.argmax(logits, axis=1).flatten())

    df = pd.DataFrame(
        data={
            "site": df["site"],
            "date": df["date"],
            "review": df["review"],
            "sentiment": predictions
        })
    return df
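# Hedged usage sketch for predict_camembert: the column names below mirror
# those referenced in the function ('titre', 'comment', 'site', 'date',
# 'review'); MAX_LEN, device, and the fine-tuned state-dict file are assumed
# to exist as in the original script.
if __name__ == "__main__":
    reviews = pd.DataFrame({
        "titre": ["Très bon"],
        "comment": ["Je recommande."],
        "site": ["example"],
        "date": ["2021-01-01"],
        "review": ["Très bon Je recommande."],
    })
    print(predict_camembert(reviews))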
def __init__(
    self,
    pretrained_embedding=None,
    architecture_function=None,
    text_input_column="clean_text",
    meta_input_list=("extension", "dayofweek", "hour", "min"),
    vocab_size=25000,
    seq_size=100,
    embedding_dim=200,
    loss="categorical_crossentropy",
    activation="softmax",
    batch_size=4096,
    n_epochs=15,
    bert_tokenizer="jplu/tf-camembert-base",
    bert_model="jplu/tf-camembert-base",
    **kwargs,
):
    self.architecture_function = architecture_function
    self.pretrained_embedding = pretrained_embedding
    if self.architecture_function.__name__ != "bert_model":
        self.tokenizer = Tokenizer(input_column=text_input_column)
    elif "camembert" in bert_tokenizer.lower():
        # Defer the HuggingFace dependency until it is actually needed.
        try:
            from transformers import CamembertTokenizer
            self.tokenizer = CamembertTokenizer.from_pretrained(bert_tokenizer)
        except ModuleNotFoundError:
            # The original raised a bare string, which is itself a TypeError.
            raise ModuleNotFoundError(
                "Please install transformers 3.4.0 (only version currently "
                "supported): pip install melusine[transformers]")
    elif "flaubert" in bert_tokenizer.lower():
        try:
            from transformers import XLMTokenizer
            self.tokenizer = XLMTokenizer.from_pretrained(bert_tokenizer)
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                "Please install transformers 3.4.0 (only version currently "
                "supported): pip install melusine[transformers]")
    else:
        raise NotImplementedError(
            "Bert tokenizer {} not implemented".format(bert_tokenizer))
    self.text_input_column = text_input_column
    self.meta_input_list = meta_input_list
    self.vocab_size = vocab_size
    self.seq_size = seq_size
    self.embedding_dim = embedding_dim
    self.loss = loss
    self.activation = activation
    self.batch_size = batch_size
    self.n_epochs = n_epochs
    self.bert_model = bert_model
    self.nb_labels = 0
    self.nb_meta_features = 0
    self.vocabulary = []
    self.vocabulary_dict = {}
def load(self):
    """Load the CamemBERT model from the FAIR repo."""
    self.tokenizer = CamembertTokenizer.from_pretrained(MODEL_NAME)
    self.model = CamembertModel.from_pretrained(MODEL_NAME)
    self.model.eval()
def get_roberta_tokenizer(pretrained_cfg_name: str, do_lower_case: bool = True):
    # Still uses the HF tokenizer code, since the tokenizers are the same.
    if "camembert" in pretrained_cfg_name:
        return CamembertTokenizer.from_pretrained(pretrained_cfg_name,
                                                  do_lower_case=do_lower_case)
    return RobertaTokenizer.from_pretrained(pretrained_cfg_name,
                                            do_lower_case=do_lower_case)
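# Hedged usage sketch: get_roberta_tokenizer routes any config name containing
# "camembert" to CamembertTokenizer and everything else to RobertaTokenizer.
fr_tokenizer = get_roberta_tokenizer("camembert-base")
en_tokenizer = get_roberta_tokenizer("roberta-base")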
def encode_tweets(tweets, max_length=MAX_LENGTH):
    # The original called a bare `Tokenizer`; CamembertTokenizer is the class
    # matching the "camembert-base" checkpoint.
    tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
    encoded = np.zeros(shape=(len(tweets), max_length), dtype=np.int32)
    for i, tweet in enumerate(tweets):
        # Truncate so long tweets cannot overflow the row.
        tweet_encoded = tokenizer.encode(tweet)[:max_length]
        encoded[i][:len(tweet_encoded)] = tweet_encoded
    return encoded
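# Hedged usage sketch, assuming MAX_LENGTH is defined at module level: each
# row of the returned matrix is a right-padded (zero-filled) sequence of
# token ids.
sample_tweets = ["J'adore ce film", "Quelle horreur..."]
encoded_tweets = encode_tweets(sample_tweets, max_length=128)
print(encoded_tweets.shape)  # (2, 128)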
def __init__(self, language):
    self.language = language
    if self.language == "French":
        self.tokenizer = CamembertTokenizer.from_pretrained(
            Transformers[self.language])
    else:
        self.tokenizer = BertTokenizer.from_pretrained(
            Transformers[self.language])
def main():
    # Argument parser
    parser = argparse.ArgumentParser(
        prog="train_mlm_camembert_thai.py",
        description="train mlm for Camembert with huggingface Trainer",
    )
    parser.add_argument("--bpe_tokenizer",
                        type=str,
                        default='sentencepiece',
                        choices=['byte_level', 'char', 'sentencepiece'],
                        help='Specify the name of the BPE tokenizer')
    parser.add_argument("--vocab_size", type=int, default=52000)
    parser.add_argument("--min_frequency", type=int, default=2)
    parser.add_argument("--train_dir", type=str)
    parser.add_argument("--output_dir", type=str)
    parser.add_argument("--ext", type=str, default='.txt')
    args = parser.parse_args()

    fnames = [str(x) for x in glob.glob(f"{args.train_dir}/*{args.ext}")]

    # Initialize a tokenizer (`choices` above guarantees one branch matches).
    if args.bpe_tokenizer == 'byte_level':
        tokenizer = ByteLevelBPETokenizer()
    elif args.bpe_tokenizer == 'char':
        tokenizer = CharBPETokenizer()
    else:
        tokenizer = SentencePieceBPETokenizer()

    # Customize training
    tokenizer.train(files=fnames,
                    vocab_size=args.vocab_size,
                    min_frequency=args.min_frequency,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ])

    # Save files to disk
    tokenizer.save_model(args.output_dir)

    # Smoke-test the saved tokenizer.
    tokenizer = CamembertTokenizer.from_pretrained(args.output_dir)
    print(tokenizer.encode_plus('สวัสดีครับ hello world'))
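# Hedged invocation sketch for the trainer above (the paths are placeholders,
# not from the original script), plus the usual entry-point guard in case the
# original file does not already include one:
#   python train_mlm_camembert_thai.py \
#       --bpe_tokenizer sentencepiece \
#       --train_dir ./corpus --output_dir ./tokenizer_out --ext .txt
if __name__ == "__main__":
    main()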
def __init__(self, sentences, NUM_CLASS, seed_val=42, random_state=2018,
             evaluate_score=flat_accuracy):
    self.seed_val = seed_val
    self.random_state = random_state
    self.evaluate_score = evaluate_score
    self.NUM_CLASS = NUM_CLASS

    print('Loading CamemBERT tokenizer...')
    # Load the CamemBERT tokenizer (the original snippet mistakenly pointed
    # at the 'bert-base-uncased' checkpoint).
    tokenizer = CamembertTokenizer.from_pretrained('camembert-base',
                                                   do_lower_case=True)
    max_length = min(
        512,
        np.max(np.array([
            len(tokenizer.encode(s, add_special_tokens=True))
            for s in sentences
        ])))
    print('Set max_length as: ', max_length)
def __init__(self, DIR, filename):
    self.path = os.path.join(DIR, filename)
    self.device = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")
    self.tokenizer = CamembertTokenizer.from_pretrained(config["BERT_MODEL"])
    classifier = CamembertForSequenceClassification.from_pretrained(
        config['BERT_MODEL'], num_labels=len(config["CLASS_NAMES"]))
    classifier.load_state_dict(
        torch.load(self.path, map_location=self.device))
    classifier = classifier.eval()
    self.classifier = classifier.to(self.device)
def __init__(self):
    self.model_name = "airesearch/wangchanberta-base-att-spm-uncased"
    self.target_tokenizer = CamembertTokenizer
    self.tokenizer = CamembertTokenizer.from_pretrained(self.model_name,
                                                        revision='main')
    self.tokenizer.additional_special_tokens = [
        '<s>NOTUSED', '</s>NOTUSED', '<_>'
    ]
    self.fill_mask = pipeline(task='fill-mask',
                              tokenizer=self.tokenizer,
                              model=self.model_name,
                              revision='main')
    self.MASK_TOKEN = self.tokenizer.mask_token
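# Hedged usage sketch; `ThaiMaskFiller` is a hypothetical name for the class
# whose __init__ is shown above (the real class name is not in the snippet).
# WangchanBERTa uses the CamemBERT-style mask token, so the fill-mask
# pipeline can complete a Thai sentence.
filler = ThaiMaskFiller()  # hypothetical class name
print(filler.fill_mask(f"ผมชอบกิน{filler.MASK_TOKEN}ครับ"))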
def __init__(
        self,
        arguments_service: PretrainedArgumentsService):
    super().__init__()

    pretrained_weights = arguments_service.pretrained_weights
    configuration = arguments_service.configuration

    self._arguments_service = arguments_service
    self._tokenizer: CamembertTokenizer = CamembertTokenizer.from_pretrained(
        pretrained_weights)
    self._sign_tokens = [',', '.', ';']
    self._subword_prefix_symbol = '▁'
def __init__(self, model_dir_or_name: str, layers: str = '-1',
             pooled_cls: bool = False):
    super().__init__()
    # Note: the attribute was misspelled "tokenzier" in the original.
    self.tokenizer = CamembertTokenizer.from_pretrained(model_dir_or_name)
    self.encoder = CamembertForTokenClassification.from_pretrained(
        model_dir_or_name)
    # CamembertTokenizer is sentencepiece-based and has no `.encoder` vocab
    # dict, so special-token ids are looked up via convert_tokens_to_ids.
    self._cls_index = self.tokenizer.convert_tokens_to_ids('<s>')
    self._sep_index = self.tokenizer.convert_tokens_to_ids('</s>')
    self._wordpiece_pad_index = self.tokenizer.convert_tokens_to_ids('<pad>')
    self._wordpiece_unknown_index = self.tokenizer.convert_tokens_to_ids('<unk>')
    self.pooled_cls = pooled_cls
def get_train_valid_test_fine(bert_model, max_seq_length):
    # Note: the parameter was misspelled "max_seq_lenght" in the original.
    tokenizer = CamembertTokenizer.from_pretrained(bert_model)
    csv_train = pd.read_csv("../data/fine_train.csv")
    train, valid = train_test_split(csv_train, test_size=0.2)
    test = pd.read_csv("../data/fine_test.csv")
    train_dl = _get_dataloader(train, tokenizer, max_seq_length)
    valid_dl = _get_dataloader(valid, tokenizer, max_seq_length,
                               drop_last=False)
    test_dl = _get_dataloader(test, tokenizer, max_seq_length,
                              drop_last=False)
    return train_dl, valid_dl, test_dl
def evaluate():
    # save_model_location, tokenizer_location and cuda0 are module-level
    # globals in the original script.
    model = load_model(save_model_location)
    tokenizer = CamembertTokenizer.from_pretrained(tokenizer_location)
    testloader = load_data()
    softmax = torch.nn.Softmax(dim=1)
    iter_loader = iter(testloader)

    correct = 0
    # true positives, true negatives, false positives, false negatives
    tp, tn, fp, fn = 0, 0, 0, 0
    total = 0
    print("Starting Evaluation")
    for data in tqdm(iter_loader):
        data["sentence"] = tokenizer(data["sentence"],
                                     padding=True,
                                     max_length=512)
        data["sentence"]["input_ids"] = list(
            map(lambda x: x[:512], data["sentence"]["input_ids"]))
        data["sentence"]["attention_mask"] = list(
            map(lambda x: x[:512], data["sentence"]["attention_mask"]))
        data["sentence"]["input_ids"] = torch.tensor(
            data["sentence"]["input_ids"], dtype=torch.long, device=cuda0)
        data["sentence"]["attention_mask"] = torch.tensor(
            data["sentence"]["attention_mask"], device=cuda0)
        output = model(data["sentence"]["input_ids"],
                       data["sentence"]["attention_mask"])
        # Apply softmax once per batch (the original re-applied it inside the
        # per-sample loop below, squashing the probabilities repeatedly).
        output = softmax(output)

        # Tally every sample in the batch.
        for i in range(len(data["label"])):
            total += 1
            actual = data["label"][i].item()
            pred = torch.argmax(output[i]).item()
            if pred == actual:
                correct += 1
            if actual:  # label 1
                if pred:
                    tp += 1
                else:
                    fn += 1
            else:
                if pred:
                    fp += 1
                else:
                    tn += 1

    print("Percentage of correct predictions: {}".format(
        (correct / total * 100.0)))
    print("F-0.5 value is {}".format(calculate_f_beta(tp, fp, tn, fn)))
def __init__(self, model_name_or_path: str, max_seq_length: int = 128,
             do_lower_case: Optional[bool] = None, model_args: Dict = {},
             tokenizer_args: Dict = {}):
    super(CamemBERT, self).__init__()
    self.config_keys = ['max_seq_length', 'do_lower_case']
    self.do_lower_case = do_lower_case

    if max_seq_length > 511:
        logging.warning(
            "CamemBERT only allows a max_seq_length of 511 (514 with "
            "special tokens). Value will be set to 511")
        max_seq_length = 511
    self.max_seq_length = max_seq_length

    if self.do_lower_case is not None:
        tokenizer_args['do_lower_case'] = do_lower_case

    self.camembert = CamembertModel.from_pretrained(model_name_or_path,
                                                    **model_args)
    self.tokenizer = CamembertTokenizer.from_pretrained(model_name_or_path,
                                                        **tokenizer_args)
def __init__(self, model_name: str, num_labels: int = None,
             max_length: int = None, device: str = None,
             tokenizer_args: Dict = {}):
    """
    A CrossEncoder takes exactly two sentences / texts as input and either
    predicts a score or a label for this sentence pair. It can, for example,
    predict the similarity of the sentence pair on a scale of 0 ... 1.

    It does not yield a sentence embedding and does not work for individual
    sentences.

    :param model_name: Any model name from the Huggingface Models Repository
        that can be loaded with AutoModel. We provide several pre-trained
        CrossEncoder models that can be used for common tasks.
    :param num_labels: Number of labels of the classifier. If 1, the
        CrossEncoder is a regression model that outputs a continuous score
        0 ... 1. If > 1, it outputs several scores that can be soft-maxed to
        get probability scores for the different classes.
    :param max_length: Max length for input sequences. Longer sequences will
        be truncated. If None, the max length of the model will be used.
    :param device: Device that should be used for the model. If None, it will
        use CUDA if available.
    :param tokenizer_args: Arguments passed to AutoTokenizer
    """
    self.config = AutoConfig.from_pretrained(model_name)
    classifier_trained = True
    if self.config.architectures is not None:
        classifier_trained = any([
            arch.endswith('ForSequenceClassification')
            for arch in self.config.architectures
        ])

    if num_labels is None and not classifier_trained:
        num_labels = 1

    if num_labels is not None:
        self.config.num_labels = num_labels

    self.model = AutoModelForSequenceClassification.from_pretrained(
        model_name, config=self.config)
    # self.tokenizer = AutoTokenizer.from_pretrained(model_name, **tokenizer_args)
    self.tokenizer = CamembertTokenizer.from_pretrained(
        model_name, **tokenizer_args)
    self.max_length = max_length

    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info("Use pytorch device: {}".format(device))

    self._target_device = torch.device(device)
def init_nlp(self, model_path="model_nlp.pt"):
    # Download the French spaCy models on first use (the original used a
    # bare `except:`; note it loads fr_core_news_sm but downloads
    # fr/fr_core_news_md).
    try:
        nlp = spacy.load("fr_core_news_sm")
    except OSError:
        os.system("python -m spacy download fr")
        os.system("python -m spacy download fr_core_news_md")

    # Load the trained camembert model.
    state_dict = torch.load(model_path, map_location=torch.device('cpu'))
    model = CamembertForSequenceClassification.from_pretrained(
        'camembert-base', num_labels=2, state_dict=state_dict)

    # Load the camembert TOKENIZER.
    TOKENIZER = CamembertTokenizer.from_pretrained('camembert-base',
                                                   do_lower_case=True)
    return model, TOKENIZER
def create(cls,
           data_file,
           image_dir,
           transform,
           labels_path,
           pad_idx=0,
           tokenizer=None,
           model_type=None,
           min_char_len=1,
           max_seq_length=510,
           model_name="camembert-base",
           clear_cache=False,
           is_cls=True):
    if tokenizer is None:
        if 'camem' in model_type:
            tokenizer = CamembertTokenizer.from_pretrained(model_name)
        elif 'flaubert' in model_type:
            tokenizer = FlaubertTokenizer.from_pretrained(model_name)
        elif 'XLMRoberta' in model_type:
            tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
        elif 'M-Bert' in model_type:
            tokenizer = BertTokenizer.from_pretrained(model_name)

    with open(data_file, 'rb') as f:
        data = pickle.load(f)

    idx2labels, labels2idx = cls.create_labels(labels_path)
    config = {
        "min_char_len": min_char_len,
        "model_name": model_name,
        "max_sequence_length": max_seq_length,
        "clear_cache": clear_cache,
        "pad_idx": pad_idx,
        "is_cls": is_cls,
        "idx2labels": idx2labels,
        "labels2idx": labels2idx
    }
    self = cls(data, image_dir, transform, tokenizer, config)
    return self
def __init__(
    self,
    pretrained_embedding=None,
    architecture_function=None,
    text_input_column="clean_text",
    meta_input_list=["extension", "dayofweek", "hour", "min"],
    vocab_size=25000,
    seq_size=100,
    embedding_dim=200,
    loss="categorical_crossentropy",
    activation="softmax",
    batch_size=4096,
    n_epochs=15,
    bert_tokenizer="jplu/tf-camembert-base",
    bert_model="jplu/tf-camembert-base",
    **kwargs,
):
    self.architecture_function = architecture_function
    self.pretrained_embedding = pretrained_embedding
    if self.architecture_function.__name__ != "bert_model":
        self.tokenizer = Tokenizer(input_column=text_input_column)
    elif "camembert" in bert_tokenizer.lower():
        self.tokenizer = CamembertTokenizer.from_pretrained(bert_tokenizer)
    elif "flaubert" in bert_tokenizer.lower():
        self.tokenizer = XLMTokenizer.from_pretrained(bert_tokenizer)
    else:
        raise NotImplementedError(
            "Bert tokenizer {} not implemented".format(bert_tokenizer))
    self.text_input_column = text_input_column
    self.meta_input_list = meta_input_list
    self.vocab_size = vocab_size
    self.seq_size = seq_size
    self.embedding_dim = embedding_dim
    self.loss = loss
    self.activation = activation
    self.batch_size = batch_size
    self.n_epochs = n_epochs
    self.bert_model = bert_model
    self.nb_labels = 0
    self.nb_meta_features = 0
    self.vocabulary = []
    self.vocabulary_dict = {}
def sentence_embeddings(self):
    if not self.finetuned_bert:
        tokenizer = CamembertTokenizer.from_pretrained(stg.REGULAR_CAMEMBERT)
        model = CamembertModel.from_pretrained(stg.REGULAR_CAMEMBERT)
    else:
        tokenizer = AutoTokenizer.from_pretrained(stg.FINED_TUNED_CAMEMBERT)
        model = CamembertModel.from_pretrained(stg.FINED_TUNED_CAMEMBERT)

    if torch.cuda.is_available():
        print('====== Cuda is Available, GPU will be used for this task ======')
        torch.cuda.empty_cache()
        model.cuda()
        device = torch.device("cuda")

    embedding_all_text = []
    number_sentences = len(self.sentences)
    for i in tqdm(range(0, number_sentences, self.batch_size)):
        # The original also had an unreachable `elif i == number_sentences`
        # branch; range() never yields its stop value, so it is dropped here.
        if (i + self.batch_size) < number_sentences:
            batch = self.sentences[i:i + self.batch_size]
        else:
            batch = self.sentences[i:]
        encoded_input = self.get_batch_sentence_tokens(batch, tokenizer)

        if torch.cuda.is_available():
            encoded_input.to(device)
        with torch.no_grad():
            model_output = model(**encoded_input)
        sentence_embeddings_tensor = self.mean_pooling(
            model_output, encoded_input['attention_mask'])
        embedding_all_text.append(sentence_embeddings_tensor)

        if torch.cuda.is_available():
            del encoded_input
            del sentence_embeddings_tensor
            torch.cuda.empty_cache()

    sentence_embeddings = self.torch_to_array(embedding_all_text)
    return sentence_embeddings
def load_model(self, model_path):
    """Load all required models.

    model_path: path to the model directory
    """
    self.sentence_tokenizer = joblib.load(
        os.path.join(model_path, "sentence_tokenizer.joblib"))
    self.word_tokenizer = CamembertTokenizer.from_pretrained(model_path)
    self.sentence_embedding_model = TFCamembertForSentenceEmbedding.from_pretrained(
        model_path)
    model_path = os.path.join(model_path, "sentence_model")
    if os.path.exists(model_path + ".h5"):
        # HDF5 format
        self.sentence_model = tf.keras.models.load_model(model_path + ".h5")
    else:
        # SavedModel (TF) format
        self.sentence_model = tf.keras.models.load_model(model_path)
def make_tokenizer(self, text, voc_size, prefix, mtype='bpe',
                   name="tokenizer_input"):
    '''
    @text: path to the txt file, one document per line
    @voc_size: desired vocabulary size
    @prefix: name to give to the trained model
    @mtype: model type, e.g. 'bpe'
    '''
    os.chdir(self.path)
    path = self.make_input_tokenizer(text, self.path, name)
    if '/' in path:
        chemin = '/'.join(path.split('/')[:-1]) + '/'
    elif '\\' in path:
        chemin = '\\'.join(path.split('\\')[:-1]) + "\\"
    else:
        raise ValueError(
            'Could not determine the directory in which to save the '
            'tokenizer; check the supplied path variable.')

    import sentencepiece as spm

    # The input must be a .txt file, one document per line.
    FUES = spm.SentencePieceTrainer.train(
        input=path,
        # Vocabulary size: can be increased, but must not be too large
        # relative to the words in the documents.
        vocab_size=voc_size,
        # Model name: French Unsupervised Extractive Summarizer.
        model_prefix=prefix,
        model_type=mtype)

    from transformers import CamembertTokenizer
    tokenizer = CamembertTokenizer(chemin + prefix + '.model')
    return tokenizer
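# Hedged usage sketch for make_tokenizer above; `summarizer` is an
# illustrative instance name and `corpus` stands for whatever text input
# make_input_tokenizer expects (that helper is not shown here). sentencepiece
# requires a corpus large enough to support the requested vocabulary size.
tok = summarizer.make_tokenizer(corpus, voc_size=8000, prefix="FUES",
                                mtype="bpe")
print(tok.tokenize("Le chat dort sur le canapé."))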
def __init__(self, model_name_or_path: str, max_seq_length: int = 128,
             do_lower_case: bool = True):
    super(CamemBERT, self).__init__()
    self.config_keys = ['max_seq_length', 'do_lower_case']
    self.do_lower_case = do_lower_case

    if max_seq_length > 511:
        logging.warning(
            "CamemBERT only allows a max_seq_length of 511 (514 with "
            "special tokens). Value will be set to 511")
        max_seq_length = 511
    self.max_seq_length = max_seq_length

    self.camembert = CamembertModel.from_pretrained(model_name_or_path)
    self.tokenizer = CamembertTokenizer.from_pretrained(
        model_name_or_path, do_lower_case=do_lower_case)
    self.cls_token_id = self.tokenizer.convert_tokens_to_ids(
        [self.tokenizer.cls_token])[0]
    self.sep_token_id = self.tokenizer.convert_tokens_to_ids(
        [self.tokenizer.sep_token])[0]