def predict_camembert(df: pd.DataFrame) -> pd.DataFrame:
    """
    Predict the sentiment of reviews.

    :param df: dataframe with reviews
    :return: dataframe with the prediction for each review
    """
    df['space'] = ' '
    df['comments'] = df[['titre', 'space', 'comment']].fillna('').sum(axis=1)
    df = df.dropna(subset=['comments'], axis="rows")
    comments = df['comments'].to_list()

    # Load the fine-tuned CamemBERT weights and build the classifier
    state_dict = torch.load("camemBERT_38000_state_dict.pt",
                            map_location=torch.device('cpu'))
    model = CamembertForSequenceClassification.from_pretrained(
        'camembert-base', num_labels=2, state_dict=state_dict)

    # Initialize CamemBERT tokenizer
    tokenizer = CamembertTokenizer.from_pretrained('camembert-base',
                                                   do_lower_case=True)

    # Encode the comments
    tokenized_comments_ids = [
        tokenizer.encode(comment, add_special_tokens=True, max_length=MAX_LEN)
        for comment in comments
    ]
    # Pad the encoded comments
    tokenized_comments_ids = pad_sequences(tokenized_comments_ids,
                                           maxlen=MAX_LEN,
                                           dtype="long",
                                           truncating="post",
                                           padding="post")

    # Create attention masks (1.0 for real tokens, 0.0 for padding)
    attention_masks = []
    for seq in tokenized_comments_ids:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)

    prediction_inputs = torch.tensor(tokenized_comments_ids)
    prediction_masks = torch.tensor(attention_masks)

    predictions = []
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        # (`device` is expected to be defined at module level)
        outputs = model(prediction_inputs.to(device),
                        token_type_ids=None,
                        attention_mask=prediction_masks.to(device))
        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        predictions.extend(np.argmax(logits, axis=1).flatten())

    df = pd.DataFrame(
        data={
            "site": df["site"],
            "date": df["date"],
            "review": df["review"],
            "sentiment": predictions
        })
    return df
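# A minimal usage sketch for predict_camembert above. It assumes the module-level
# names the function relies on (MAX_LEN, device) are defined and that the
# fine-tuned checkpoint "camemBERT_38000_state_dict.pt" is present on disk;
# the column values below are illustrative only.
import pandas as pd

reviews = pd.DataFrame({
    "titre": ["Très bon produit", "Décevant"],
    "comment": ["Je recommande.", "Ne fonctionne pas."],
    "site": ["site_a", "site_b"],
    "date": ["2021-01-01", "2021-01-02"],
    "review": ["Très bon produit. Je recommande.", "Décevant. Ne fonctionne pas."],
})
scored = predict_camembert(reviews)
print(scored[["review", "sentiment"]])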
def __init__(self, DIR, filename):
    self.path = os.path.join(DIR, filename)
    self.device = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")
    self.tokenizer = CamembertTokenizer.from_pretrained(
        config["BERT_MODEL"])
    # Build the classifier and load the fine-tuned weights from disk
    classifier = CamembertForSequenceClassification.from_pretrained(
        config['BERT_MODEL'], num_labels=len(config["CLASS_NAMES"]))
    classifier.load_state_dict(
        torch.load(self.path, map_location=self.device))
    # Switch to inference mode and move to the selected device
    classifier = classifier.eval()
    self.classifier = classifier.to(self.device)
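# The constructor above reads a module-level `config` mapping; a minimal sketch
# of what it assumes. The keys come from the snippet, the values here are
# illustrative assumptions only.
config = {
    "BERT_MODEL": "camembert-base",            # assumed checkpoint name
    "CLASS_NAMES": ["negative", "positive"],   # assumed label set
}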
def init_nlp(self, model_path="model_nlp.pt"):
    try:
        nlp = spacy.load("fr_core_news_sm")
    except OSError:
        # Download the French spaCy model if it is not installed yet, then retry
        os.system("python -m spacy download fr_core_news_sm")
        nlp = spacy.load("fr_core_news_sm")

    # Load the fine-tuned CamemBERT model
    state_dict = torch.load(model_path, map_location=torch.device('cpu'))
    model = CamembertForSequenceClassification.from_pretrained(
        'camembert-base', num_labels=2, state_dict=state_dict)

    # Load the CamemBERT tokenizer
    TOKENIZER = CamembertTokenizer.from_pretrained('camembert-base',
                                                   do_lower_case=True)
    return model, TOKENIZER
batch_size = 32

train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size)

validation_dataloader = DataLoader(
    validation_dataset,
    sampler=SequentialSampler(validation_dataset),
    batch_size=batch_size)

#%%
model = CamembertForSequenceClassification.from_pretrained(
    'camembert-base',
    num_labels=2)

#%%
optimizer = AdamW(model.parameters(),
                  lr=2e-5,   # smaller learning rate to avoid catastrophic forgetting
                  eps=1e-8)  # epsilon

epochs = 3

# Tensors stay on the CPU: no GPU available here
device = torch.device("cpu")

# Training statistics recorded at each epoch
training_stats = []

#%%
for epoch in range(0, epochs):
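    # The epoch body is not included in the snippet above; what follows is a
    # minimal sketch of a standard fine-tuning epoch under the same setup
    # (CPU device, AdamW optimizer, batches of (input_ids, attention_mask,
    # labels)). The batch layout and the gradient-clipping value are
    # assumptions, not taken from the original code.
    model.train()
    total_train_loss = 0
    for batch in train_dataloader:
        input_ids, attention_mask, labels = (t.to(device) for t in batch)
        model.zero_grad()
        outputs = model(input_ids,
                        attention_mask=attention_mask,
                        labels=labels)
        loss = outputs[0]
        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
    training_stats.append({"epoch": epoch + 1,
                           "train_loss": total_train_loss / len(train_dataloader)})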
        input_ids, labels, attention_masks, token_type_ids, lineNumbers, file_names = \
            input_ids_tmp, labels_tmp, attention_masks_tmp, token_type_ids_tmp, lineNumbers_tmp, file_names_tmp
    else:
        input_ids = np.append(input_ids, input_ids_tmp, axis=0)
        labels = np.append(labels, labels_tmp, axis=0)
        attention_masks = np.append(attention_masks, attention_masks_tmp, axis=0)
        token_type_ids = np.append(token_type_ids, token_type_ids_tmp, axis=0)
        lineNumbers = np.append(lineNumbers, lineNumbers_tmp, axis=0)
        file_names = np.append(file_names, file_names_tmp, axis=0)

print('Loading BERT model...')
if curr_lang == 'French':
    model = CamembertForSequenceClassification.from_pretrained(output_dir)
else:
    model = BertForSequenceClassification.from_pretrained(output_dir)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')
# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks,
                                validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data,
                                   sampler=validation_sampler,
                                   batch_size=batch_size)

##### Load the model and begin the training
print('loading the model...')
model = CamembertForSequenceClassification.from_pretrained(
    'camembert-base',                 # The pretrained CamemBERT base model.
    num_labels=len(index2canonical),  # The number of output labels for multi-class classification.
    output_attentions=False,          # Whether the model returns attention weights.
    output_hidden_states=False,       # Whether the model returns all hidden states.
)

# Train the model on the GPU
model.cuda()

# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
    eps=1e-8  # args.adam_epsilon - default is 1e-8.
)

# Number of training epochs
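# The snippet above stops at "# Number of training epochs"; in the usual
# HuggingFace fine-tuning recipe this code follows, the optimizer is paired
# with a linear warmup schedule over the total number of training steps.
# A minimal sketch, assuming an `epochs` value of 4 and an existing
# `train_dataloader` (neither appears in the snippet above):
from transformers import get_linear_schedule_with_warmup

epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)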
def load_class(self):
    # Load the model class that matches self.model_name.
    if self.verbose:
        print('Loading {} class...'.format(self.model_name))

    if self.model_name == 'bert':
        # Load BertForSequenceClassification, the pretrained BERT model with a
        # single linear classification layer on top.
        self.model = BertForSequenceClassification.from_pretrained(
            self.model_type,  # Use the 12-layer BERT model, with an uncased vocab.
            num_labels=self.num_labels,  # You can increase this for multi-class tasks.
            output_attentions=False,     # Whether the model returns attention weights.
            output_hidden_states=False,  # Whether the model returns all hidden states.
        )
    elif self.model_name == 'distilbert':
        self.model = DistilBertForSequenceClassification.from_pretrained(
            self.model_type,
            num_labels=self.num_labels,
            output_attentions=False,
            output_hidden_states=False,
        )
    elif self.model_name == 'albert':
        self.model = AlbertForSequenceClassification.from_pretrained(
            self.model_type,
            num_labels=self.num_labels,
            output_attentions=False,
            output_hidden_states=False,
        )
    elif self.model_name == 'bart':
        if self.task == 'classification':
            self.model = BartForSequenceClassification.from_pretrained(
                self.model_type,
                num_labels=self.num_labels,
                output_attentions=False,
                output_hidden_states=False,
            )
        if self.task == 'summarize':
            self.model = BartForConditionalGeneration.from_pretrained(
                self.model_type)
    elif self.model_name == 'xlnet':
        self.model = XLNetForSequenceClassification.from_pretrained(
            self.model_type,
            num_labels=self.num_labels,
            output_attentions=False,
            output_hidden_states=False,
        )
    elif self.model_name == 'roberta':
        self.model = RobertaForSequenceClassification.from_pretrained(
            self.model_type,
            num_labels=self.num_labels,
            output_attentions=False,
            output_hidden_states=False,
        )
    elif self.model_name == 'camenbert':
        self.model = CamembertForSequenceClassification.from_pretrained(
            self.model_type,
            num_labels=self.num_labels,
            output_attentions=False,
            output_hidden_states=False,
        )
    elif self.model_name == 'flaubert':
        self.model = FlaubertForSequenceClassification.from_pretrained(
            self.model_type,
            num_labels=self.num_labels,
            output_attentions=False,
            output_hidden_states=False,
        )
    elif self.model_name == 'gpt2':
        self.model = GPT2LMHeadModel.from_pretrained(self.model_type)
def __init__(self, bert_model, num_classes):
    super(Umberto, self).__init__()
    self.encoder = CamembertForSequenceClassification.from_pretrained(
        bert_model, num_labels=num_classes)
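# The snippet above only shows the constructor of the Umberto wrapper; a
# minimal sketch of a matching forward pass, assuming the surrounding class
# subclasses torch.nn.Module. The method and argument names are illustrative,
# not taken from the original code.
def forward(self, input_ids, attention_mask=None, labels=None):
    # Delegate to the underlying CamembertForSequenceClassification head;
    # it returns the loss and logits when labels are given, otherwise the logits.
    return self.encoder(input_ids,
                        attention_mask=attention_mask,
                        labels=labels)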