def create_data_loader(sentences, tokenizer_vocab, labels=None, train_mode=True):
    """
    Create a DataLoader of BERT-encoded sentences.

    :param iterable sentences: text instances
    :param string tokenizer_vocab: tokenizer vocabulary file
    :param iterable labels: sentiment or emotion class ids; when omitted the
        dataset contains only input ids and attention masks
    :param bool train_mode: random sampling when True, sequential when False
    :return: torch DataLoader over the encoded sentences
    """
    logger.info("Loading Tokenize and Encoding data..")
    tokenizer = BertTokenizer(tokenizer_vocab, do_lower_case=True)
    encoded_sents = tokenizer.batch_encode_plus(sentences,
                                                add_special_tokens=True,
                                                return_attention_mask=True,
                                                padding=True,
                                                max_length=256,
                                                truncation=True,
                                                return_tensors="pt")
    sent_ids = encoded_sents["input_ids"]
    attention_masks = encoded_sents["attention_mask"]
    # BUG FIX: the original used a mutable default argument (labels=[]);
    # None is now the sentinel and is treated the same as "no labels".
    if labels is not None and len(labels) > 0:
        data = TensorDataset(sent_ids, attention_masks, torch.tensor(labels))
    else:
        data = TensorDataset(sent_ids, attention_masks)
    logger.info("Creating Data Loaders...")
    batch_size = int(CFG["MODELS"]["batch_size"])
    # Shuffle for training; keep order deterministic for evaluation.
    sampler = RandomSampler(data) if train_mode else SequentialSampler(data)
    return DataLoader(data, sampler=sampler, batch_size=batch_size)
def evaluate_predict(data_loader, model):
    """
    Run the model in inference mode over a dataloader and collect raw logits.

    :param data_loader: batches of (input_ids, attention_mask, ...) tensors
    :param model: sequence-classification model already loaded with weights
    :return: numpy array of logits, one row per input instance
    """
    model.eval()
    all_logits = []
    progress_bar = tqdm(data_loader, desc="Prediction",
                        leave=False, disable=False)
    for batch in progress_bar:
        # Every tensor in the batch is moved to the target device.
        batch = tuple(tensor.to(DEVICE) for tensor in batch)
        model_inputs = {"input_ids": batch[0],
                        "attention_mask": batch[1],
                        }
        with torch.no_grad():
            outputs = model(**model_inputs)
            all_logits.append(outputs[0].detach().cpu().numpy())
    logger.info(str(progress_bar))
    return np.concatenate(all_logits, axis=0)
def _main():
    """Entry point: load config, preprocess labeled data and train BERT."""
    try:
        cfg = ConfigParser()
        cfg.read(CONF_INI)
        # Timestamped log file under the configured log directory.
        log_name = datetime.now().strftime('train_bert_%Y_%m_%d_%H_%M_%S.log')
        log_file = os.path.join(cfg["LOGS"]["Path"], log_name)
        logger.set_file_logs(level=logging.INFO, filename=log_file)
        bert_model = cfg["MODELS"]["bert_model"]
        bert_tokenizer = cfg["MODELS"]["bert_tokenizer"]
        logger.info("Process labeled data...")
        data = sab.process(cfg["INPUTS"]["data_train"])
        logger.info("Balance data...")
        data = process.balance_data(data, sab.POLR_ATTR)
        logger.info("Train BERT..")
        my_bert.fit(data, sab.CONT_ATTR, sab.LABL_ATTR,
                    bert_model, bert_tokenizer, sab.DICT_LABEL)
        logger.info("Analyze sentiment done")
    except Exception:  # pylint: disable=broad-except
        logger.exception("Process failed")
def performance_acc_class(preds, labels, labels_dict):
    """
    Log the per-class accuracy of the model.

    :param iterable preds: predicted scores, shape (n_samples, n_classes)
    :param iterable labels: target class ids (numpy array)
    :param dict labels_dict: mapping from class name to class id
    """
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    # BUG FIX: the body referenced an undefined name `label_dict` while the
    # parameter is `labels_dict`, raising NameError on every call.
    label_dict_inverse = {v: k for k, v in labels_dict.items()}
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat == label]
        y_true = labels_flat[labels_flat == label]
        logger.info(f'Class: {label_dict_inverse[label]}')
        logger.info(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}')
def _main():
    """Entry point: load config, clean input text and run BERT prediction."""
    try:
        cfg = ConfigParser()
        cfg.read(CONF_INI)
        # Timestamped log file under the configured log directory.
        log_file = os.path.join(
            cfg["LOGS"]["Path"],
            datetime.now().strftime('pred_%Y_%m_%d_%H_%M_%S.log'))
        logger.set_file_logs(level=logging.INFO, filename=log_file)
        bert_model = cfg["MODELS"]["bert_model"]
        bert_tokenizer = cfg["MODELS"]["bert_tokenizer"]
        senti_bert = cfg["OUT_MODELS"]["model_selected"]
        data_path = cfg["INPUTS"]["data_predict"]
        logger.info("Process labeled data...")
        data = pd.read_csv(data_path, sep=";", encoding="utf-8")
        sentences = data[TEXT_FIELD].values
        sentences = [tweeter.clean_text(sent) for sent in sentences]
        logger.info("BERT prediction..")
        labels_pred, probs = my_bert.predict(bert_model, senti_bert,
                                             bert_tokenizer, sentences,
                                             sab.DICT_LABEL)
        # BUG FIX: DataFrame([labels_pred, probs]).T puts labels in the first
        # column and probabilities in the second, but the columns were named
        # [PROB_FIELD, LABEL_FIELD] — mislabeling both outputs in the CSV.
        out_data = pd.DataFrame([labels_pred, probs]).T
        out_data.columns = [LABEL_FIELD, PROB_FIELD]
        out_data = pd.concat([data, out_data], axis=1)
        out_data.to_csv(cfg["OUTPUTS"]["data_predicted"],
                        sep=";", index=False, encoding="utf-8")
        logger.info("BERT prediction done")
    except Exception:  # pylint: disable=broad-except
        logger.exception("Process failed")
def fit(data, x_label, y_label, bert_model, tokenizer_vocab, label_dict):
    """
    Train a BERT classifier on labeled text data.

    :param pandas data: frame with text and label columns
    :param iterable x_label: input column(s)
    :param iterable y_label: target column(s)
    :param BERT bert_model: pretrained model identifier/binary
    :param string tokenizer_vocab: tokenizer vocabulary file
    :param dict label_dict: mapping from class name to class id
    """
    logger.info("Split Training and Test..")
    features = data[x_label]
    targets = data[y_label]
    # Stratified split keeps the class balance in both partitions.
    x_train, x_test, y_train, y_test = train_test_split(
        features, targets,
        test_size=TEST_SIZE, random_state=107, stratify=targets)
    logger.info("Distribution Train Labels: ")
    logger.info(str(np.unique(y_train, return_counts=True)))
    logger.info("Distribution Test Labels: ")
    logger.info(str(np.unique(y_test, return_counts=True)))
    data_train = pd.concat([x_train, y_train], axis=1)
    data_test = pd.concat([x_test, y_test], axis=1)
    logger.info("Create dataloader for training...")
    data_load_train = create_data_loader(data_train[x_label].values,
                                         tokenizer_vocab,
                                         data_train[y_label].values)
    logger.info("Create dataloader for test...")
    data_load_test = create_data_loader(data_test[x_label].values,
                                        tokenizer_vocab,
                                        data_test[y_label].values,
                                        train_mode=False)
    logger.info("Setting up BERT Pretrained Model...")
    num_labels = len(np.unique(data_train[y_label].values))
    model = BertForSequenceClassification.from_pretrained(
        bert_model,
        num_labels=num_labels,
        output_attentions=False,
        output_hidden_states=False)
    logger.info("Training...")
    fine_tune(int(CFG["MODELS"]["epochs"]), model,
              data_load_train, data_load_test, label_dict)
def fine_tune(epochs, model, data_loader_train, data_loader_val, label_dict):
    """
    Fine-tune a BERT model using the run_glue approach from HuggingFace.

    :param int epochs: number of training epochs
    :param model: BertForSequenceClassification instance
    :param data_loader_train: training batches of (ids, mask, labels)
    :param data_loader_val: validation batches
    :param dict label_dict: mapping from class name to class id
    """
    # Fix all RNG seeds for reproducible fine-tuning.
    seed_val = 17
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)  # For GPU
    model.to(DEVICE)
    logger.info("Device: " + str(DEVICE))
    label_values = list(label_dict.values())
    target_names = list(map(str, label_values))
    # BUG FIX: the optimizer and scheduler were recreated inside the batch
    # loop, discarding optimizer state (e.g. Adam moments) and restarting the
    # LR schedule on every step. They must be built once, before training.
    optimizer = set_optimizer(model, epochs, data_loader_train)
    scheduler = set_scheduler(optimizer, data_loader_train)
    for epoch in tqdm(range(1, epochs + 1)):
        model.train()
        loss_train_total = 0
        progress_bar = tqdm(data_loader_train,
                            desc="Epoch {:1d}".format(epoch),
                            leave=False, disable=False)
        for batch in progress_bar:
            model.zero_grad()
            # Each component of the batch is in the correct device
            batch = tuple(b.to(DEVICE) for b in batch)
            inputs = {"input_ids": batch[0],
                      "attention_mask": batch[1],
                      "labels": batch[2]}
            outputs = model(**inputs)
            loss = outputs[0]
            loss_train_total += loss.item()
            loss.backward()
            # Clip gradients to stabilize training.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            logger.info(str(progress_bar) +
                        str({"training_loss":
                             "{:.3f}".format(loss.item() / len(batch))}))
        model_name = CFG["OUT_MODELS"]["dir"] + "/" + f"BERT_ft_epoch{epoch}.model"
        torch.save(model.state_dict(), model_name)
        # BUG FIX: missing f-prefix printed the literal text "{epoch}".
        tqdm.write(f"\nEpoch {epoch}")
        loss_train_avg = loss_train_total / len(data_loader_train)
        tqdm.write(f'Training loss: {loss_train_avg}')
        val_loss, predictions, true_values = evaluate(data_loader_val, model)
        predictions = np.argmax(predictions, axis=1).flatten()
        true_values = true_values.flatten()
        val_f1 = f1_score(true_values, predictions, average="weighted")
        accuracy_s = accuracy_score(true_values, predictions)
        precision_s = precision_score(true_values, predictions, average=None)
        recall_s = recall_score(true_values, predictions, average=None)
        conf_matrix = confusion_matrix(true_values, predictions,
                                       labels=label_values)
        # BUG FIX: classification_report takes (y_true, y_pred) in that
        # order; the arguments were swapped, transposing the report.
        report = classification_report(true_values, predictions,
                                       labels=label_values,
                                       target_names=target_names)
        tqdm.write(f"Validation loss: {val_loss}")
        tqdm.write(f"F1 Score (weighted): {val_f1}")
        logger.info("Classification Report:")
        logger.info(report)
        logger.info(f"F1 Score (weighted): {val_f1}")
        logger.info(f"Precision: {precision_s}")
        logger.info(f"Recall: {recall_s}")
        logger.info(f"Accuracy: {accuracy_s}")
        logger.info("Dict label: " + str(label_dict))
        logger.info("Confusion Matrix: %s", str(conf_matrix))