Exemplo n.º 1
0
def create_data_loader(sentences, tokenizer_vocab, labels=None, train_mode=True):
    """
    Create a PyTorch DataLoader over BERT-encoded sentences.

    :param iterable sentences: text instances
    :param string tokenizer_vocab: tokenizer vocabulary file
    :param iterable labels: optional sentiment/emotion class ids; when given,
        they are bundled into the dataset alongside the encoded inputs
    :param bool train_mode: shuffle batches when True, keep order otherwise
    :return: DataLoader yielding (input_ids, attention_mask[, labels]) batches
    """
    logger.info("Loading Tokenize and Encoding data..")
    tokenizer = BertTokenizer(tokenizer_vocab,
                              do_lower_case=True)
    encoded_sents = tokenizer.batch_encode_plus(sentences,
                                                add_special_tokens=True,
                                                return_attention_mask=True,
                                                padding=True,
                                                max_length=256,
                                                truncation=True,
                                                return_tensors="pt")
    sent_ids = encoded_sents["input_ids"]
    attention_masks = encoded_sents["attention_mask"]
    # BUG FIX: the default was a mutable list literal (`labels=[]`), which is
    # shared across calls. `None` is the safe sentinel; callers passing an
    # empty or non-empty iterable see identical behavior.
    if labels is not None and len(labels) > 0:
        data = TensorDataset(sent_ids, attention_masks, torch.tensor(labels))
    else:
        data = TensorDataset(sent_ids, attention_masks)
    logger.info("Creating Data Loaders...")
    batch_size = int(CFG["MODELS"]["batch_size"])
    # Random sampling for training; sequential for evaluation/prediction.
    if train_mode:
        sampler = RandomSampler(data)
    else:
        sampler = SequentialSampler(data)
    return DataLoader(data, sampler=sampler, batch_size=batch_size)
Exemplo n.º 2
0
def evaluate_predict(data_loader, model):
    """
    Run the model in inference mode over a data loader and collect logits.

    :param data_loader: batches whose first two tensors are input ids and
        attention masks (extra tensors, if any, are ignored)
    :param model: sequence-classification model already loaded in memory
    :return: numpy array of stacked logits, one row per input instance
    """
    model.eval()
    progress_bar = tqdm(data_loader,
                    desc="Prediction",
                    leave=False,
                    disable=False)
    all_logits = []
    for batch in progress_bar:
        # Move every tensor of the batch to the target device first.
        on_device = tuple(item.to(DEVICE) for item in batch)
        with torch.no_grad():
            outputs = model(input_ids=on_device[0],
                            attention_mask=on_device[1])
        all_logits.append(outputs[0].detach().cpu().numpy())
        logger.info(str(progress_bar))

    return np.concatenate(all_logits, axis=0)
def _main():
    """Entry point: configure logging, load and balance data, train BERT."""
    try:
        cfg = ConfigParser()
        cfg.read(CONF_INI)

        # Route logs to a timestamped file under the configured directory.
        stamp = datetime.now().strftime('train_bert_%Y_%m_%d_%H_%M_%S.log')
        logger.set_file_logs(level=logging.INFO,
                             filename=os.path.join(cfg["LOGS"]["Path"], stamp))

        bert_model = cfg["MODELS"]["bert_model"]
        bert_tokenizer = cfg["MODELS"]["bert_tokenizer"]
        data_labeled_path = cfg["INPUTS"]["data_train"]

        logger.info("Process labeled data...")
        data = sab.process(data_labeled_path)
        logger.info("Balance data...")
        data = process.balance_data(data, sab.POLR_ATTR)
        logger.info("Train BERT..")
        my_bert.fit(data, sab.CONT_ATTR, sab.LABL_ATTR, bert_model,
                    bert_tokenizer, sab.DICT_LABEL)
        logger.info("Analyze sentiment done")

    except Exception:  # pylint: disable=broad-except
        logger.exception("Process failed")
Exemplo n.º 4
0
def performance_acc_class(preds, labels, labels_dict):
    """
    Log the model accuracy broken down per class.

    :param iterable preds: predicted scores, shape (n_samples, n_classes)
    :param iterable labels: target class ids (numpy array)
    :param dict labels_dict: mapping from class name to class id
    """
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    # BUG FIX: the body referenced an undefined name `label_dict`; the
    # parameter is `labels_dict`, so the function raised NameError on every
    # call. Invert it to map class id -> class name for readable logs.
    label_dict_inverse = {v: k for k, v in labels_dict.items()}
    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat == label]
        y_true = labels_flat[labels_flat == label]
        logger.info(f'Class: {label_dict_inverse[label]}')
        logger.info(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}')
Exemplo n.º 5
0
def _main():
    """Entry point: load text, run BERT sentiment prediction, save a CSV."""
    try:
        cfg = ConfigParser()
        cfg.read(CONF_INI)

        # Log conf
        log_file = os.path.join(
            cfg["LOGS"]["Path"],
            datetime.now().strftime('pred_%Y_%m_%d_%H_%M_%S.log'))

        logger.set_file_logs(level=logging.INFO, filename=log_file)
        bert_model = cfg["MODELS"]["bert_model"]
        bert_tokenizer = cfg["MODELS"]["bert_tokenizer"]
        senti_bert = cfg["OUT_MODELS"]["model_selected"]
        data_path = cfg["INPUTS"]["data_predict"]
        logger.info("Process labeled data...")
        data = pd.read_csv(data_path, sep=";", encoding="utf-8")
        sentences = data[TEXT_FIELD].values
        sentences = [tweeter.clean_text(sent) for sent in sentences]
        logger.info("BERT prediction..")
        labels_pred, probs = my_bert.predict(bert_model, senti_bert,
                                             bert_tokenizer, sentences,
                                             sab.DICT_LABEL)
        out_data = pd.DataFrame([labels_pred, probs]).T
        # BUG FIX: columns were assigned as [PROB_FIELD, LABEL_FIELD], but the
        # transposed frame has the predicted labels in column 0 and the
        # probabilities in column 1 — the output CSV headers were swapped.
        out_data.columns = [LABEL_FIELD, PROB_FIELD]
        out_data = pd.concat([data, out_data], axis=1)
        out_data.to_csv(cfg["OUTPUTS"]["data_predicted"],
                        sep=";",
                        index=False,
                        encoding="utf-8")
        logger.info("BERT prediction done")

    except Exception:  # pylint: disable=broad-except
        logger.exception("Process failed")
Exemplo n.º 6
0
def fit(data, x_label, y_label, bert_model, tokenizer_vocab, label_dict):
    """
    Train a BERT sequence classifier on labeled text data.

    :param pandas data: frame holding the text and label columns
    :param x_label: name of the input (text) column
    :param y_label: name of the target (class) column
    :param BERT bert_model: binary model
    :param string tokenizer_vocab: tokenizer file
    :param dict label_dict: class dictionary (name -> id)
    """
    logger.info("Split Training and Test..")
    targets = data[y_label]
    x_train, x_test, y_train, y_test = train_test_split(
        data[x_label], targets,
        test_size=TEST_SIZE, random_state=107, stratify=targets)
    logger.info("Distribution Train Labels: ")
    logger.info(str(np.unique(y_train, return_counts=True)))
    logger.info("Distribution Test Labels: ")
    logger.info(str(np.unique(y_test, return_counts=True)))
    data_train = pd.concat([x_train, y_train], axis=1)
    data_test = pd.concat([x_test, y_test], axis=1)
    logger.info("Create dataloader for training...")
    data_load_train = create_data_loader(data_train[x_label].values,
                                         tokenizer_vocab,
                                         data_train[y_label].values)
    logger.info("Create dataloader for test...")
    data_load_test = create_data_loader(data_test[x_label].values,
                                        tokenizer_vocab,
                                        data_test[y_label].values,
                                        train_mode=False)
    logger.info("Setting up BERT Pretrained Model...")
    # One output unit per distinct class present in the training split.
    n_classes = len(np.unique(data_train[y_label].values))
    model = BertForSequenceClassification.from_pretrained(
        bert_model,
        num_labels=n_classes,
        output_attentions=False,
        output_hidden_states=False)
    logger.info("Training...")
    fine_tune(int(CFG["MODELS"]["epochs"]), model,
              data_load_train, data_load_test, label_dict)
Exemplo n.º 7
0
def fine_tune(epochs, model, data_loader_train, data_loader_val, label_dict):
    """
    Fine-tune of BERT model using run_glue
    approach from HuggingFace.

    :param int epochs: number of training epochs
    :param model: BertForSequenceClassification instance to train
    :param data_loader_train: DataLoader of (input_ids, mask, labels) batches
    :param data_loader_val: validation DataLoader, evaluated after each epoch
    :param dict label_dict: class dictionary (name -> id)
    """
    # Fixed seeds for reproducible fine-tuning runs.
    seed_val = 17
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)           # For GPU
    model.to(DEVICE)
    logger.info("Device: " + str(DEVICE))
    label_values = list(label_dict.values())
    target_names = list(map(str, label_values))
    # BUG FIX: the optimizer and LR scheduler were previously re-created on
    # every training step, wiping the optimizer state (e.g. Adam moments) and
    # restarting the LR schedule each batch. Create them once up front.
    optimizer = set_optimizer(model, epochs, data_loader_train)
    scheduler = set_scheduler(optimizer, data_loader_train)
    for epoch in tqdm(range(1, epochs + 1)):
        model.train()
        loss_train_total = 0
        progress_bar = tqdm(data_loader_train,
                            desc="Epoch {:1d}".format(epoch),
                            leave=False,
                            disable=False)
        for batch in progress_bar:
            model.zero_grad()
            # Each component of the batch is in the correct device
            batch = tuple(b.to(DEVICE) for b in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[2]
            }
            outputs = model(**inputs)
            loss = outputs[0]
            loss_train_total += loss.item()
            loss.backward()
            # Clip gradients to stabilize training (run_glue convention).
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            logger.info(str(progress_bar) + str({"training_loss": "{:.3f}". \
                        format(loss.item()/len(batch))}))

        # Checkpoint the model after every epoch.
        model_name = CFG["OUT_MODELS"]["dir"] + "/" + f"BERT_ft_epoch{epoch}.model"
        torch.save(model.state_dict(), model_name)
        # BUG FIX: missing f-prefix printed the literal text "{epoch}".
        tqdm.write(f"\nEpoch {epoch}")
        loss_train_avg = loss_train_total/len(data_loader_train)
        tqdm.write(f'Training loss: {loss_train_avg}')

        val_loss, predictions, true_values = evaluate(data_loader_val, model)
        predictions = np.argmax(predictions, axis=1).flatten()
        true_values = true_values.flatten()
        val_f1 = f1_score(true_values, predictions, average="weighted")
        accuracy_s = accuracy_score(true_values, predictions)
        precision_s = precision_score(true_values, predictions, average=None)
        recall_s = recall_score(true_values, predictions, average=None)
        conf_matrix = confusion_matrix(true_values, predictions, labels=label_values)
        # BUG FIX: classification_report takes (y_true, y_pred); the arguments
        # were swapped, mislabeling precision/recall in the report.
        report = classification_report(true_values, predictions, labels=label_values,
                                       target_names=target_names)
        tqdm.write(f"Validation loss: {val_loss}")
        tqdm.write(f"F1 Score (weighted): {val_f1}")
        logger.info("Classification Report:")
        logger.info(report)
        logger.info(f"F1 Score (weighted): {val_f1}")
        logger.info(f"Precision: {precision_s}")
        logger.info(f"Recall: {recall_s}")
        logger.info(f"Accuracy: {accuracy_s}")
        logger.info("Dict label: " + str(label_dict))
        logger.info("Confusion Matrix: %s", str(conf_matrix))