Example #1
def download_and_extract(gdrive_ids):
    """
    Download tar archives from Google Drive and extract them
    :param gdrive_ids: list of tuples (<gdrive id>, <destination dir>, <output file name>)
    :return:
    """
    for doc_id, directory, output_name in gdrive_ids:
        download_gdrive_docs([(doc_id, directory, output_name)])
        file_path = os.path.join(directory, output_name)

        logger.info(f"Extracting {file_path}")
        with tarfile.open(file_path) as my_tar:
            my_tar.extractall(directory)
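
A minimal usage sketch for download_and_extract; the Google Drive IDs, directories, and archive names below are placeholders, not values from the original project:

# Hypothetical call: the IDs and paths are placeholders
archives = [
    ("<gdrive id 1>", "data/models", "model.tar.gz"),
    ("<gdrive id 2>", "data/corpora", "corpus.tar.gz"),
]
download_and_extract(archives)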
Example #2
def download_gdrive_docs(gdrive_ids):
    """
    Download a list of documents from Google Drive
    :param gdrive_ids: list of tuples (<gdrive id>, <destination dir>, <output file name>)
    :return:
    """
    for doc_id, directory, output_name in gdrive_ids:
        # only downloads if file does not exist
        output_filepath = os.path.join(directory, output_name)
        if not os.path.isfile(output_filepath):
            logger.info(f"Downloading {output_filepath}")
            url = f"https://drive.google.com/uc?id={doc_id}"
            gdown.download(url, output_filepath, quiet=False)
Example #3
def print_classification(sentences, tps):
    """
    Print the classification
    :param sentences: the sentences that were classified
    :param tps: list of tuples with the three-level classifications
    :return:
    """
    for sentence, tp in zip(sentences, tps):
        logger.info(sentence)
        logger.info(f"Level 1: {tp[0]}")
        logger.info(f"Level 2: {tp[1]}")
        logger.info(f"Level 3: {tp[2]}")
        logger.info("")
Example #4
def download_xlnet_data(full=True):
    """
    Download data for xlnet training
    :param full: whether to download all files or just the vocabulary
    :return:
    """
    path_base = "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased"

    if full:
        filenames = ["spiece.model", "pytorch_model.bin", "config.json"]
    else:
        filenames = ["spiece.model"]

    for name in filenames:
        out_path = os.path.join(XLNET_BASE_PATH, name)

        # only download if the file does not already exist
        if not os.path.isfile(out_path):
            logger.info(f"Downloading {name} for xlnet")
            wget.download(f"{path_base}-{name}", out=out_path)
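
A short usage sketch; XLNET_BASE_PATH is whatever destination directory the surrounding module configures:

# Fetch only the SentencePiece vocabulary, as main() does in every mode
download_xlnet_data(full=False)

# Fetch vocabulary, weights, and config, as the training modes require
download_xlnet_data(full=True)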
Example #5
    def evaluate_model(self,
                       dataloader,
                       output_results_filename="eval_results.txt"):
        """
        Evaluates the model
        :param model_folder: the folder of the model
        :param model_name: the name of the model
        :return:
        """

        eval_general_metrics, class_report = self.get_reports(dataloader)
        results_string = self.get_printed_eval_results(eval_general_metrics,
                                                       class_report)

        # Save the file report
        output_eval_file = os.path.join(self.model_folder,
                                        output_results_filename)
        logger.info(results_string)

        with open(output_eval_file, "w") as writer:
            writer.write(results_string)
Example #6
def main(mode):
    """
    Download all the data needed for the given mode.
    :param mode: which set of files to download (see the branches below)
    :return:
    """
    logger.info(f"Downloading data for mode {mode}")

    # download the main datasets in all modes
    download_gdrive_docs(GDRIVE_IDS["main_dataset"])

    # Download the XLNet vocabulary
    download_xlnet_data(full=False)

    if mode == "prediction_models_single":
        download_and_extract(GDRIVE_IDS["predict_single"])
    elif mode == "prediction_models_multi":
        download_and_extract(GDRIVE_IDS["predict_multi"])
    elif mode == "train_data_single":
        download_xlnet_data()
        download_gdrive_docs(GDRIVE_IDS["train_single"])
    elif mode == "train_data_multi":
        download_xlnet_data()
        download_gdrive_docs(GDRIVE_IDS["train_multi"])

    logger.info("Finished downloading all the data")
Example #7
    def get_predictions(self, sentences):
        """
        Get the string predictions for each sentence
        :param sentences: the sentences
        :return: a dataframe containing the sentences and the predictions
        """
        """
        Makes prediction on sentences
        :param sentences: the sentences
        :return: a dataframe a dataframe with sentences and predictions
        """
        self.tag2idx = get_existing_tag2idx(self.model_folder)
        tag2name = {self.tag2idx[key]: key for key in self.tag2idx.keys()}

        model = XLNetForSequenceClassification.from_pretrained(
            self.model_folder, num_labels=len(tag2name))
        model.to(self.device)
        model.eval()

        logger.info("Setting input embedding")

        inputs, masks, segs = generate_dataloader_input(sentences)
        dataloader = get_dataloader(inputs, masks, segs, BATCH_NUM)

        nb_eval_steps = 0

        y_predict = []
        logger.info("Running evaluation...")

        for step, batch in enumerate(dataloader):
            if nb_eval_steps % 100 == 0:
                logger.info(f"Step {nb_eval_steps}")

            batch = tuple(t.to(self.device) for t in batch)
            b_input_ids, b_input_mask, b_segs = batch

            with torch.no_grad():
                outputs = model(
                    input_ids=b_input_ids,
                    token_type_ids=b_segs,
                    input_mask=b_input_mask,
                )
                logits = outputs[0]

            # Get text classification predict result
            logits = logits.detach().cpu().numpy()

            for predict in np.argmax(logits, axis=1):
                y_predict.append(predict)

            nb_eval_steps += 1

        final_df = pd.DataFrame({
            "sentences": sentences,
            "label": [tag2name[pred] for pred in y_predict],
            "y_pred": y_predict
        })

        return final_df
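
A usage sketch for get_predictions; the class name SentenceClassifier and its constructor arguments are assumptions, since the excerpt does not show the class definition:

# Hypothetical: class name and constructor arguments are placeholders
classifier = SentenceClassifier(model_folder="models/xlnet_level1")
df = classifier.get_predictions([
    "The quarterly report was filed on time.",
    "Please reset my password.",
])
print(df[["sentences", "label"]])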
Example #8
    def save_model(self):
        if not self.model:
            raise IOError("No model to save")

        # Make the save folder if it does not exist
        if not os.path.exists(self.model_folder):
            os.makedirs(self.model_folder)

        # Save a trained model and configuration
        model_to_save = (self.model.module
                         if hasattr(self.model, "module") else self.model)

        output_model_file = os.path.join(self.model_folder, PYTORCH_MODEL_NAME)
        output_config_file = os.path.join(self.model_folder, CONFIG_FILENAME)
        tag2idx_file = os.path.join(self.model_folder, TAG2IDX_FILENAME)

        logger.info("Saving the model...")

        # Save model into file
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)

        # save tag2idx pickle
        torch.save(self.tag2idx, tag2idx_file)
Example #9
    def split_into_dataloaders(self, input_csv):
        """
        :param input_csv: csv with two columns named text and labels
        :return:
        """
        logger.info("Loading data from csv...")

        df = pd.read_csv(input_csv)

        sentences = df.text.to_list()
        labels = df.labels.to_list()

        self.tag2idx = {t: i for i, t in enumerate(set(labels))}

        logger.info("Setting input embedding...")

        full_input_ids, full_input_masks, full_segment_ids, tags = generate_dataloader_input(
            sentences, labels, self.tag2idx)

        # split the data
        (
            tr_inputs,
            val_inputs,
            tr_tags,
            val_tags,
            tr_masks,
            val_masks,
            tr_segs,
            val_segs,
        ) = train_test_split(
            full_input_ids,
            tags,
            full_input_masks,
            full_segment_ids,
            random_state=4,
            test_size=0.3,
        )

        logger.info("Getting dataloaders...")

        train_dataloader = get_dataloader(tr_inputs, tr_masks, tr_segs,
                                          BATCH_NUM, tr_tags)
        valid_dataloader = get_dataloader(val_inputs, val_masks, val_segs,
                                          BATCH_NUM, val_tags)

        return train_dataloader, valid_dataloader
Example #10
    def train_model(self, train_dataloader, epochs=5, max_grad_norm=1.0):
        """
        :param save_folder: where to save the model
        :param model_name: the name the model will receive
        :param save_model:  whether to save the model or not
        :return:
        """
        logger.info("Preparing for training...")

        self.model = XLNetForSequenceClassification.from_pretrained(
            XLNET_BASE_PATH, num_labels=len(self.tag2idx))

        self.model.to(self.device)
        if self.n_gpu > 1:
            self.model = torch.nn.DataParallel(self.model)

        # Calculate the total number of optimization steps
        num_train_optimization_steps = (
            int(math.ceil(len(train_dataloader.dataset) / BATCH_NUM)) * epochs)

        # Fine-tune all layer parameters; no weight decay on bias/gamma/beta
        param_optimizer = list(self.model.named_parameters())
        no_decay = ["bias", "gamma", "beta"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay_rate": 0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay_rate": 0.0,
            },
        ]

        optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

        self.model.train()

        logger.info("----- Running training -----")
        logger.info("  Num examples = %d" % (len(train_dataloader.dataset)))
        logger.info("  Batch size = %d" % (BATCH_NUM))
        logger.info("  Num steps = %d" % (num_train_optimization_steps))
        for i in trange(epochs, desc="Epoch"):
            self.tr_loss = 0
            self.nb_tr_steps = 0
            for step, batch in enumerate(train_dataloader):
                if self.nb_tr_steps % 100 == 0:
                    logger.info(f"Step {self.nb_tr_steps}")
                # add batch to gpu
                batch = tuple(t.to(self.device) for t in batch)
                b_input_ids, b_input_mask, b_segs, b_labels = batch

                # forward pass
                outputs = self.model(
                    input_ids=b_input_ids,
                    token_type_ids=b_segs,
                    input_mask=b_input_mask,
                    labels=b_labels,
                )
                loss, logits = outputs[:2]
                if self.n_gpu > 1:
                    # When multi gpu, average it
                    loss = loss.mean()

                # backward pass
                loss.backward()

                # track train loss
                self.tr_loss += loss.item()
                self.nb_tr_steps += 1

                # gradient clipping
                torch.nn.utils.clip_grad_norm_(
                    parameters=self.model.parameters(), max_norm=max_grad_norm)

                # update parameters
                optimizer.step()
                optimizer.zero_grad()

            # print train loss per epoch
            logger.info("Epoch: {}".format(i))
            logger.info("Train loss: {}".format(self.tr_loss /
                                                self.nb_tr_steps))
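
Combining examples #9, #10, #5, and #8, a plausible end-to-end training run might look like the following; the class name and csv path are assumptions:

# Hypothetical pipeline: class name and paths are placeholders
trainer = SentenceClassifier(model_folder="models/xlnet_level1")

train_dataloader, valid_dataloader = trainer.split_into_dataloaders("data/train.csv")
trainer.train_model(train_dataloader, epochs=5)
trainer.save_model()
trainer.evaluate_model(valid_dataloader)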
Example #11
    def get_reports(self, dataloader):
        logger.info("Loading model for evaluation...")

        if not self.tag2idx:
            self.tag2idx = get_existing_tag2idx(self.model_folder)

        model = XLNetForSequenceClassification.from_pretrained(
            self.model_folder, num_labels=len(self.tag2idx))
        model.to(self.device)
        if self.n_gpu > 1:
            model = torch.nn.DataParallel(model)

        model.eval()

        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps = 0

        y_true = []
        y_predict = []

        logger.info("----- Running evaluation -----")
        logger.info("  Num examples ={}".format(len(dataloader.dataset)))
        logger.info("  Batch size = {}".format(BATCH_NUM))
        for step, batch in enumerate(dataloader):
            if nb_eval_steps % 100 == 0:
                logger.info(f"Step {nb_eval_steps}")

            batch = tuple(t.to(self.device) for t in batch)
            b_input_ids, b_input_mask, b_segs, b_labels = batch

            with torch.no_grad():
                outputs = model(
                    input_ids=b_input_ids,
                    token_type_ids=b_segs,
                    input_mask=b_input_mask,
                    labels=b_labels,
                )
                tmp_eval_loss, logits = outputs[:2]

            # Get predictions
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to("cpu").numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            # Save predictions and gold labels
            for predict in np.argmax(logits, axis=1):
                y_predict.append(predict)

            for real_result in label_ids.tolist():
                y_true.append(real_result)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / len(dataloader.dataset)

        eval_general_metrics = {
            "eval_loss": eval_loss,
            "eval_accuracy": eval_accuracy
        }

        # if the model was trained in this session, we can also report the training loss
        if self.tr_loss:
            loss = self.tr_loss / self.nb_tr_steps
            eval_general_metrics["loss"] = loss

        class_report = classification_report(y_pred=np.array(y_predict),
                                             y_true=np.array(y_true))

        return eval_general_metrics, class_report