Example #1
def make_figure_three(
    *, args, topn_results_obs, topn_results_counter_diss, topn_results_counter_suff
):
    # The results are already passed in as keyword arguments, so there is no
    # need to reload them from disk here.
    plt.figure(figsize=(5, 4))

    # Per-rank accuracy and binomial standard error for each method
    p_obs = sum(topn_results_obs) / len(topn_results_obs)
    error_obs = np.sqrt(p_obs * (1 - p_obs) / len(topn_results_obs))
    p_counter = sum(topn_results_counter_suff) / len(topn_results_counter_suff)
    error_counter = np.sqrt(
        p_counter * (1 - p_counter) / len(topn_results_counter_suff)
    )

    xmarks = [i + 1 for i in range(len(p_obs))]

    plt.plot(xmarks, 1 - p_obs, label="Associative", color="blue")
    plt.plot(xmarks, 1 - p_counter, label="Counterfactual", color="seagreen")
    plt.plot(xmarks, 1 - (1 - p_counter) / (1 - p_obs), linestyle="--", color="black")

    plt.fill_between(
        xmarks,
        1 - p_obs - 2 * error_obs,
        1 - p_obs + 2 * error_obs,
        alpha=0.2,
        edgecolor="#1B2ACC",
        facecolor="#089FFF",
        linewidth=0,
        linestyle="None",
        antialiased=True,
    )

    plt.fill_between(
        xmarks,
        1 - p_counter - 2 * error_counter,
        1 - p_counter + 2 * error_counter,
        alpha=0.2,
        edgecolor="#1B2ACC",
        facecolor="seagreen",
        linewidth=0,
        linestyle="None",
        antialiased=True,
    )

    plt.xticks([i + 1 for i in range(16)])
    plt.xlim(1, 15)
    plt.ylim(0, 0.5)
    plt.legend()
    # Save before show(); calling show() first can leave the saved figure blank.
    plt.savefig(args.results / "algo_vs_algo.pdf")
    plt.show()
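
Note: every example on this page relies on small pickle helpers (load_from_pickle, save_to_pickle, dump_to_pickle, save_in_pickle) defined elsewhere in each project, and their exact signatures differ between projects (some take the path first, some take extra flags). A minimal sketch of the simplest form such helpers usually take is below; it is an assumption, not any of the original implementations.

# Minimal sketch of the assumed pickle helpers; adjust the signatures to match your project.
import pickle


def load_from_pickle(path):
    """Return the object pickled at `path`."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


def save_to_pickle(obj, path):
    """Pickle `obj` to `path`, overwriting any existing file."""
    with open(path, "wb") as handle:
        pickle.dump(obj, handle)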
Example #2
def preprocess_data(train_files, config):
    x_train, y_train, x_dev, y_dev = [], [], [], []
    for file in tqdm(train_files, desc="preprocessing_data"):
        cache_file_path = file.replace("data", "cache").replace(
            ".csv", "_fw{}_pw{}_pad{}_ts{}_cache.pickle".format(
                config["feature_window_size"],
                config["prediction_window_size"], config["pad_size"],
                config["test_size"]))
        if os.path.exists(cache_file_path):
            print("Load cache from {}".format(cache_file_path))
            x_window_train, y_window_train, x_window_dev, y_window_dev = utils.load_from_pickle(
                cache_file_path)
        else:
            x_train_single, y_train_single, x_dev_single, y_dev_single = read_data_from_file(
                file, config["test_size"])
            x_window_train, y_window_train, x_window_dev, y_window_dev = \
                prepare_data(x_train_single, y_train_single, x_dev_single, y_dev_single, config)
            print("Save cache to {}".format(cache_file_path))
            utils.save_to_pickle(
                (x_window_train, y_window_train, x_window_dev, y_window_dev),
                cache_file_path)
        x_train.extend(x_window_train)
        y_train.extend(y_window_train)
        x_dev.extend(x_window_dev)
        y_dev.extend(y_window_dev)
    return x_train, y_train, x_dev, y_dev
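
Note: a sketch of how preprocess_data might be invoked. The paths and values are illustrative only; the function itself reads only the feature_window_size, prediction_window_size, pad_size and test_size keys, and expects a cache/ directory mirroring the data/ layout.

# Hypothetical invocation of preprocess_data from Example #2 (values are placeholders).
config = {
    "feature_window_size": 30,
    "prediction_window_size": 5,
    "pad_size": 0,
    "test_size": 0.2,
}
train_files = ["data/session_01.csv", "data/session_02.csv"]
x_train, y_train, x_dev, y_dev = preprocess_data(train_files, config)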
Example #3
def get_bot_response():
    user_text = request.args.get('msg')

    custom_answer = find_custom_answer(user_text, threshold=COS_SIM_THRESHOLD)
    if custom_answer:
        reply_text = custom_answer
    else:
        helper_data = load_from_pickle(TMP_FILENAME_FOR_DIALOGUE_HELPER_DATA)
        restart_dialogue = helper_data["restart_dialogue"]
        chat_history_ids = helper_data["chat_history_ids"]

        if any(w == user_text.strip().lower() for w in restart_keywords):
            reply_text = "Ok, let's start from scratch, I am ready"
            restart_dialogue = True
        elif any(w == user_text.strip().lower() for w in exit_keywords):
            reply_text = "Ok, bye! Just waiting if you type something..."
            restart_dialogue = True
        else:
            reply_text, chat_history_ids = dialog_gpt(user_text,
                                                      chat_history_ids,
                                                      restart_dialogue)
            restart_dialogue = False

        helper_data = {
            "restart_dialogue": restart_dialogue,
            "chat_history_ids": chat_history_ids
        }
        save_to_pickle(helper_data, TMP_FILENAME_FOR_DIALOGUE_HELPER_DATA)

    return reply_text
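
Note: get_bot_response reads request.args, which implies a Flask view function. A minimal sketch of the wiring it would need is below; the route path and app object are assumptions, not taken from the original project, and the handler itself must still import request (and its keyword lists) as in Example #3.

# Hypothetical Flask wiring for the handler above; the "/get" route is an assumption.
from flask import Flask

app = Flask(__name__)
app.add_url_rule("/get", view_func=get_bot_response)

if __name__ == "__main__":
    app.run()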
Example #4
    def _load(self):
        """ Load previous robot objects, currently using pickle. """

        try:
            robot_list = load_from_pickle()
        except IOError:
            pass # Just pass, self.robots need not be modified
        else:
            # Update instead of assignment, in case loading somehow happens after some bots are already created
            self.robots.update({bot.name: bot for bot in robot_list})
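
Note: the excerpt only shows the load side. A hypothetical counterpart that persists self.robots through the same pickle machinery is sketched below; it assumes a save_to_pickle() helper symmetric to the zero-argument load_from_pickle() used in _load(), which is not shown in the original code.

    def _save(self):
        """ Hypothetical counterpart to _load(): persist current robot objects via pickle. """
        # Assumes a save_to_pickle() helper symmetric to load_from_pickle() above.
        save_to_pickle(list(self.robots.values()))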
Example #5
    def __init__(self,
                 logger,
                 config,
                 data_name,
                 data_path,
                 embed_path=None,
                 user_dict=None,
                 vocab_path=None,
                 stop_word=None,
                 max_len=50,
                 query_max_len=20,
                 target_max_len=20,
                 test_split=0.0,
                 training=True):
        self.logger = logger
        self.reset = config.reset
        self._data_dir = Path('data') / data_name

        self.query_max_len = query_max_len
        self.target_max_len = target_max_len
        self.max_len = max_len

        if training:
            embedding_path = self._data_dir / embed_path
            print(embedding_path.absolute())
            self._embedding = Embedding(str(embedding_path), logger=logger)

        print("Begin to build segment, feature engineering and ngram ...")
        self._segment = Segment_jieba(user_dict=str(self._data_dir /
                                                    user_dict))

        if training:
            print(f"Begin to build vocab")
            self._vocab = Vocab(str(self._data_dir / 'RAW' / vocab_path),
                                self._segment, self._embedding)
            self.word2idx, self.idx2word = self._vocab.word2idx, self._vocab.idx2word
            dump_to_pickle(str(self._data_dir / 'vocab.pkl'),
                           (self.word2idx, self.idx2word), self.reset)
        else:
            print(f"load the vocab")
            (self.word2idx, self.idx2word) = load_from_pickle(
                str(self._data_dir / 'vocab.pkl'))

        self.vocab_size = len(self.word2idx)
        if training:
            filename = str(self._data_dir / 'RAW' / data_path)
            # Split the raw data into train and test sets
            self._get_train_and_test(filename, test_split)
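
Note: a sketch of how this constructor might be called for training. The class name is not shown in the excerpt, and every path and value below is a placeholder, not taken from the original configuration.

# Hypothetical construction for training; DatasetLoader is a placeholder for the real class name.
loader = DatasetLoader(
    logger=logger,
    config=config,                 # must expose a `reset` attribute
    data_name="my_corpus",         # data lives under data/my_corpus/
    data_path="train.csv",         # read from data/my_corpus/RAW/train.csv
    embed_path="embeddings.txt",
    user_dict="user_dict.txt",
    vocab_path="corpus.txt",
    test_split=0.1,
    training=True,
)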
Example #6
def produce_results(*, args):
    print(f"> Producing results from {str(args.results.absolute())}")

    topn_results_obs = load_from_pickle(args.results / RESULTS_OBS_FILE)
    topn_results_counter_diss = load_from_pickle(
        args.results / RESULTS_CF_DISSABLEMENT_FILE
    )
    topn_results_counter_suff = load_from_pickle(
        args.results / RESULTS_CF_SUFFICIENCY_FILE
    )

    make_supplementary_table_one(
        args=args,
        topn_results_obs=topn_results_obs,
        topn_results_counter_diss=topn_results_counter_diss,
        topn_results_counter_suff=topn_results_counter_suff,
    )
    make_figure_three(
        args=args,
        topn_results_obs=topn_results_obs,
        topn_results_counter_diss=topn_results_counter_diss,
        topn_results_counter_suff=topn_results_counter_suff,
    )
    make_table_one_and_supplementary_table_two(
        args=args,
        topn_results_obs=topn_results_obs,
        topn_results_counter_diss=topn_results_counter_diss,
        topn_results_counter_suff=topn_results_counter_suff,
    )
    df_results, doc_topn = make_supplementary_table_three(
        args=args,
        topn_results_obs=topn_results_obs,
        topn_results_counter_diss=topn_results_counter_diss,
        topn_results_counter_suff=topn_results_counter_suff,
    )
    make_table_two(args=args, df_results=df_results, doc_topn=doc_topn)
    make_figure_four(args=args, df_results=df_results)
Example #7
def main():
    # Read all the data instances
    task_instances_dict, tag_statistics, question_keys_and_tags = load_from_pickle(
        args.data_file)
    data, subtasks_list = get_multitask_instances_for_valid_tasks(
        task_instances_dict, tag_statistics)

    if args.retrain:
        logging.info("Creating and training the model from 'bert-base-cased' ")
        # Create the save_directory if not exists
        make_dir_if_not_exists(args.save_directory)

        # Initialize tokenizer and model with pretrained weights
        tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        config = BertConfig.from_pretrained('bert-base-cased')
        config.subtasks = subtasks_list
        # print(config)
        model = MultiTaskBertForCovidEntityClassification.from_pretrained(
            'bert-base-cased', config=config)

        # Add new tokens in tokenizer
        new_special_tokens_dict = {
            "additional_special_tokens": ["<E>", "</E>", "<URL>", "@USER"]
        }
        # new_special_tokens_dict = {"additional_special_tokens": ["<E>", "</E>"]}
        tokenizer.add_special_tokens(new_special_tokens_dict)

        # Add the new embeddings in the weights
        print("Embeddings type:",
              model.bert.embeddings.word_embeddings.weight.data.type())
        print("Embeddings shape:",
              model.bert.embeddings.word_embeddings.weight.data.size())
        embedding_size = model.bert.embeddings.word_embeddings.weight.size(1)
        new_embeddings = torch.FloatTensor(
            len(new_special_tokens_dict["additional_special_tokens"]),
            embedding_size).uniform_(-0.1, 0.1)
        # new_embeddings = torch.FloatTensor(2, embedding_size).uniform_(-0.1, 0.1)
        print("new_embeddings shape:", new_embeddings.size())
        new_embedding_weight = torch.cat(
            (model.bert.embeddings.word_embeddings.weight.data,
             new_embeddings), 0)
        model.bert.embeddings.word_embeddings.weight.data = new_embedding_weight
        print("Embeddings shape:",
              model.bert.embeddings.word_embeddings.weight.data.size())
        # Update model config vocab size
        model.config.vocab_size = model.config.vocab_size + len(
            new_special_tokens_dict["additional_special_tokens"])
    else:
        # Load the tokenizer and model from the save_directory
        tokenizer = BertTokenizer.from_pretrained(args.save_directory)
        model = MultiTaskBertForCovidEntityClassification.from_pretrained(
            args.save_directory)
        # print(model.state_dict().keys())
        # TODO save and load the subtask classifier weights separately
        # Load from individual state dicts
        for subtask in model.subtasks:
            model.classifiers[subtask].load_state_dict(
                torch.load(
                    os.path.join(args.save_directory,
                                 f"{subtask}_classifier.bin")))
        # print(model.config)
        # exit()
    model.to(device)
    # Explicitly move the classifiers to device
    for subtask, classifier in model.classifiers.items():
        classifier.to(device)
    entity_start_token_id = tokenizer.convert_tokens_to_ids(["<E>"])[0]

    logging.info(
        f"Task dataset for task: {args.task} loaded from {args.data_file}.")

    model_config = dict()
    results = dict()

    # Split the data into train, dev and test and shuffle the train segment
    train_data, dev_data, test_data = split_multitask_instances_in_train_dev_test(
        data)
    random.shuffle(train_data)  # shuffle happens in-place
    logging.info("Train Data:")
    total_train_size, pos_subtasks_train_size, neg_subtasks_train_size = log_multitask_data_statistics(
        train_data, model.subtasks)
    logging.info("Dev Data:")
    total_dev_size, pos_subtasks_dev_size, neg_subtasks_dev_size = log_multitask_data_statistics(
        dev_data, model.subtasks)
    logging.info("Test Data:")
    total_test_size, pos_subtasks_test_size, neg_subtasks_test_size = log_multitask_data_statistics(
        test_data, model.subtasks)
    logging.info("\n")
    model_config["train_data"] = {
        "size": total_train_size,
        "pos": pos_subtasks_train_size,
        "neg": neg_subtasks_train_size
    }
    model_config["dev_data"] = {
        "size": total_dev_size,
        "pos": pos_subtasks_dev_size,
        "neg": neg_subtasks_dev_size
    }
    model_config["test_data"] = {
        "size": total_test_size,
        "pos": pos_subtasks_test_size,
        "neg": neg_subtasks_test_size
    }

    # Extract subtasks data for dev and test
    dev_subtasks_data = split_data_based_on_subtasks(dev_data, model.subtasks)
    test_subtasks_data = split_data_based_on_subtasks(test_data,
                                                      model.subtasks)

    # Load the instances into pytorch dataset
    train_dataset = COVID19TaskDataset(train_data)
    dev_dataset = COVID19TaskDataset(dev_data)
    test_dataset = COVID19TaskDataset(test_data)
    logging.info("Loaded the datasets into Pytorch datasets")

    tokenize_collator = TokenizeCollator(tokenizer, model.subtasks,
                                         entity_start_token_id)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=POSSIBLE_BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=0,
                                  collate_fn=tokenize_collator)
    dev_dataloader = DataLoader(dev_dataset,
                                batch_size=POSSIBLE_BATCH_SIZE,
                                shuffle=False,
                                num_workers=0,
                                collate_fn=tokenize_collator)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=POSSIBLE_BATCH_SIZE,
                                 shuffle=False,
                                 num_workers=0,
                                 collate_fn=tokenize_collator)
    logging.info("Created train and test dataloaders with batch aggregation")

    # Only retrain if needed
    if args.retrain:
        print('DO RETRAIN')
        ##################################################################################################
        # NOTE: Training Tutorial Reference
        # https://mccormickml.com/2019/07/22/BERT-fine-tuning/#41-bertforsequenceclassification
        ##################################################################################################

        # Create an optimizer training schedule for the BERT text classification model
        # NOTE: AdamW is a class from the huggingface library (as opposed to pytorch)
        # I believe the 'W' stands for 'Weight Decay fix'
        # Recommended Schedule for BERT fine-tuning as per the paper
        # Batch size: 16, 32
        # Learning rate (Adam): 5e-5, 3e-5, 2e-5
        # Number of epochs: 2, 3, 4
        optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
        logging.info("Created model optimizer")
        # Number of training epochs. The BERT authors recommend between 2 and 4.
        # We chose to run for 4, but we'll see later that this may be over-fitting the
        # training data.
        epochs = args.n_epochs

        # Total number of training steps is [number of batches] x [number of epochs].
        # (Note that this is not the same as the number of training samples).
        total_steps = len(train_dataloader) * epochs

        # Create the learning rate scheduler.
        # NOTE: num_warmup_steps = 0 is the Default value in run_glue.py
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=total_steps)
        # We'll store a number of quantities such as training and validation loss,
        # validation accuracy, and timings.
        training_stats = []

        logging.info(f"Initiating training loop for {args.n_epochs} epochs...")
        # Measure the total training time for the whole run.
        total_start_time = time.time()

        # Number of gradient-accumulation steps (integer division keeps the modulo check below exact)
        accumulation_steps = args.batch_size // POSSIBLE_BATCH_SIZE

        # Loss trajectory for epochs
        epoch_train_loss = list()
        # Dev validation trajectory
        dev_subtasks_validation_statistics = {
            subtask: list()
            for subtask in model.subtasks
        }
        for epoch in range(epochs):
            pbar = tqdm(train_dataloader)
            logging.info(f"Initiating Epoch {epoch+1}:")
            # Reset the total loss for each epoch.
            total_train_loss = 0
            train_loss_trajectory = list()

            # Reset timer for each epoch
            start_time = time.time()
            model.train()

            dev_log_frequency = 5
            n_steps = len(train_dataloader)
            dev_steps = int(n_steps / dev_log_frequency)
            for step, batch in enumerate(pbar):
                # Upload labels of each subtask to device
                for subtask in model.subtasks:
                    subtask_labels = batch["gold_labels"][subtask]
                    subtask_labels = subtask_labels.to(device)
                    # print("HAHAHAHAH:", subtask_labels.is_cuda)
                    batch["gold_labels"][subtask] = subtask_labels
                    # print("HAHAHAHAH:", batch["gold_labels"][subtask].is_cuda)
                # Forward
                input_dict = {
                    "input_ids":
                    batch["input_ids"].to(device),
                    "entity_start_positions":
                    batch["entity_start_positions"].to(device),
                    "labels":
                    batch["gold_labels"]
                }

                input_ids = batch["input_ids"]
                entity_start_positions = batch["entity_start_positions"]
                gold_labels = batch["gold_labels"]
                batch_data = batch["batch_data"]
                loss, logits = model(**input_dict)
                # loss = loss / accumulation_steps
                # Accumulate loss
                total_train_loss += loss.item()

                # Backward: compute gradients
                loss.backward()

                if (step + 1) % accumulation_steps == 0:

                    # Calculate elapsed time in minutes and print loss on the tqdm bar
                    elapsed = format_time(time.time() - start_time)
                    avg_train_loss = total_train_loss / (step + 1)
                    # keep track of changing avg_train_loss
                    train_loss_trajectory.append(avg_train_loss)
                    pbar.set_description(
                        f"Epoch:{epoch+1}|Batch:{step}/{len(train_dataloader)}|Time:{elapsed}|Avg. Loss:{avg_train_loss:.4f}|Loss:{loss.item():.4f}"
                    )

                    # Clip the norm of the gradients to 1.0.
                    # This is to help prevent the "exploding gradients" problem.
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                    # Update parameters
                    optimizer.step()

                    # Clean the model's previous gradients
                    model.zero_grad()  # Reset gradients tensors

                    # Update the learning rate.
                    scheduler.step()
                    pbar.update()
                if (step + 1) % dev_steps == 0:
                    # Perform validation with the model and log the performance
                    logging.info("Running Validation...")
                    # Put the model in evaluation mode--the dropout layers behave differently
                    # during evaluation.
                    model.eval()
                    dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset(
                        dev_dataloader, model, device, args.task + "_dev",
                        True)
                    for subtask in model.subtasks:
                        dev_subtask_data = dev_subtasks_data[subtask]
                        dev_subtask_prediction_scores = dev_prediction_scores[
                            subtask]
                        dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN(
                            dev_subtask_data, dev_subtask_prediction_scores)
                        logging.info(
                            f"Subtask:{subtask:>15}\tN={dev_TP + dev_FN}\tF1={dev_F1}\tP={dev_P}\tR={dev_R}\tTP={dev_TP}\tFP={dev_FP}\tFN={dev_FN}"
                        )
                        dev_subtasks_validation_statistics[subtask].append(
                            (epoch + 1, step + 1, dev_TP + dev_FN, dev_F1,
                             dev_P, dev_R, dev_TP, dev_FP, dev_FN))

                    # logging.info("DEBUG:Validation on Test")
                    # dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset(test_dataloader, model, device, args.task + "_dev", True)
                    # for subtask in model.subtasks:
                    # 	dev_subtask_data = test_subtasks_data[subtask]
                    # 	dev_subtask_prediction_scores = dev_prediction_scores[subtask]
                    # 	dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN(dev_subtask_data, dev_subtask_prediction_scores)
                    # 	logging.info(f"Subtask:{subtask:>15}\tN={dev_TP + dev_FN}\tF1={dev_F1}\tP={dev_P}\tR={dev_R}\tTP={dev_TP}\tFP={dev_FP}\tFN={dev_FN}")
                    # 	dev_subtasks_validation_statistics[subtask].append((epoch + 1, step + 1, dev_TP + dev_FN, dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN))
                    # Put the model back in train setting
                    model.train()

            # Calculate the average loss over all of the batches.
            avg_train_loss = total_train_loss / len(train_dataloader)

            training_time = format_time(time.time() - start_time)

            # Record all statistics from this epoch.
            training_stats.append({
                'epoch': epoch + 1,
                'Training Loss': avg_train_loss,
                'Training Time': training_time
            })

            # Save the loss trajectory
            epoch_train_loss.append(train_loss_trajectory)

        logging.info(
            f"Training complete with total Train time:{format_time(time.time()- total_start_time)}"
        )
        log_list(training_stats)

        # Save the model and the Tokenizer here:
        logging.info(
            f"Saving the model and tokenizer in {args.save_directory}")
        model.save_pretrained(args.save_directory)
        # Save each subtask classifiers weights to individual state dicts
        for subtask, classifier in model.classifiers.items():
            classifier_save_file = os.path.join(args.save_directory,
                                                f"{subtask}_classifier.bin")
            logging.info(
                f"Saving the model's {subtask} classifier weights at {classifier_save_file}"
            )
            torch.save(classifier.state_dict(), classifier_save_file)
        tokenizer.save_pretrained(args.save_directory)

        # Plot the train loss trajectory in a plot
        train_loss_trajectory_plot_file = os.path.join(
            args.output_dir, "train_loss_trajectory.png")
        logging.info(
            f"Saving the Train loss trajectory at {train_loss_trajectory_plot_file}"
        )
        plot_train_loss(epoch_train_loss, train_loss_trajectory_plot_file)

        # TODO: Plot the validation performance
        # Save dev_subtasks_validation_statistics
    else:
        logging.info("No training needed. Directly going to evaluation!")

    # Save the model name in the model_config file
    model_config["model"] = "MultiTaskBertForCovidEntityClassification"
    model_config["epochs"] = args.n_epochs

    # Find best threshold for each subtask based on dev set performance
    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    test_predicted_labels, test_prediction_scores, test_gold_labels = make_predictions_on_dataset(
        test_dataloader, model, device, args.task, True)
    dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset(
        dev_dataloader, model, device, args.task + "_dev", True)

    best_test_thresholds = {subtask: 0.5 for subtask in model.subtasks}
    best_dev_thresholds = {subtask: 0.5 for subtask in model.subtasks}
    best_test_F1s = {subtask: 0.0 for subtask in model.subtasks}
    best_dev_F1s = {subtask: 0.0 for subtask in model.subtasks}
    test_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks}
    dev_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks}
    # for subtask in model.subtasks:
    # 	test_subtask_data = test_subtasks_data[subtask]
    # 	test_subtask_prediction_scores = test_prediction_scores[subtask]
    # 	for t in thresholds:
    # 		test_F1, test_P, test_R, test_TP, test_FP, test_FN = get_TP_FP_FN(test_subtask_data, test_subtask_prediction_scores, THRESHOLD=t)
    # 		test_subtasks_t_F1_P_Rs[subtask].append((t, test_F1, test_P, test_R, test_TP + test_FN, test_TP, test_FP, test_FN))
    # 		if test_F1 > best_test_F1s[subtask]:
    # 			best_test_thresholds[subtask] = t
    # 			best_test_F1s[subtask] = test_F1

    # 	logging.info(f"Subtask:{subtask:>15}")
    # 	log_list(test_subtasks_t_F1_P_Rs[subtask])
    # 	logging.info(f"Best Test Threshold for subtask: {best_test_thresholds[subtask]}\t Best test F1: {best_test_F1s[subtask]}")

    for subtask in model.subtasks:
        dev_subtask_data = dev_subtasks_data[subtask]
        dev_subtask_prediction_scores = dev_prediction_scores[subtask]
        for t in thresholds:
            dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN(
                dev_subtask_data, dev_subtask_prediction_scores, THRESHOLD=t)
            dev_subtasks_t_F1_P_Rs[subtask].append(
                (t, dev_F1, dev_P, dev_R, dev_TP + dev_FN, dev_TP, dev_FP,
                 dev_FN))
            if dev_F1 > best_dev_F1s[subtask]:
                best_dev_thresholds[subtask] = t
                best_dev_F1s[subtask] = dev_F1

        logging.info(f"Subtask:{subtask:>15}")
        log_list(dev_subtasks_t_F1_P_Rs[subtask])
        logging.info(
            f"Best Dev Threshold for subtask: {best_dev_thresholds[subtask]}\t Best dev F1: {best_dev_F1s[subtask]}"
        )

    # Save the best dev threshold and dev_F1 in results dict
    results["best_dev_threshold"] = best_dev_thresholds
    results["best_dev_F1s"] = best_dev_F1s
    results["dev_t_F1_P_Rs"] = dev_subtasks_t_F1_P_Rs

    # Evaluate on Test
    logging.info("Testing on test dataset")
    # test_predicted_labels, test_prediction_scores, test_gold_labels = make_predictions_on_dataset(test_dataloader, model, device, args.task)

    predicted_labels, prediction_scores, gold_labels = make_predictions_on_dataset(
        test_dataloader, model, device, args.task)

    # Test
    for subtask in model.subtasks:
        logging.info(f"Testing the trained classifier on subtask: {subtask}")
        # print(len(test_dataloader))
        # print(len(prediction_scores[subtask]))
        # print(len(test_subtasks_data[subtask]))
        results[subtask] = dict()
        cm = metrics.confusion_matrix(gold_labels[subtask],
                                      predicted_labels[subtask])
        classification_report = metrics.classification_report(
            gold_labels[subtask], predicted_labels[subtask], output_dict=True)
        logging.info(cm)
        logging.info(
            metrics.classification_report(gold_labels[subtask],
                                          predicted_labels[subtask]))
        results[subtask]["CM"] = cm.tolist(
        )  # Storing it as list of lists instead of numpy.ndarray
        results[subtask]["Classification Report"] = classification_report

        # SQuAD style EM and F1 evaluation for all test cases and for positive test cases (i.e. for cases where annotators had a gold annotation)
        EM_score, F1_score, total = get_raw_scores(test_subtasks_data[subtask],
                                                   prediction_scores[subtask])
        logging.info("Word overlap based SQuAD evaluation style metrics:")
        logging.info(f"Total number of cases: {total}")
        logging.info(f"EM_score: {EM_score}")
        logging.info(f"F1_score: {F1_score}")
        results[subtask]["SQuAD_EM"] = EM_score
        results[subtask]["SQuAD_F1"] = F1_score
        results[subtask]["SQuAD_total"] = total
        pos_EM_score, pos_F1_score, pos_total = get_raw_scores(
            test_subtasks_data[subtask],
            prediction_scores[subtask],
            positive_only=True)
        logging.info(f"Total number of Positive cases: {pos_total}")
        logging.info(f"Pos. EM_score: {pos_EM_score}")
        logging.info(f"Pos. F1_score: {pos_F1_score}")
        results[subtask]["SQuAD_Pos. EM"] = pos_EM_score
        results[subtask]["SQuAD_Pos. F1"] = pos_F1_score
        results[subtask]["SQuAD_Pos. EM_F1_total"] = pos_total

        # New evaluation suggested by Alan
        F1, P, R, TP, FP, FN = get_TP_FP_FN(
            test_subtasks_data[subtask],
            prediction_scores[subtask],
            THRESHOLD=best_dev_thresholds[subtask])
        logging.info("New evaluation scores:")
        logging.info(f"F1: {F1}")
        logging.info(f"Precision: {P}")
        logging.info(f"Recall: {R}")
        logging.info(f"True Positive: {TP}")
        logging.info(f"False Positive: {FP}")
        logging.info(f"False Negative: {FN}")
        results[subtask]["F1"] = F1
        results[subtask]["P"] = P
        results[subtask]["R"] = R
        results[subtask]["TP"] = TP
        results[subtask]["FP"] = FP
        results[subtask]["FN"] = FN
        N = TP + FN
        results[subtask]["N"] = N

        # # Top predictions in the Test case
        # prediction_scores[subtask] = np.array(prediction_scores[subtask])
        # sorted_prediction_ids = np.argsort(-prediction_scores[subtask])
        # K = 200
        # logging.info("Top {} predictions:".format(K))
        # logging.info("\t".join(["Tweet", "BERT model input", "candidate chunk", "prediction score", "predicted label", "gold label", "gold chunks"]))
        # for i in range(K):
        # 	instance_id = sorted_prediction_ids[i]
        # 	# text :: candidate_chunk :: candidate_chunk_id :: chunk_start_text_id :: chunk_end_text_id :: tokenized_tweet :: tokenized_tweet_with_masked_q_token :: tagged_chunks :: question_label
        # 	tweet = test_subtasks_data[subtask][instance_id][0].replace("\n", " ")
        # 	chunk = test_subtasks_data[subtask][instance_id][1]
        # 	tokenized_tweet_with_masked_chunk = test_subtasks_data[subtask][instance_id][6]
        # 	if chunk in ["AUTHOR OF THE TWEET", "NEAR AUTHOR OF THE TWEET"]:
        # 		# First element of the text will be considered as AUTHOR OF THE TWEET or NEAR AUTHOR OF THE TWEET
        # 		bert_model_input_text = tokenized_tweet_with_masked_chunk.replace(Q_TOKEN, "<E> </E>")
        # 		# print(tokenized_tweet_with_masked_chunk)
        # 		# print(bert_model_input_text)
        # 		# exit()
        # 	else:
        # 		bert_model_input_text = tokenized_tweet_with_masked_chunk.replace(Q_TOKEN, "<E> " + chunk + " </E>")
        # 	list_to_print = [tweet, bert_model_input_text, chunk, str(prediction_scores[subtask][instance_id]), str(predicted_labels[subtask][instance_id]), str(test_subtasks_data[subtask][instance_id][-1]), str(test_subtasks_data[subtask][instance_id][-2])]
        # 	logging.info("\t".join(list_to_print))

    # Save model_config and results
    model_config_file = os.path.join(args.output_dir, "model_config.json")
    results_file = os.path.join(args.output_dir, "results.json")
    logging.info(f"Saving model config at {model_config_file}")
    save_in_json(model_config, model_config_file)
    logging.info(f"Saving results at {results_file}")
    save_in_json(results, results_file)
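
Note: the training/evaluation scripts in this example reference an `args` namespace whose attribute names (data_file, task, save_directory, output_dir, retrain, n_epochs, batch_size) appear in the code above. A sketch of the argparse block they imply is below; the flag spellings, defaults and help strings are assumptions.

# Hypothetical argparse setup implied by the args.* attributes used above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--data_file", required=True, help="Pickle produced by the preprocessing step")
parser.add_argument("--task", required=True)
parser.add_argument("--save_directory", required=True, help="Where the model and tokenizer are saved/loaded")
parser.add_argument("--output_dir", required=True, help="Where results.json and plots are written")
parser.add_argument("--retrain", action="store_true", help="Train from 'bert-base-cased' instead of loading")
parser.add_argument("--n_epochs", type=int, default=4)
parser.add_argument("--batch_size", type=int, default=32, help="Effective batch size via gradient accumulation")
args = parser.parse_args()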
Example #8
def train(choice, dirname, window):
    #  --------------------------------------------------------------------------------------------------------------------
    dimensionality = 50  # No need to adjust, unless you want to experiment with custom embeddings
    print("Dimensionality:", dimensionality)
    regex = re.compile(r"[+-.]?\d+[-.,\d+:]*(th|st|nd|rd)?")

    if choice == 'imm':
        base = '_imm'
    elif choice == 'prewin':
        base = '_prewin'
    else:
        raise ValueError("choice must be 'imm' or 'prewin'")
    style = 'test'
    mlmr_dir = dirname
    seq_length = window  # Adjust to 5 for PreWin and 5, 10, 50 for baseline results

    neg = load_from_pickle("{}/wiki_LOCATION_{}{}.pkl".format(
        mlmr_dir, style, base))
    pos = load_from_pickle("{}/wiki_INSTITUTE_{}{}.pkl".format(
        mlmr_dir, style, base))
    if path.exists("{}/wiki_EVENT_{}{}.pkl".format(mlmr_dir, style, base)):
        pos.extend(
            load_from_pickle("{}/wiki_EVENT_{}{}.pkl".format(
                mlmr_dir, style, base)))
    if path.exists("{}/wiki_TEAM_{}{}.pkl".format(mlmr_dir, style, base)):
        pos.extend(
            load_from_pickle("{}/wiki_TEAM_{}{}.pkl".format(
                mlmr_dir, style, base)))
    if path.exists("{}/wiki_ARTIFACT_{}{}.pkl".format(mlmr_dir, style, base)):
        pos.extend(
            load_from_pickle("{}/wiki_ARTIFACT_{}{}.pkl".format(
                mlmr_dir, style, base)))

    print("Sequence Length: 2 times ", seq_length)

    A = []
    dep_labels = {u"<u>"}
    for coll in [neg, pos]:
        for l in coll:
            A.append(l)
            dep_labels.update(set(l[1][-seq_length:] + l[3][:seq_length]))

    random.shuffle(A)

    X_L, D_L, X_R, D_R, Y = [], [], [], [], []
    for a in A:
        X_L.append(a[0][-seq_length:])
        D_L.append(a[1][-seq_length:])
        X_R.append(a[2][:seq_length])
        D_R.append(a[3][:seq_length])
        Y.append(a[4])

    print('No of training examples: ', len(X_L))
    dump_to_pickle("dep_labels.pkl", dep_labels)
    dep_labels = load_from_pickle("dep_labels.pkl")
    #  --------------------------------------------------------------------------------------------------------------------
    vocabulary = {u"<u>", u"0.0"}
    vocab_limit = 100000
    print('Vocabulary Size: ', vocab_limit)
    print("Building sequences...")

    count = 0
    vectors_glove = {u'<u>': np.ones(dimensionality)}
    # Please supply your own embeddings, see README.md for details
    for line in codecs.open("glove.6B.50d.txt", encoding="utf-8"):
        tokens = line.split()
        vocabulary.add(tokens[0])
        vectors_glove[tokens[0]] = [float(x) for x in tokens[1:]]
        count += 1
        if count >= vocab_limit:
            break

    vectors_glove[u"0.0"] = np.zeros(dimensionality)
    word_to_index = dict([(w, i) for i, w in enumerate(vocabulary)])
    dep_to_index = dict([(w, i) for i, w in enumerate(dep_labels)])

    for x_l, x_r, d_l, d_r in zip(X_L, X_R, D_L, D_R):
        for i, w in enumerate(x_l):
            if w != u"0.0":
                w = regex.sub(u"1", w)
            if w in word_to_index:
                x_l[i] = word_to_index[w]
            else:
                x_l[i] = word_to_index[u"<u>"]
        for i, w in enumerate(x_r):
            if w != u"0.0":
                w = regex.sub(u"1", w)
            if w in word_to_index:
                x_r[i] = word_to_index[w]
            else:
                x_r[i] = word_to_index[u"<u>"]
        for i, w in enumerate(d_l):
            arr = np.zeros(len(dep_labels))
            if w in dep_to_index:
                arr[dep_to_index[w]] = 1
            else:
                arr[dep_to_index[u"<u>"]] = 1
            d_l[i] = arr
        for i, w in enumerate(d_r):
            arr = np.zeros(len(dep_labels))
            if w in dep_to_index:
                arr[dep_to_index[w]] = 1
            else:
                arr[dep_to_index[u"<u>"]] = 1
            d_r[i] = arr

    X_L = np.asarray(X_L)
    X_R = np.asarray(X_R)
    D_L = np.asarray(D_L)
    D_R = np.asarray(D_R)
    Y = np.asarray(Y)

    # convert labels to one-hot format
    num_classes = Y.max() + 1
    one_hot = np.zeros((Y.size, num_classes))
    one_hot[np.arange(Y.size), Y] = 1
    Y = one_hot

    weights = np.zeros((len(vocabulary), dimensionality))
    for w in vocabulary:
        if w in vectors_glove:
            weights[word_to_index[w]] = vectors_glove[w]
    # Keep `weights` as the (vocab_size, dimensionality) matrix; Constant() below initialises the embeddings from it directly

    print(u"Done...")
    #  --------------------------------------------------------------------------------------------------------------------
    print(u'Building model...')
    first_input = Input(shape=(seq_length, ))
    a = Embedding(len(vocabulary),
                  dimensionality,
                  input_length=(seq_length, ),
                  embeddings_initializer=Constant(weights))(first_input)
    b = LSTM(units=15)(a)
    first_output = Dropout(0.2)(b)
    model_left = Model(inputs=first_input, outputs=first_output)

    second_input = Input(shape=(seq_length, len(dep_labels)))
    a = TimeDistributed(Dense(units=15))(second_input)
    b = Dropout(0.2)(a)
    second_output = Flatten()(b)
    dep_left = Model(inputs=second_input, outputs=second_output)

    third_input = Input(shape=(seq_length, ))
    a = Embedding(len(vocabulary),
                  dimensionality,
                  input_length=(seq_length, ),
                  embeddings_initializer=Constant(weights))(third_input)
    b = LSTM(units=15, go_backwards=True)(a)
    third_output = Dropout(0.2)(b)
    model_right = Model(inputs=third_input, outputs=third_output)

    fourth_input = Input(shape=(seq_length, len(dep_labels)))
    a = TimeDistributed(Dense(units=15))(fourth_input)
    b = Dropout(0.2)(a)
    fourth_output = Flatten()(b)
    dep_right = Model(inputs=fourth_input, outputs=fourth_output)

    a = concatenate([first_output, second_output, third_output, fourth_output])
    b = Dense(10)(a)
    c = Dense(num_classes, activation='softmax')(b)
    merged_model = Model(
        inputs=[first_input, second_input, third_input, fourth_input],
        outputs=c)
    merged_model.compile(loss='categorical_crossentropy',
                         optimizer='adagrad',
                         metrics=['accuracy'])
    print(u"Done...")
    #  --------------------------------------------------------------------------------------------------------------------
    checkpoint = ModelCheckpoint(filepath="lstm.hdf5", verbose=0)
    merged_model.fit([X_L, D_L, X_R, D_R],
                     Y,
                     batch_size=16,
                     epochs=5,
                     callbacks=[checkpoint],
                     verbose=0)
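
Note: train() writes its weights to lstm.hdf5 through ModelCheckpoint. A minimal sketch of reloading that checkpoint for inference is below, assuming the tf.keras load_model API (adjust the import for standalone Keras) and inputs encoded exactly as in train().

# Hypothetical reload of the checkpoint written by train().
from tensorflow.keras.models import load_model

model = load_model("lstm.hdf5")
# X_L, D_L, X_R, D_R must be index/one-hot encoded exactly as in train()
# probs = model.predict([X_L, D_L, X_R, D_R])
# predicted_class = probs.argmax(axis=-1)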
Example #9
#            collection="test_pd", *args, **kwargs):
#     """ Read a dataset with metadata compute metadata and
#     save to json from a tuple(url,format,id)"""
#
#     d, stringio = response
#     if stringio is not None:
#         extension, mid, url = d['format'], d['id_metadata'], d['url']
#         meta = {}
#         try:
#             if extension == 'csv':
#                 meta = DataExploration(pd.read_csv(stringio)).metadata()
#             if extension == 'html':
#                 meta = DataExploration(pd.read_html(stringio)).metadata()
#             if extension == 'xslx':
#                 meta = DataExploration(pd.read_excel(stringio)).metadata()
#         except Exception as e:
#             logger.exception(e)
#         meta['id_metadata'] = mid
#         meta['url'] = url
#         print(meta)
#         #write_to_json_line(meta, file_path)
#         logger.info('{} : Successful write_to_json_line '.format(url))
#         # db[collection].insert_one(meta)


if __name__ == "__main__":
    # list_d = get_list_csv(DB['datagov'])
    list_d = load_from_pickle('list_urls')
    for d in tqdm.tqdm(list_d):
        get_meta(downloader_s(d), collection="test5")
Example #10
def main():
    # Read all the data instances
    task_instances_dict, tag_statistics, question_keys_and_tags = load_from_pickle(
        args.data_file)
    data, subtasks_list = get_multitask_instances_for_valid_tasks(
        task_instances_dict, tag_statistics)

    if args.retrain:
        logging.info("Creating and training the model from 'bert-base-cased' ")
        # Create the save_directory if not exists
        make_dir_if_not_exists(args.save_directory)

        # Initialize tokenizer and model with pretrained weights
        tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        config = BertConfig.from_pretrained('bert-base-cased')
        config.subtasks = subtasks_list
        # print(config)
        model = MultiTaskBertForCovidEntityClassification.from_pretrained(
            'bert-base-cased', config=config)

        # Add new tokens in tokenizer
        new_special_tokens_dict = {
            "additional_special_tokens": ["<E>", "</E>", "<URL>", "@USER"]
        }
        # new_special_tokens_dict = {"additional_special_tokens": ["<E>", "</E>"]}
        tokenizer.add_special_tokens(new_special_tokens_dict)

        # Add the new embeddings in the weights
        print("Embeddings type:",
              model.bert.embeddings.word_embeddings.weight.data.type())
        print("Embeddings shape:",
              model.bert.embeddings.word_embeddings.weight.data.size())
        embedding_size = model.bert.embeddings.word_embeddings.weight.size(1)
        new_embeddings = torch.FloatTensor(
            len(new_special_tokens_dict["additional_special_tokens"]),
            embedding_size).uniform_(-0.1, 0.1)
        # new_embeddings = torch.FloatTensor(2, embedding_size).uniform_(-0.1, 0.1)
        print("new_embeddings shape:", new_embeddings.size())
        new_embedding_weight = torch.cat(
            (model.bert.embeddings.word_embeddings.weight.data,
             new_embeddings), 0)
        model.bert.embeddings.word_embeddings.weight.data = new_embedding_weight
        print("Embeddings shape:",
              model.bert.embeddings.word_embeddings.weight.data.size())
        # Update model config vocab size
        model.config.vocab_size = model.config.vocab_size + len(
            new_special_tokens_dict["additional_special_tokens"])
    else:
        # Load the tokenizer and model from the save_directory
        tokenizer = BertTokenizer.from_pretrained(args.save_directory)
        model = MultiTaskBertForCovidEntityClassification.from_pretrained(
            args.save_directory)
        # print(model.state_dict().keys())
        # TODO save and load the subtask classifier weights separately
        # Load from individual state dicts
        for subtask in model.subtasks:
            model.classifiers[subtask].load_state_dict(
                torch.load(
                    os.path.join(args.save_directory,
                                 f"{subtask}_classifier.bin")))
        # print(model.config)
        # exit()
    model.to(device)
    # Explicitly move the classifiers to device
    for subtask, classifier in model.classifiers.items():
        classifier.to(device)
    entity_start_token_id = tokenizer.convert_tokens_to_ids(["<E>"])[0]

    logging.info(
        f"Task dataset for task: {args.task} loaded from {args.data_file}.")

    model_config = dict()
    results = dict()

    test_data = data
    logging.info("Test Data:")
    total_test_size, pos_subtasks_test_size, neg_subtasks_test_size = log_multitask_data_statistics(
        test_data, model.subtasks)
    logging.info("\n")
    # model_config["train_data"] = {"size":total_train_size, "pos":pos_subtasks_train_size, "neg":neg_subtasks_train_size}
    # model_config["dev_data"] = {"size":total_dev_size, "pos":pos_subtasks_dev_size, "neg":neg_subtasks_dev_size}
    model_config["test_data"] = {
        "size": total_test_size,
        "pos": pos_subtasks_test_size,
        "neg": neg_subtasks_test_size
    }

    # Extract subtasks data for dev and test
    #dev_subtasks_data = split_data_based_on_subtasks(dev_data, model.subtasks)
    test_subtasks_data = split_data_based_on_subtasks(test_data,
                                                      model.subtasks)

    # Load the instances into pytorch dataset
    # train_dataset = COVID19TaskDataset(train_data)
    # dev_dataset = COVID19TaskDataset(dev_data)
    test_dataset = COVID19TaskDataset(test_data)
    logging.info("Loaded the datasets into Pytorch datasets")

    tokenize_collator = TokenizeCollator(tokenizer, model.subtasks,
                                         entity_start_token_id)
    # train_dataloader = DataLoader(train_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=True, num_workers=0, collate_fn=tokenize_collator)
    # dev_dataloader = DataLoader(dev_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=tokenize_collator)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=POSSIBLE_BATCH_SIZE,
                                 shuffle=False,
                                 num_workers=0,
                                 collate_fn=tokenize_collator)
    logging.info("Created train and test dataloaders with batch aggregation")

    # Save the model name in the model_config file
    model_config["model"] = "MultiTaskBertForCovidEntityClassification"
    model_config["epochs"] = args.n_epochs

    # Candidate thresholds; in this evaluation-only script the dev-set search is skipped and the 0.5 default is used
    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    test_predicted_labels, test_prediction_scores, test_gold_labels = make_predictions_on_dataset(
        test_dataloader, model, device, args.task, True)
    #dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset(dev_dataloader, model, device, args.task + "_dev", True)

    best_test_thresholds = {subtask: 0.5 for subtask in model.subtasks}
    best_dev_thresholds = {subtask: 0.5 for subtask in model.subtasks}
    best_test_F1s = {subtask: 0.0 for subtask in model.subtasks}
    best_dev_F1s = {subtask: 0.0 for subtask in model.subtasks}
    test_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks}
    dev_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks}

    # Evaluate on Test
    logging.info("Testing on test dataset")
    # test_predicted_labels, test_prediction_scores, test_gold_labels = make_predictions_on_dataset(test_dataloader, model, device, args.task)

    predicted_labels, prediction_scores, gold_labels = make_predictions_on_dataset(
        test_dataloader, model, device, args.task)

    # Test
    for subtask in model.subtasks:
        logging.info(f"Testing the trained classifier on subtask: {subtask}")
        # print(len(test_dataloader))
        # print(len(prediction_scores[subtask]))
        # print(len(test_subtasks_data[subtask]))
        results[subtask] = dict()
        cm = metrics.confusion_matrix(gold_labels[subtask],
                                      predicted_labels[subtask])
        classification_report = metrics.classification_report(
            gold_labels[subtask], predicted_labels[subtask], output_dict=True)
        logging.info(cm)
        logging.info(
            metrics.classification_report(gold_labels[subtask],
                                          predicted_labels[subtask]))
        results[subtask]["CM"] = cm.tolist(
        )  # Storing it as list of lists instead of numpy.ndarray
        results[subtask]["Classification Report"] = classification_report

        # SQuAD style EM and F1 evaluation for all test cases and for positive test cases (i.e. for cases where annotators had a gold annotation)
        EM_score, F1_score, total = get_raw_scores(test_subtasks_data[subtask],
                                                   prediction_scores[subtask])
        logging.info("Word overlap based SQuAD evaluation style metrics:")
        logging.info(f"Total number of cases: {total}")
        logging.info(f"EM_score: {EM_score}")
        logging.info(f"F1_score: {F1_score}")
        results[subtask]["SQuAD_EM"] = EM_score
        results[subtask]["SQuAD_F1"] = F1_score
        results[subtask]["SQuAD_total"] = total
        pos_EM_score, pos_F1_score, pos_total = get_raw_scores(
            test_subtasks_data[subtask],
            prediction_scores[subtask],
            positive_only=True)
        logging.info(f"Total number of Positive cases: {pos_total}")
        logging.info(f"Pos. EM_score: {pos_EM_score}")
        logging.info(f"Pos. F1_score: {pos_F1_score}")
        results[subtask]["SQuAD_Pos. EM"] = pos_EM_score
        results[subtask]["SQuAD_Pos. F1"] = pos_F1_score
        results[subtask]["SQuAD_Pos. EM_F1_total"] = pos_total

        # New evaluation suggested by Alan
        F1, P, R, TP, FP, FN = get_TP_FP_FN(
            test_subtasks_data[subtask],
            prediction_scores[subtask],
            THRESHOLD=best_dev_thresholds[subtask])
        logging.info("New evaluation scores:")
        logging.info(f"F1: {F1}")
        logging.info(f"Precision: {P}")
        logging.info(f"Recall: {R}")
        logging.info(f"True Positive: {TP}")
        logging.info(f"False Positive: {FP}")
        logging.info(f"False Negative: {FN}")
        results[subtask]["F1"] = F1
        results[subtask]["P"] = P
        results[subtask]["R"] = R
        results[subtask]["TP"] = TP
        results[subtask]["FP"] = FP
        results[subtask]["FN"] = FN
        N = TP + FN
        results[subtask]["N"] = N

        # # Top predictions in the Test case
        # prediction_scores[subtask] = np.array(prediction_scores[subtask])
        # sorted_prediction_ids = np.argsort(-prediction_scores[subtask])
        # K = 200
        # logging.info("Top {} predictions:".format(K))
        # logging.info("\t".join(["Tweet", "BERT model input", "candidate chunk", "prediction score", "predicted label", "gold label", "gold chunks"]))
        # for i in range(K):
        # 	instance_id = sorted_prediction_ids[i]
        # 	# text :: candidate_chunk :: candidate_chunk_id :: chunk_start_text_id :: chunk_end_text_id :: tokenized_tweet :: tokenized_tweet_with_masked_q_token :: tagged_chunks :: question_label
        # 	tweet = test_subtasks_data[subtask][instance_id][0].replace("\n", " ")
        # 	chunk = test_subtasks_data[subtask][instance_id][1]
        # 	tokenized_tweet_with_masked_chunk = test_subtasks_data[subtask][instance_id][6]
        # 	if chunk in ["AUTHOR OF THE TWEET", "NEAR AUTHOR OF THE TWEET"]:
        # 		# First element of the text will be considered as AUTHOR OF THE TWEET or NEAR AUTHOR OF THE TWEET
        # 		bert_model_input_text = tokenized_tweet_with_masked_chunk.replace(Q_TOKEN, "<E> </E>")
        # 		# print(tokenized_tweet_with_masked_chunk)
        # 		# print(bert_model_input_text)
        # 		# exit()
        # 	else:
        # 		bert_model_input_text = tokenized_tweet_with_masked_chunk.replace(Q_TOKEN, "<E> " + chunk + " </E>")
        # 	list_to_print = [tweet, bert_model_input_text, chunk, str(prediction_scores[subtask][instance_id]), str(predicted_labels[subtask][instance_id]), str(test_subtasks_data[subtask][instance_id][-1]), str(test_subtasks_data[subtask][instance_id][-2])]
        # 	logging.info("\t".join(list_to_print))

    # Save model_config and results
    model_config_file = os.path.join(args.output_dir, "model_config.json")
    results_file = os.path.join(args.output_dir, "results.json")
    logging.info(f"Saving model config at {model_config_file}")
    save_in_json(model_config, model_config_file)
    logging.info(f"Saving results at {results_file}")
    save_in_json(results, results_file)
def main():
    task_instances_dict, tag_statistics, question_keys_and_tags = load_from_pickle(
        args.data_file)
    data = extract_instances_for_current_subtask(task_instances_dict,
                                                 args.sub_task)
    logging.info(
        f"Task dataset for task: {args.task} loaded from {args.data_file}.")

    model_config = dict()
    results = dict()

    # Split the data into train, dev and test and shuffle the train segment
    train_data, dev_data, test_data = split_instances_in_train_dev_test(data)
    random.shuffle(train_data)  # shuffle happens in-place
    logging.info("Train Data:")
    total_train_size, pos_train_size, neg_train_size = log_data_statistics(
        train_data)
    logging.info("Dev Data:")
    total_dev_size, pos_dev_size, neg_dev_size = log_data_statistics(dev_data)
    logging.info("Test Data:")
    total_test_size, pos_test_size, neg_test_size = log_data_statistics(
        test_data)
    logging.info("\n")
    model_config["train_data"] = {
        "size": total_train_size,
        "pos": pos_train_size,
        "neg": neg_train_size
    }
    model_config["dev_data"] = {
        "size": total_dev_size,
        "pos": pos_dev_size,
        "neg": neg_dev_size
    }
    model_config["test_data"] = {
        "size": total_test_size,
        "pos": pos_test_size,
        "neg": neg_test_size
    }

    # Extract n-gram features from the train data
    # Returned ngrams will be dict of dict
    # TODO: update the feature extractor
    feature2i, i2feature = create_ngram_features_from(train_data)
    logging.info(
        f"Total number of features extracted from train = {len(feature2i)}, {len(i2feature)}"
    )
    model_config["features"] = {"size": len(feature2i)}

    # Extract Feature vectors and labels from train and test data
    train_X, train_Y = convert_data_to_feature_vector_and_labels(
        train_data, feature2i)
    dev_X, dev_Y = convert_data_to_feature_vector_and_labels(
        dev_data, feature2i)
    test_X, test_Y = convert_data_to_feature_vector_and_labels(
        test_data, feature2i)
    logging.info(
        f"Train Data Features = {train_X.shape} and Labels = {len(train_Y)}")
    logging.info(
        f"Dev Data Features = {dev_X.shape} and Labels = {len(dev_Y)}")
    logging.info(
        f"Test Data Features = {test_X.shape} and Labels = {len(test_Y)}")
    model_config["train_data"]["features_shape"] = train_X.shape
    model_config["train_data"]["labels_shape"] = len(train_Y)
    model_config["dev_data"]["features_shape"] = dev_X.shape
    model_config["dev_data"]["labels_shape"] = len(dev_Y)
    model_config["test_data"]["features_shape"] = test_X.shape
    model_config["test_data"]["labels_shape"] = len(test_Y)

    # Train logistic regression classifier
    logging.info("Training the Logistic Regression classifier")
    lr = LogisticRegression(solver='lbfgs')
    lr.fit(train_X, train_Y)
    model_config["model"] = "LogisticRegression(solver='lbfgs')"

    # Find best threshold based on dev set performance
    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    dev_prediction_probs = lr.predict_proba(dev_X)[:, 1]
    dev_t_F1_P_Rs = list()
    best_threshold_based_on_F1 = 0.5
    best_dev_F1 = 0.0
    for t in thresholds:
        dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN(
            dev_data, dev_prediction_probs, THRESHOLD=t)
        dev_t_F1_P_Rs.append(
            (t, dev_F1, dev_P, dev_R, dev_TP + dev_FN, dev_TP, dev_FP, dev_FN))
        if dev_F1 > best_dev_F1:
            best_threshold_based_on_F1 = t
            best_dev_F1 = dev_F1
    log_list(dev_t_F1_P_Rs)
    logging.info(
        f"Best Threshold: {best_threshold_based_on_F1}\t Best dev F1: {best_dev_F1}"
    )
    # Save the best dev threshold and dev_F1 in results dict
    results["best_dev_threshold"] = best_threshold_based_on_F1
    results["best_dev_F1"] = best_dev_F1
    results["dev_t_F1_P_Rs"] = dev_t_F1_P_Rs
    # y_pred = (clf.predict_proba(X_test)[:,1] >= 0.3).astype(bool)

    # Test
    logging.info("Testing the trained classifier")
    predictions = lr.predict(test_X)
    probs = lr.predict_proba(test_X)
    test_Y_prediction_probs = probs[:, 1]
    cm = metrics.confusion_matrix(test_Y, predictions)
    classification_report = metrics.classification_report(test_Y,
                                                          predictions,
                                                          output_dict=True)
    logging.info(cm)
    logging.info(metrics.classification_report(test_Y, predictions))
    results["CM"] = cm.tolist(
    )  # Storing it as list of lists instead of numpy.ndarray
    results["Classification Report"] = classification_report

    # SQuAD style EM and F1 evaluation for all test cases and for positive test cases (i.e. for cases where annotators had a gold annotation)
    EM_score, F1_score, total = get_raw_scores(test_data,
                                               test_Y_prediction_probs)
    logging.info("Word overlap based SQuAD evaluation style metrics:")
    logging.info(f"Total number of cases: {total}")
    logging.info(f"EM_score: {EM_score}")
    logging.info(f"F1_score: {F1_score}")
    results["SQuAD_EM"] = EM_score
    results["SQuAD_F1"] = F1_score
    results["SQuAD_total"] = total
    pos_EM_score, pos_F1_score, pos_total = get_raw_scores(
        test_data, test_Y_prediction_probs, positive_only=True)
    logging.info(f"Total number of Positive cases: {pos_total}")
    logging.info(f"Pos. EM_score: {pos_EM_score}")
    logging.info(f"Pos. F1_score: {pos_F1_score}")
    results["SQuAD_Pos. EM"] = pos_EM_score
    results["SQuAD_Pos. F1"] = pos_F1_score
    results["SQuAD_Pos. EM_F1_total"] = pos_total

    # New evaluation suggested by Alan
    F1, P, R, TP, FP, FN = get_TP_FP_FN(test_data,
                                        test_Y_prediction_probs,
                                        THRESHOLD=best_threshold_based_on_F1)
    logging.info("New evaluation scores:")
    logging.info(f"F1: {F1}")
    logging.info(f"Precision: {P}")
    logging.info(f"Recall: {R}")
    logging.info(f"True Positive: {TP}")
    logging.info(f"False Positive: {FP}")
    logging.info(f"False Negative: {FN}")
    results["F1"] = F1
    results["P"] = P
    results["R"] = R
    results["TP"] = TP
    results["FP"] = FP
    results["FN"] = FN
    N = TP + FN
    results["N"] = N

    # Top predictions in the Test case
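    # Sort test instances by predicted positive-class probability, highest first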
    sorted_prediction_ids = np.argsort(-test_Y_prediction_probs)
    K = 30
    logging.info("Top {} predictions:".format(K))
    for i in range(K):
        instance_id = sorted_prediction_ids[i]
        # text :: candidate_chunk :: candidate_chunk_id :: chunk_start_text_id :: chunk_end_text_id :: tokenized_tweet :: tokenized_tweet_with_masked_q_token :: tagged_chunks :: question_label
        list_to_print = [
            test_data[instance_id][0], test_data[instance_id][6],
            test_data[instance_id][1],
            str(test_Y_prediction_probs[instance_id]),
            str(test_Y[instance_id]),
            str(test_data[instance_id][-1]),
            str(test_data[instance_id][-2])
        ]
        logging.info("\t".join(list_to_print))

    # Top feature analysis
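    # For a binary LogisticRegression, lr.coef_[0] holds one weight per n-gram feature;
    # the most positive weights are the strongest indicators of the positive class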
    coefs = lr.coef_[0]
    K = 10
    sorted_feature_ids = np.argsort(-coefs)
    logging.info("Top {} features:".format(K))
    for i in range(K):
        feature_id = sorted_feature_ids[i]
        logging.info(f"{i2feature[feature_id]}\t{coefs[feature_id]}")

    # Plot the precision recall curve
    save_figure_file = os.path.join(args.output_dir,
                                    "Precision Recall Curve.png")
    logging.info(f"Saving precision recall curve at {save_figure_file}")
    disp = plot_precision_recall_curve(lr, test_X, test_Y)
    disp.ax_.set_title('2-class Precision-Recall curve')
    disp.ax_.figure.savefig(save_figure_file)

    # Save the model and features in pickle file
    model_and_features_save_file = os.path.join(args.output_dir,
                                                "model_and_features.pkl")
    logging.info(
        f"Saving LR model and features at {model_and_features_save_file}")
    save_in_pickle((lr, feature2i, i2feature), model_and_features_save_file)

    # Save model_config and results
    model_config_file = os.path.join(args.output_dir, "model_config.json")
    results_file = os.path.join(args.output_dir, "results.json")
    logging.info(f"Saving model config at {model_config_file}")
    save_in_json(model_config, model_config_file)
    logging.info(f"Saving results at {results_file}")
    save_in_json(results, results_file)
예제 #12
0
def train(choice, dirname, window):
    #  --------------------------------------------------------------------------------------------------------------------
    dimensionality = BERT_DIMENSIONS  # No need to adjust, unless you want to experiment with custom embeddings
    print("Dimensionality:", dimensionality)

    if choice == 'imm':
        base = '_imm'
    elif choice == 'prewin':
        base = '_prewin'
    else:
        raise ValueError("choice must be 'imm' or 'prewin', got: {}".format(choice))
    style = 'train'
    mlmr_dir = dirname
    seq_length = window  # Adjust to 5 for PreWin and 5, 10, 50 for baseline results

    neg = load_from_hdf5("{}/wiki_LOCATION_{}{}.hdf5".format(
        mlmr_dir, style, base))
    pos = load_from_hdf5("{}/wiki_INSTITUTE_{}{}.hdf5".format(
        mlmr_dir, style, base))
    if path.exists("{}/wiki_EVENT_{}{}.hdf5".format(mlmr_dir, style, base)):
        pos.extend(
            load_from_hdf5("{}/wiki_EVENT_{}{}.hdf5".format(
                mlmr_dir, style, base)))
    if path.exists("{}/wiki_TEAM_{}{}.hdf5".format(mlmr_dir, style, base)):
        pos.extend(
            load_from_hdf5("{}/wiki_TEAM_{}{}.hdf5".format(
                mlmr_dir, style, base)))
    if path.exists("{}/wiki_ARTIFACT_{}{}.hdf5".format(mlmr_dir, style, base)):
        pos.extend(
            load_from_hdf5("{}/wiki_ARTIFACT_{}{}.hdf5".format(
                mlmr_dir, style, base)))

    print("Sequence Length: 2 times ", seq_length)

    A = []
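    # Pool negative and positive examples and collect every dependency label seen in the
    # context windows; <u> serves as the fallback for labels unseen at training time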
    dep_labels = {u"<u>"}
    for coll in [neg, pos]:
        for l in coll:
            A.append(l)
            dep_labels.update(set(l[0][-seq_length:] + l[2][:seq_length]))

    random.shuffle(A)

    D_L, E_L, D_R, E_R, Y = [], [], [], [], []
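    # a[0]/a[2] hold the left/right dependency-label windows, a[1]/a[3] the matching
    # embedding windows (width = BERT_DIMENSIONS), and a[4] the class label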
    for a in A:
        D_L.append(a[0][-seq_length:])
        E_L.append(a[1][-seq_length:])
        D_R.append(a[2][:seq_length])
        E_R.append(a[3][:seq_length])
        Y.append(a[4])

    print('No of training examples: ', len(D_L))
    dump_to_pickle("dep_labels.pkl", dep_labels)
    dep_labels = load_from_pickle("dep_labels.pkl")
    #  --------------------------------------------------------------------------------------------------------------------
    print("Building sequences...")

    # Sort the labels so the index assignment is deterministic and identical across the
    # train() and test() processes (iterating a raw set of strings is not stable across runs)
    dep_to_index = dict([(w, i) for i, w in enumerate(sorted(dep_labels))])

    for d_l, d_r in zip(D_L, D_R):
        for i, w in enumerate(d_l):
            arr = np.zeros(len(dep_labels))
            if w in dep_to_index:
                arr[dep_to_index[w]] = 1
            else:
                arr[dep_to_index[u"<u>"]] = 1
            d_l[i] = arr
        for i, w in enumerate(d_r):
            arr = np.zeros(len(dep_labels))
            if w in dep_to_index:
                arr[dep_to_index[w]] = 1
            else:
                arr[dep_to_index[u"<u>"]] = 1
            d_r[i] = arr

    D_L = np.asarray(D_L)
    D_R = np.asarray(D_R)
    E_L = torch.stack(E_L).detach()
    E_R = torch.stack(E_R).detach()
    Y = np.asarray(Y)

    # convert labels to one-hot format
    num_classes = Y.max() + 1
    one_hot = np.zeros((Y.size, num_classes))
    one_hot[np.arange(Y.size), Y] = 1
    Y = one_hot

    print(u"Done...")
    #  --------------------------------------------------------------------------------------------------------------------
    print(u'Building model...')
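    # Four branches: an LSTM over the left embeddings, a time-distributed dense layer over the
    # left dependency labels, a backwards LSTM over the right embeddings, and a dense layer over
    # the right dependency labels; their outputs are concatenated before the softmax classifier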
    first_input = Input(shape=(seq_length, dimensionality))
    a = LSTM(units=15)(first_input)
    first_output = Dropout(0.2)(a)
    model_left = Model(inputs=first_input, outputs=first_output)

    second_input = Input(shape=(seq_length, len(dep_labels)))
    a = TimeDistributed(Dense(units=15))(second_input)
    b = Dropout(0.2)(a)
    second_output = Flatten()(b)
    dep_left = Model(inputs=second_input, outputs=second_output)

    third_input = Input(shape=(seq_length, dimensionality))
    a = LSTM(units=15, go_backwards=True)(third_input)
    third_output = Dropout(0.2)(a)
    model_right = Model(inputs=third_input, outputs=third_output)

    fourth_input = Input(shape=(seq_length, len(dep_labels)))
    a = TimeDistributed(Dense(units=15))(fourth_input)
    b = Dropout(0.2)(a)
    fourth_output = Flatten()(b)
    dep_right = Model(inputs=fourth_input, outputs=fourth_output)

    a = concatenate([first_output, second_output, third_output, fourth_output])
    b = Dense(10)(a)
    c = Dense(num_classes, activation='softmax')(b)
    merged_model = Model(
        inputs=[first_input, second_input, third_input, fourth_input],
        outputs=c)
    merged_model.compile(loss='categorical_crossentropy',
                         optimizer='adagrad',
                         metrics=['accuracy'])
    print(u"Done...")
    #  --------------------------------------------------------------------------------------------------------------------
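    # Save the weights to lstm.hdf5 after every epoch; test() reloads this checkpoint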
    checkpoint = ModelCheckpoint(filepath="lstm.hdf5", verbose=0)
    merged_model.fit([E_L, D_L, E_R, D_R],
                     Y,
                     batch_size=16,
                     epochs=5,
                     callbacks=[checkpoint],
                     verbose=0)
예제 #13
0
def test(choice, dirname, window):
    #  --------------------------------------------------------------------------------------------------------------------
    dimensionality = BERT_DIMENSIONS  # No need to adjust, unless you want to experiment with custom embeddings
    print("Dimensionality:", dimensionality)

    if choice == 'imm':
        base = '_imm'
    elif choice == 'prewin':
        base = '_prewin'
    else:
        raise ValueError("choice must be 'imm' or 'prewin', got: {}".format(choice))
    style = 'test'
    mlmr_dir = dirname
    seq_length = window  # Adjust to 5 for PreWin and 5, 10, 50 for baseline results

    neg = load_from_hdf5("{}/wiki_LOCATION_{}{}.hdf5".format(
        mlmr_dir, style, base))
    pos = load_from_hdf5("{}/wiki_INSTITUTE_{}{}.hdf5".format(
        mlmr_dir, style, base))
    if path.exists("{}/wiki_EVENT_{}{}.hdf5".format(mlmr_dir, style, base)):
        pos.extend(
            load_from_hdf5("{}/wiki_EVENT_{}{}.hdf5".format(
                mlmr_dir, style, base)))
    if path.exists("{}/wiki_TEAM_{}{}.hdf5".format(mlmr_dir, style, base)):
        pos.extend(
            load_from_hdf5("{}/wiki_TEAM_{}{}.hdf5".format(
                mlmr_dir, style, base)))
    if path.exists("{}/wiki_ARTIFACT_{}{}.hdf5".format(mlmr_dir, style, base)):
        pos.extend(
            load_from_hdf5("{}/wiki_ARTIFACT_{}{}.hdf5".format(
                mlmr_dir, style, base)))

    print("Sequence Length: 2 times ", seq_length)

    D_L, E_L, D_R, E_R, Y = [], [], [], [], []
    for a in copy.deepcopy(neg + pos):
        D_L.append(a[0][-seq_length:])
        E_L.append(a[1][-seq_length:])
        D_R.append(a[2][:seq_length])
        E_R.append(a[3][:seq_length])
        Y.append(a[4])

    print('No of test examples: ', len(D_L))
    dep_labels = load_from_pickle("dep_labels.pkl")
    #  --------------------------------------------------------------------------------------------------------------------
    print("Building sequences...")

    # Use the same sorted ordering as train() so the one-hot indices line up with the trained model
    dep_to_index = dict([(w, i) for i, w in enumerate(sorted(dep_labels))])

    for d_l, d_r in zip(D_L, D_R):
        for i, w in enumerate(d_l):
            arr = np.zeros(len(dep_labels))
            if w in dep_to_index:
                arr[dep_to_index[w]] = 1
            else:
                arr[dep_to_index[u"<u>"]] = 1
            d_l[i] = arr
        for i, w in enumerate(d_r):
            arr = np.zeros(len(dep_labels))
            if w in dep_to_index:
                arr[dep_to_index[w]] = 1
            else:
                arr[dep_to_index[u"<u>"]] = 1
            d_r[i] = arr

    D_L = np.asarray(D_L)
    D_R = np.asarray(D_R)
    E_L = torch.stack(E_L).detach()
    E_R = torch.stack(E_R).detach()
    Y = np.asarray(Y)

    # convert labels to one-hot format
    num_classes = Y.max() + 1
    one_hot = np.zeros((Y.size, num_classes))
    one_hot[np.arange(Y.size), Y] = 1
    Y = one_hot

    print(u"Done...")
    #  --------------------------------------------------------------------------------------------------------------------
    print(u'Building model...')
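    # Rebuild the same four-branch architecture as train() so the lstm.hdf5 weights can be loaded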
    first_input = Input(shape=(seq_length, dimensionality))
    a = LSTM(units=15)(first_input)
    first_output = Dropout(0.2)(a)
    model_left = Model(inputs=first_input, outputs=first_output)

    second_input = Input(shape=(seq_length, len(dep_labels)))
    a = TimeDistributed(Dense(units=15))(second_input)
    b = Dropout(0.2)(a)
    second_output = Flatten()(b)
    dep_left = Model(inputs=second_input, outputs=second_output)

    third_input = Input(shape=(seq_length, dimensionality))
    a = LSTM(units=15, go_backwards=True)(third_input)
    third_output = Dropout(0.2)(a)
    model_right = Model(inputs=third_input, outputs=third_output)

    fourth_input = Input(shape=(seq_length, len(dep_labels)))
    a = TimeDistributed(Dense(units=15))(fourth_input)
    b = Dropout(0.2)(a)
    fourth_output = Flatten()(b)
    dep_right = Model(inputs=fourth_input, outputs=fourth_output)

    a = concatenate([first_output, second_output, third_output, fourth_output])
    b = Dense(10)(a)
    c = Dense(num_classes, activation='softmax')(b)
    merged_model = Model(
        inputs=[first_input, second_input, third_input, fourth_input],
        outputs=c)
    merged_model.load_weights("lstm.hdf5")
    merged_model.compile(loss='categorical_crossentropy',
                         optimizer='adagrad',
                         metrics=['accuracy'])
    print(u"Done...")
    #  --------------------------------------------------------------------------------------------------------------------
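    # Predict on the whole test set in one batch, then convert the argmax class indices
    # back to one-hot so they match the format of the gold labels Y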
    predictions = merged_model.predict_on_batch([E_L, D_L, E_R, D_R])
    y_pred = predictions.argmax(axis=1)
    one_hot = np.zeros((y_pred.size, num_classes))
    one_hot[np.arange(y_pred.size), y_pred] = 1
    y_pred = one_hot

    print('Macro-averaged metrics: ',
          precision_recall_fscore_support(Y, y_pred, average='macro'))
    print('Micro-averaged metrics: ',
          precision_recall_fscore_support(Y, y_pred, average='micro'))
예제 #14
0
def load_loss_gradients(n_samples, filename, savedir, relpath=DATA):
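    # Reconstruct the pickle path the loss gradients were saved under for this sample count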
    path = relpath + savedir + filename + "_samp=" + str(
        n_samples) + "_lossGrads.pkl"
    return load_from_pickle(path=path)
예제 #15
0
def main():
    # Read all the data instances
    task_instances_dict, tag_statistics, question_keys_and_tags = load_from_pickle(
        args.data_file)
    data, subtasks_list = get_multitask_instances_for_valid_tasks(
        task_instances_dict, tag_statistics)
    data = add_marker_for_loss_ignore(
        data, 1.0 if args.loss_for_no_consensus else 0.0)

    if args.retrain:
        if args.large_bert:
            model_name = "bert-large-cased"
        elif args.covid_bert:
            model_name = "digitalepidemiologylab/covid-twitter-bert"
        else:
            model_name = "bert-base-cased"

        logging.info("Creating and training the model from '" + model_name +
                     "'")
        # Create the save_directory if not exists
        make_dir_if_not_exists(args.save_directory)

        # Initialize tokenizer and model with pretrained weights
        tokenizer = BertTokenizer.from_pretrained(model_name)
        config = BertConfig.from_pretrained(model_name)
        config.subtasks = subtasks_list
        model = MultiTaskBertForCovidEntityClassification.from_pretrained(
            model_name, config=config)

        # Add new tokens in tokenizer
        new_special_tokens_dict = {
            "additional_special_tokens": ["<E>", "</E>", "<URL>", "@USER"]
        }
        tokenizer.add_special_tokens(new_special_tokens_dict)

        # Add the new embeddings in the weights
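        # The pretrained embedding matrix has no rows for the newly added special tokens,
        # so randomly initialised rows are appended for them below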
        print("Embeddings type:",
              model.bert.embeddings.word_embeddings.weight.data.type())
        print("Embeddings shape:",
              model.bert.embeddings.word_embeddings.weight.data.size())
        embedding_size = model.bert.embeddings.word_embeddings.weight.size(1)
        new_embeddings = torch.FloatTensor(
            len(new_special_tokens_dict["additional_special_tokens"]),
            embedding_size).uniform_(-0.1, 0.1)
        # new_embeddings = torch.FloatTensor(2, embedding_size).uniform_(-0.1, 0.1)
        print("new_embeddings shape:", new_embeddings.size())
        new_embedding_weight = torch.cat(
            (model.bert.embeddings.word_embeddings.weight.data,
             new_embeddings), 0)
        model.bert.embeddings.word_embeddings.weight.data = new_embedding_weight
        print("Embeddings shape:",
              model.bert.embeddings.word_embeddings.weight.data.size())
        # Update model config vocab size
        model.config.vocab_size = model.config.vocab_size + len(
            new_special_tokens_dict["additional_special_tokens"])
    else:
        # Load the tokenizer and model from the save_directory
        tokenizer = BertTokenizer.from_pretrained(args.save_directory)
        model = MultiTaskBertForCovidEntityClassification.from_pretrained(
            args.save_directory)
        # Load from individual state dicts
        for subtask in model.subtasks:
            model.classifiers[subtask].load_state_dict(
                torch.load(
                    os.path.join(args.save_directory,
                                 f"{subtask}_classifier.bin")))
    model.to(device)
    if args.wandb:
        wandb.watch(model)

    # Explicitly move the classifiers to device
    for subtask, classifier in model.classifiers.items():
        classifier.to(device)
    for subtask, classifier in model.context_vectors.items():
        classifier.to(device)

    entity_start_token_id = tokenizer.convert_tokens_to_ids(["<E>"])[0]
    entity_end_token_id = tokenizer.convert_tokens_to_ids(["</E>"])[0]

    logging.info(
        f"Task dataset for task: {args.task} loaded from {args.data_file}.")

    model_config = dict()
    results = dict()

    # Split the data into train, dev and test and shuffle the train segment
    train_data, dev_data = split_multitask_instances_in_train_dev(data)
    random.shuffle(train_data)  # shuffle happens in-place
    logging.info("Train Data:")
    total_train_size, pos_subtasks_train_size, neg_subtasks_train_size = log_multitask_data_statistics(
        train_data, model.subtasks)
    logging.info("Dev Data:")
    total_dev_size, pos_subtasks_dev_size, neg_subtasks_dev_size = log_multitask_data_statistics(
        dev_data, model.subtasks)
    #logging.info("Test Data:")
    #total_test_size, pos_subtasks_test_size, neg_subtasks_test_size = log_multitask_data_statistics(test_data, model.subtasks)
    logging.info("\n")
    model_config["train_data"] = {
        "size": total_train_size,
        "pos": pos_subtasks_train_size,
        "neg": neg_subtasks_train_size
    }
    model_config["dev_data"] = {
        "size": total_dev_size,
        "pos": pos_subtasks_dev_size,
        "neg": neg_subtasks_dev_size
    }
    #model_config["test_data"] = {"size":total_test_size, "pos":pos_subtasks_test_size, "neg":neg_subtasks_test_size}

    # Extract subtasks data for dev and test
    train_subtasks_data = split_data_based_on_subtasks(train_data,
                                                       model.subtasks)
    dev_subtasks_data = split_data_based_on_subtasks(dev_data, model.subtasks)
    #test_subtasks_data = split_data_based_on_subtasks(test_data, model.subtasks)

    # Load the instances into pytorch dataset
    train_dataset = COVID19TaskDataset(train_data)
    dev_dataset = COVID19TaskDataset(dev_data)
    #test_dataset = COVID19TaskDataset(test_data)
    logging.info("Loaded the datasets into Pytorch datasets")

    tokenize_collator = TokenizeCollator(tokenizer, model.subtasks,
                                         entity_start_token_id,
                                         entity_end_token_id)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=POSSIBLE_BATCH_SIZE,
                                  shuffle=True,
                                  num_workers=0,
                                  collate_fn=tokenize_collator)
    dev_dataloader = DataLoader(dev_dataset,
                                batch_size=POSSIBLE_BATCH_SIZE,
                                shuffle=False,
                                num_workers=0,
                                collate_fn=tokenize_collator)
    #test_dataloader = DataLoader(test_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=tokenize_collator)
    logging.info("Created train and dev dataloaders with batch aggregation")

    # Only retrain if needed
    if args.retrain:
        optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
        logging.info("Created model optimizer")
        #if args.sentence_level_classify:
        #    args.n_epochs += 2
        epochs = args.n_epochs

        # Total number of training steps is [number of batches] x [number of epochs].
        total_steps = len(train_dataloader) * epochs

        # Create the learning rate scheduler.
        # NOTE: num_warmup_steps = 0 is the Default value in run_glue.py
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=total_steps)
        # We'll store a number of quantities such as training and validation loss, validation accuracy, and timings.
        training_stats = []
        print("\n\n\n ====== Training for task", args.task,
              "=============\n\n\n")
        logging.info(f"Initiating training loop for {args.n_epochs} epochs...")
        print(model.state_dict().keys())

        total_start_time = time.time()

        # Number of mini-batches to accumulate per optimizer step
        # (assumes args.batch_size is a multiple of POSSIBLE_BATCH_SIZE)
        accumulation_steps = args.batch_size // POSSIBLE_BATCH_SIZE

        # Dev validation trajectory
        epoch_train_loss = list()
        train_subtasks_validation_statistics = {
            subtask: list()
            for subtask in model.subtasks
        }
        dev_subtasks_validation_statistics = {
            subtask: list()
            for subtask in model.subtasks
        }
        best_dev_F1 = 0
        for epoch in range(epochs):

            logging.info(f"Initiating Epoch {epoch+1}:")

            # Reset the total loss for each epoch.
            total_train_loss = 0
            train_loss_trajectory = list()

            # Reset timer for each epoch
            start_time = time.time()
            model.train()

            dev_log_frequency = 5
            n_steps = len(train_dataloader)
            dev_steps = int(n_steps / dev_log_frequency)
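            # dev_steps is computed but not used below; validation runs once at the end of each epoch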
            for step, batch in enumerate(train_dataloader):
                # Upload labels of each subtask to device
                for subtask in model.subtasks:
                    subtask_labels = batch["gold_labels"][subtask]
                    subtask_labels = subtask_labels.to(device)
                    batch["gold_labels"][subtask] = subtask_labels
                    batch["label_ignore_loss"][subtask] = batch[
                        "label_ignore_loss"][subtask].to(device)

                # Forward
                input_dict = {
                    "input_ids": batch["input_ids"].to(device),
                    "entity_start_positions": batch["entity_start_positions"].to(device),
                    "entity_end_positions": batch["entity_end_positions"].to(device),
                    "labels": batch["gold_labels"],
                    "label_weight": batch["label_ignore_loss"]
                }

                input_ids = batch["input_ids"]
                entity_start_positions = batch["entity_start_positions"]
                gold_labels = batch["gold_labels"]
                batch_data = batch["batch_data"]
                loss, logits = model(**input_dict)

                # Accumulate loss
                total_train_loss += loss.item()

                # Backward: compute gradients
                loss.backward()
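                # Gradients accumulate across mini-batches; the optimizer only steps in the
                # block below once every accumulation_steps batches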

                if (step + 1) % accumulation_steps == 0:
                    # Calculate elapsed time in minutes and print loss on the tqdm bar
                    elapsed = format_time(time.time() - start_time)
                    avg_train_loss = total_train_loss / (step + 1)

                    # keep track of changing avg_train_loss
                    train_loss_trajectory.append(avg_train_loss)
                    if (step + 1) % (accumulation_steps * 20) == 0:
                        print(
                            f"Epoch:{epoch+1}|Batch:{step}/{len(train_dataloader)}|Time:{elapsed}|Avg. Loss:{avg_train_loss:.4f}|Loss:{loss.item():.4f}"
                        )

                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()

                    # Clean the model's previous gradients
                    model.zero_grad()
                    scheduler.step()

            # Calculate the average loss over all of the batches.
            avg_train_loss = total_train_loss / len(train_dataloader)

            # Perform validation with the model and log the performance
            print("\n")
            logging.info("Running Validation...")
            # Put the model in evaluation mode--the dropout layers behave differently during evaluation.
            model.eval()
            dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset(
                dev_dataloader, model, device, args.task + "_dev", True)

            wandb_log_dict = {"Train Loss": avg_train_loss}
            print("Dev Set:")
            collect_TP_FP_FN = {"TP": 0, "FP": 0, "FN": 0}
            for subtask in model.subtasks:
                dev_subtask_data = dev_subtasks_data[subtask]
                dev_subtask_prediction_scores = dev_prediction_scores[subtask]
                dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN(
                    dev_subtask_data,
                    dev_subtask_prediction_scores,
                    task=subtask)
                if subtask not in IGNORE_TASKS:
                    collect_TP_FP_FN["TP"] += dev_TP
                    collect_TP_FP_FN["FP"] += dev_FP
                    collect_TP_FP_FN["FN"] += dev_FN
                else:
                    print("IGNORE: ", end="")

                print(
                    f"Subtask:{subtask:>15}\tN={dev_TP + dev_FN}\tF1={dev_F1}\tP={dev_P}\tR={dev_R}\tTP={dev_TP}\tFP={dev_FP}\tFN={dev_FN}"
                )
                dev_subtasks_validation_statistics[subtask].append(
                    (epoch + 1, step + 1, dev_TP + dev_FN, dev_F1, dev_P,
                     dev_R, dev_TP, dev_FP, dev_FN))

                wandb_log_dict["Dev_ " + subtask + "_F1"] = dev_F1
                wandb_log_dict["Dev_ " + subtask + "_P"] = dev_P
                wandb_log_dict["Dev_ " + subtask + "_R"] = dev_R

            dev_macro_P = collect_TP_FP_FN["TP"] / (collect_TP_FP_FN["TP"] +
                                                    collect_TP_FP_FN["FP"])
            dev_macro_R = collect_TP_FP_FN["TP"] / (collect_TP_FP_FN["TP"] +
                                                    collect_TP_FP_FN["FN"])
            dev_macro_F1 = (2 * dev_macro_P * dev_macro_R) / (dev_macro_P +
                                                              dev_macro_R)
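            # These scores pool TP/FP/FN across the non-ignored subtasks before computing P/R/F1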
            print(collect_TP_FP_FN)
            print("dev_macro_P:", dev_macro_P, "\ndev_macro_R:", dev_macro_R,
                  "\ndev_macro_F1:", dev_macro_F1, "\n")
            wandb_log_dict["Dev_macro_F1"] = dev_macro_F1
            wandb_log_dict["Dev_macro_P"] = dev_macro_P
            wandb_log_dict["Dev_macro_R"] = dev_macro_R

            if args.wandb:
                wandb.log(wandb_log_dict)

            if dev_macro_F1 > best_dev_F1:
                best_dev_F1 = dev_macro_F1
                print("NEW BEST F1:", best_dev_F1, " Saving checkpoint now.")
                torch.save(model.state_dict(), args.output_dir + "/ckpt.pth")
                #print(model.state_dict().keys())
                #model.save_pretrained(args.save_directory)
            model.train()

            training_time = format_time(time.time() - start_time)

            # Record all statistics from this epoch.
            training_stats.append({
                'epoch': epoch + 1,
                'Training Loss': avg_train_loss,
                'Training Time': training_time
            })

            # Save the loss trajectory
            epoch_train_loss.append(train_loss_trajectory)
            print("\n\n")

        logging.info(
            f"Training complete with total Train time:{format_time(time.time()- total_start_time)}"
        )
        log_list(training_stats)

        model.load_state_dict(torch.load(args.output_dir + "/ckpt.pth"))
        model.eval()
        # Save the model and the Tokenizer here:
        #logging.info(f"Saving the model and tokenizer in {args.save_directory}")
        #model.save_pretrained(args.save_directory)

        # Save each subtask classifiers weights to individual state dicts
        #for subtask, classifier in model.classifiers.items():
        #    classifier_save_file = os.path.join(args.save_directory, f"{subtask}_classifier.bin")
        #    logging.info(f"Saving the model's {subtask} classifier weights at {classifier_save_file}")
        #    torch.save(classifier.state_dict(), classifier_save_file)
        #tokenizer.save_pretrained(args.save_directory)

        # Plot the train loss trajectory in a plot
        #train_loss_trajectory_plot_file = os.path.join(args.output_dir, "train_loss_trajectory.png")
        #logging.info(f"Saving the Train loss trajectory at {train_loss_trajectory_plot_file}")
        #print(epoch_train_loss)

        # TODO: Plot the validation performance
        # Save dev_subtasks_validation_statistics
    else:
        logging.info("No training needed. Directly going to evaluation!")
        # The evaluation-only path is currently disabled; fail fast with an explicit error
        raise NotImplementedError(
            "Evaluation without retraining is not supported in this script.")

    # Save the model name in the model_config file
    model_config["model"] = "MultiTaskBertForCovidEntityClassification"
    model_config["epochs"] = args.n_epochs

    # Find best threshold for each subtask based on dev set performance
    thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    #test_predicted_labels, test_prediction_scores, test_gold_labels = make_predictions_on_dataset(test_dataloader, model, device, args.task, True)
    dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset(
        dev_dataloader, model, device, args.task + "_dev", True)

    best_test_thresholds = {subtask: 0.5 for subtask in model.subtasks}
    best_dev_thresholds = {subtask: 0.5 for subtask in model.subtasks}
    best_test_F1s = {subtask: 0.0 for subtask in model.subtasks}
    best_dev_F1s = {subtask: 0.0 for subtask in model.subtasks}
    #test_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks}
    dev_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks}

    for subtask in model.subtasks:
        dev_subtask_data = dev_subtasks_data[subtask]
        dev_subtask_prediction_scores = dev_prediction_scores[subtask]
        for t in thresholds:
            dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN(
                dev_subtask_data,
                dev_subtask_prediction_scores,
                THRESHOLD=t,
                task=subtask)
            dev_subtasks_t_F1_P_Rs[subtask].append(
                (t, dev_F1, dev_P, dev_R, dev_TP + dev_FN, dev_TP, dev_FP,
                 dev_FN))
            if dev_F1 > best_dev_F1s[subtask]:
                best_dev_thresholds[subtask] = t
                best_dev_F1s[subtask] = dev_F1

        logging.info(f"Subtask:{subtask:>15}")
        log_list(dev_subtasks_t_F1_P_Rs[subtask])
        logging.info(
            f"Best Dev Threshold for subtask: {best_dev_thresholds[subtask]}\t Best dev F1: {best_dev_F1s[subtask]}"
        )

    # Save the best dev threshold and dev_F1 in results dict
    results["best_dev_threshold"] = best_dev_thresholds
    results["best_dev_F1s"] = best_dev_F1s
    results["dev_t_F1_P_Rs"] = dev_subtasks_t_F1_P_Rs

    # Evaluate on Test
    logging.info("Evaluating the trained model on the dev dataset")
    predicted_labels, prediction_scores, gold_labels = make_predictions_on_dataset(
        dev_dataloader, model, device, args.task)

    # Test
    for subtask in model.subtasks:
        logging.info(f"\nTesting the trained classifier on subtask: {subtask}")

        results[subtask] = dict()
        cm = metrics.confusion_matrix(gold_labels[subtask],
                                      predicted_labels[subtask])
        classification_report = metrics.classification_report(
            gold_labels[subtask], predicted_labels[subtask], output_dict=True)
        logging.info(cm)
        logging.info(
            metrics.classification_report(gold_labels[subtask],
                                          predicted_labels[subtask]))
        # Store the confusion matrix as a list of lists (JSON-serializable) instead of a numpy.ndarray
        results[subtask]["CM"] = cm.tolist()
        results[subtask]["Classification Report"] = classification_report

        # SQuAD style EM and F1 evaluation for all test cases and for positive test cases (i.e. for cases where annotators had a gold annotation)
        EM_score, F1_score, total = get_raw_scores(dev_subtasks_data[subtask],
                                                   prediction_scores[subtask])
        logging.info("Word overlap based SQuAD evaluation style metrics:")
        logging.info(f"Total number of cases: {total}")
        logging.info(f"EM_score: {EM_score}")
        logging.info(f"F1_score: {F1_score}")
        results[subtask]["SQuAD_EM"] = EM_score
        results[subtask]["SQuAD_F1"] = F1_score
        results[subtask]["SQuAD_total"] = total
        pos_EM_score, pos_F1_score, pos_total = get_raw_scores(
            dev_subtasks_data[subtask],
            prediction_scores[subtask],
            positive_only=True)
        logging.info(f"Total number of Positive cases: {pos_total}")
        logging.info(f"Pos. EM_score: {pos_EM_score}")
        logging.info(f"Pos. F1_score: {pos_F1_score}")
        results[subtask]["SQuAD_Pos. EM"] = pos_EM_score
        results[subtask]["SQuAD_Pos. F1"] = pos_F1_score
        results[subtask]["SQuAD_Pos. EM_F1_total"] = pos_total

        # New evaluation suggested by Alan
        F1, P, R, TP, FP, FN = get_TP_FP_FN(
            dev_subtasks_data[subtask],
            prediction_scores[subtask],
            THRESHOLD=best_dev_thresholds[subtask],
            task=subtask)
        logging.info("New evaluation scores:")
        logging.info(f"F1: {F1}")
        logging.info(f"Precision: {P}")
        logging.info(f"Recall: {R}")
        logging.info(f"True Positive: {TP}")
        logging.info(f"False Positive: {FP}")
        logging.info(f"False Negative: {FN}")
        results[subtask]["F1"] = F1
        results[subtask]["P"] = P
        results[subtask]["R"] = R
        results[subtask]["TP"] = TP
        results[subtask]["FP"] = FP
        results[subtask]["FN"] = FN
        N = TP + FN
        results[subtask]["N"] = N

    # Save model_config and results
    model_config_file = os.path.join(args.output_dir, "model_config.json")
    results_file = os.path.join(args.output_dir, "results.json")
    logging.info(f"Saving model config at {model_config_file}")
    save_in_json(model_config, model_config_file)
    logging.info(f"Saving results at {results_file}")
    save_in_json(results, results_file)
예제 #16
0
def predict(vedio_path, st, base_dir, ver):
    data_dir = os.path.join(base_dir, 'data')
    known_faces_dir = os.path.join(data_dir, 'known_faces' + ver)
    full_vedios_dir = os.path.join(data_dir, 'full_vedio')
    pkl_files_dir = os.path.join(data_dir, "pkl_files")

    known_face_names = load_from_pickle(
        os.path.join(pkl_files_dir, "known_face_names" + ver + ".pkl"))
    known_face_encodings = load_from_pickle(
        os.path.join(pkl_files_dir, "known_face_encodings" + ver + ".pkl"))

    st.video(vedio_path)
    video_capture = cv2.VideoCapture(vedio_path)
    fps = int(video_capture.get(cv2.CAP_PROP_FPS))

    vst = st.number_input("time(secs) to start the video from",
                          min_value=0,
                          max_value=5 * 60,
                          value=15,
                          step=1)  ###
    vet = st.number_input("time(secs) to exit",
                          min_value=0,
                          max_value=5 * 60,
                          value=25,
                          step=1)
    num_frames_per_sec = st.number_input(
        "number of frames to process per second",
        min_value=1,
        max_value=fps,
        value=1,
        step=1)

    show = st.radio("show video", ["no", "yes"], index=1)  ###
    if show == "yes":
        i = -1
        # make empty containers to replace outputs
        image_location = st.empty()
        image_text = st.empty()
        time_text = st.empty()
        while video_capture.isOpened():
            i += 1
            if int(i / fps) < vst:
                ret, frame = video_capture.read()
                if not ret:
                    break
                continue

            if int(i / fps) > vet:
                break

            # Grab a single frame of video
            ret, frame = video_capture.read()
            if not ret:
                # End of the video or a failed read: stop the loop
                break

            # Convert the image from BGR color (which OpenCV uses) to RGB color (which face_recognition uses)
            rgb_frame = frame[:, :, ::-1]

            # process
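            # Only run face detection/recognition on roughly num_frames_per_sec frames per second of video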
            if i % int(fps / num_frames_per_sec) == 0:
                face_locations = fr_fl(rgb_frame, **fr_fl_dict)
                if not len(face_locations) == 1:
                    image_location.image(rgb_frame)
                    time_text.text(f"time(sec): {int(i/fps)}")
                    image_text.text("no face")
                    continue

                face_encoding = fr_fe(rgb_frame,
                                      known_face_locations=face_locations,
                                      **fr_fe_dict)[0]
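                # Nearest-neighbour match: pick the known face whose encoding is closest to the detected face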
                face_distances = face_recognition.face_distance(
                    known_face_encodings, face_encoding)
                best_match_index = np.argmin(face_distances)
                name = known_face_names[best_match_index]

                top, right, bottom, left = face_locations[0]
                # Draw a box around the face
                cv2.rectangle(frame, (left, top), (right, bottom), (0, 0, 255),
                              2)

                # Draw a label with a name below the face
                cv2.rectangle(frame, (left, bottom - 35), (right, bottom),
                              (0, 0, 255), cv2.FILLED)
                font = cv2.FONT_HERSHEY_DUPLEX
                cv2.putText(frame, name, (left + 6, bottom - 6), font, 1.0,
                            (255, 255, 255), 1)

                image_location.image(frame[:, :, ::-1])
                image_text.text(f"predicted name: {name}")
                time_text.text(f"time(sec): {int(i/fps)}")

            # Hit 'q' on the keyboard to quit!
            if cv2.waitKey(10) & 0xFF == ord('q'):
                break

        # Release handle to the webcam
        video_capture.release()
        cv2.destroyAllWindows()

        # clear outputs
        image_location.empty()
        time_text.empty()
        image_text.empty()
예제 #17
0
def main_try(args):
    task_instances_dict, tag_statistics, question_keys_and_tags = load_from_pickle(
        args.data_file)
    data, subtasks_list = get_multitask_instances_for_valid_tasks(
        task_instances_dict, tag_statistics)
    # Equivalent to "1.0 if args.loss_for_no_consensus else 0.0" with the flag hard-coded to off
    data = add_marker_for_loss_ignore(data, 0.0)
    model_name = "digitalepidemiologylab/covid-twitter-bert"
    print("\n\n===========\n\n", subtasks_list, "\n\n===========\n\n")

    tokenizer = BertTokenizer.from_pretrained(model_name)
    config = BertConfig.from_pretrained(model_name)
    config.subtasks = subtasks_list
    model = MultiTaskBertForCovidEntityClassification.from_pretrained(
        model_name, config=config)

    # Add new tokens in tokenizer
    new_special_tokens_dict = {
        "additional_special_tokens": ["<E>", "</E>", "<URL>", "@USER"]
    }
    tokenizer.add_special_tokens(new_special_tokens_dict)

    # Add the new embeddings in the weights
    print("Embeddings type:",
          model.bert.embeddings.word_embeddings.weight.data.type())
    print("Embeddings shape:",
          model.bert.embeddings.word_embeddings.weight.data.size())
    embedding_size = model.bert.embeddings.word_embeddings.weight.size(1)
    new_embeddings = torch.FloatTensor(
        len(new_special_tokens_dict["additional_special_tokens"]),
        embedding_size).uniform_(-0.1, 0.1)
    # new_embeddings = torch.FloatTensor(2, embedding_size).uniform_(-0.1, 0.1)
    print("new_embeddings shape:", new_embeddings.size())
    new_embedding_weight = torch.cat(
        (model.bert.embeddings.word_embeddings.weight.data, new_embeddings), 0)
    model.bert.embeddings.word_embeddings.weight.data = new_embedding_weight
    print("Embeddings shape:",
          model.bert.embeddings.word_embeddings.weight.data.size())
    # Update model config vocab size
    model.config.vocab_size = model.config.vocab_size + len(
        new_special_tokens_dict["additional_special_tokens"])

    model.load_state_dict(torch.load(os.path.join(args.save_path, "ckpt.pth")))
    print("loaded_model")
    model.to(device)

    entity_start_token_id = tokenizer.convert_tokens_to_ids(["<E>"])[0]
    entity_end_token_id = tokenizer.convert_tokens_to_ids(["</E>"])[0]

    print(f"Task dataset for task: {args.task} loaded from {args.data_file}.")

    model_config = dict()
    results = dict()

    # Split the data into train, dev and test and shuffle the train segment
    dev_data = data
    print("Dev Data:")
    total_dev_size, pos_subtasks_dev_size, neg_subtasks_dev_size = log_multitask_data_statistics(
        dev_data, model.subtasks)
    model_config["dev_data"] = {
        "size": total_dev_size,
        "pos": pos_subtasks_dev_size,
        "neg": neg_subtasks_dev_size
    }

    # Extract subtasks data for dev and test
    dev_subtasks_data = split_data_based_on_subtasks(dev_data, model.subtasks)

    # Load the instances into pytorch dataset
    dev_dataset = COVID19TaskDataset(dev_data)

    tokenize_collator = TokenizeCollator(tokenizer, model.subtasks,
                                         entity_start_token_id,
                                         entity_end_token_id)
    dev_dataloader = DataLoader(dev_dataset,
                                batch_size=POSSIBLE_BATCH_SIZE,
                                collate_fn=tokenize_collator)
    print("Created dev dataloaders with batch aggregation")

    dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset(
        dev_dataloader, model, device, args.task + "_dev", True)
    # print(dev_predicted_labels['age'][0], dev_prediction_scores['age'][0], dev_gold_labels['age'][0])
    assert dev_predicted_labels.keys() == dev_prediction_scores.keys()
    assert dev_predicted_labels.keys() == dev_gold_labels.keys()

    for st in dev_gold_labels.keys():
        print(st, ":", len(dev_predicted_labels[st]),
              len(dev_prediction_scores[st]), len(dev_gold_labels[st]))

    dev_threshold = json.load(open(args.save_path + "/results.json",
                                   "r"))['best_dev_threshold']
    print(dev_threshold)
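    # These per-subtask decision thresholds were tuned on the dev set during training and saved in results.json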

    # [print(k, v) for k,v in get_chunk_tweet_id(dev_subtasks_data['age'], dev_prediction_scores['age'], dev_threshold['age']).items()]
    dev_pred_chunks = {}
    for subtask in subtasks_list:
        if subtask not in IGNORE_TASKS:
            dev_pred_chunks[subtask] = get_chunk_tweet_id(
                dev_subtasks_data[subtask], dev_prediction_scores[subtask],
                dev_threshold[subtask])

    json_save_predicts(dev_pred_chunks,
                       args.output_dir + "/" + args.task + ".json",
                       dict(question_keys_and_tags))

    collect_TP_FP_FN = {"TP": 0.0001, "FP": 0.0001, "FN": 0.0001}
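    # The tiny epsilon keeps the precision/recall denominators non-zero when a subtask contributes no counts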
    for subtask in model.subtasks:
        dev_subtask_data = dev_subtasks_data[subtask]
        dev_subtask_prediction_scores = dev_prediction_scores[subtask]
        dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN(
            dev_subtask_data,
            dev_subtask_prediction_scores,
            dev_threshold[subtask],
            task=subtask)
        if subtask not in IGNORE_TASKS:
            collect_TP_FP_FN["TP"] += dev_TP
            collect_TP_FP_FN["FP"] += dev_FP
            collect_TP_FP_FN["FN"] += dev_FN
        else:
            print("IGNORE: ", end="")

        print(
            f"Subtask:{subtask:>15}\tN={dev_TP + dev_FN}\tF1={dev_F1}\tP={dev_P}\tR={dev_R}\tTP={dev_TP}\tFP={dev_FP}\tFN={dev_FN}"
        )

    dev_macro_P = collect_TP_FP_FN["TP"] / (collect_TP_FP_FN["TP"] +
                                            collect_TP_FP_FN["FP"])
    dev_macro_R = collect_TP_FP_FN["TP"] / (collect_TP_FP_FN["TP"] +
                                            collect_TP_FP_FN["FN"])
    dev_macro_F1 = (2 * dev_macro_P * dev_macro_R) / (dev_macro_P +
                                                      dev_macro_R)
    print(collect_TP_FP_FN)
    print("dev_macro_P:", dev_macro_P, "\ndev_macro_R:", dev_macro_R,
          "\ndev_macro_F1:", dev_macro_F1, "\n")
예제 #18
0
def main():

    task_instances_dict = load_from_pickle(args.data_file)
    data = extract_instances_for_current_subtask(task_instances_dict,
                                                 args.sub_task)
    logging.info(
        f"Task dataset for task: {args.task} loaded from {args.data_file}.")

    model_config = dict()
    results = dict()

    # Split the data into train, dev and test and shuffle the train segment
    train_data, dev_data, test_data = split_instances_in_train_dev_test(data)
    random.shuffle(train_data)  # shuffle happens in-place
    logging.info("Train Data:")
    total_train_size, pos_train_size, neg_train_size = log_data_statistics(
        train_data)
    logging.info("Dev Data:")
    total_dev_size, pos_dev_size, neg_dev_size = log_data_statistics(dev_data)
    logging.info("Test Data:")
    total_test_size, pos_test_size, neg_test_size = log_data_statistics(
        test_data)
    # logging.info("\n")
    model_config["train_data"] = {
        "size": total_train_size,
        "pos": pos_train_size,
        "neg": neg_train_size
    }
    model_config["dev_data"] = {
        "size": total_dev_size,
        "pos": pos_dev_size,
        "neg": neg_dev_size
    }
    model_config["test_data"] = {
        "size": total_test_size,
        "pos": pos_test_size,
        "neg": neg_test_size
    }

    # Extract n-gram features from the train data
    # Returned ngrams will be dict of dict
    # TODO: update the feature extractor
    feature2i, i2feature = create_ngram_features_from(train_data)
    logging.info(
        f"Total number of features extracted from train = {len(feature2i)}, {len(i2feature)}"
    )
    model_config["features"] = {"size": len(feature2i)}

    # Extract Feature vectors and labels from train and test data
    train_X, train_Y = convert_data_to_feature_vector_and_labels(
        train_data, feature2i)
    dev_X, dev_Y = convert_data_to_feature_vector_and_labels(
        dev_data, feature2i)
    test_X, test_Y = convert_data_to_feature_vector_and_labels(
        test_data, feature2i)
    logging.info(
        f"Train Data Features = {train_X.shape} and Labels = {len(train_Y)}")
    logging.info(
        f"Dev Data Features = {dev_X.shape} and Labels = {len(dev_Y)}")
    logging.info(
        f"Test Data Features = {test_X.shape} and Labels = {len(test_Y)}")
    model_config["train_data"]["features_shape"] = train_X.shape
    model_config["train_data"]["labels_shape"] = len(train_Y)
    model_config["dev_data"]["features_shape"] = dev_X.shape
    model_config["dev_data"]["labels_shape"] = len(dev_Y)
    model_config["test_data"]["features_shape"] = test_X.shape
    model_config["test_data"]["labels_shape"] = len(test_Y)

    # Train logistic regression classifier
    logging.info("Training the Logistic Regression classifier")
    lr = LogisticRegression(solver='lbfgs', max_iter=1000)
    lr.fit(train_X, train_Y)
    model_config["model"] = "LogisticRegression(solver='lbfgs', max_iter=1000)"

    # # Find best threshold based on dev set performance
    # thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    #
    # dev_prediction_probs = lr.predict_proba(dev_X)[:, 1]
    # dev_t_F1_P_Rs = list()
    # best_threshold_based_on_F1 = 0.5
    # best_dev_F1 = 0.0
    # for t in thresholds:
    # 	dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN(dev_data, dev_prediction_probs, THRESHOLD=t)
    # 	dev_t_F1_P_Rs.append((t, dev_F1, dev_P, dev_R, dev_TP + dev_FN, dev_TP, dev_FP, dev_FN))
    # 	if dev_F1 > best_dev_F1:
    # 		best_threshold_based_on_F1 = t
    # 		best_dev_F1 = dev_F1
    # # log_list(dev_t_F1_P_Rs)
    # # logging.info(f"Best Threshold: {best_threshold_based_on_F1}\t Best dev F1: {best_dev_F1}")
    # # Save the best dev threshold and dev_F1 in results dict
    # results["best_dev_threshold"] = best_threshold_based_on_F1
    # results["best_dev_F1"] = best_dev_F1
    # results["dev_t_F1_P_Rs"] = dev_t_F1_P_Rs
    # # y_pred = (clf.predict_proba(X_test)[:,1] >= 0.3).astype(bool)

    # Test
    logging.info("Testing the trained classifier")
    predictions = lr.predict(test_X)
    probs = lr.predict_proba(test_X)
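    # Column 1 of predict_proba is the probability of the positive class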
    test_Y_prediction_probs = probs[:, 1]
    cm = metrics.confusion_matrix(test_Y, predictions)
    classification_report = metrics.classification_report(test_Y,
                                                          predictions,
                                                          output_dict=True)
    logging.info(cm)
    logging.info(metrics.classification_report(test_Y, predictions))
    # Store the confusion matrix as a list of lists (JSON-serializable) instead of a numpy.ndarray
    results["CM"] = cm.tolist()
    results["Classification Report"] = classification_report

    # Span-level evaluation at a fixed 0.5 threshold (the dev-set threshold sweep above is commented out)
    F1, P, R, TP, FP, FN = get_TP_FP_FN(test_data,
                                        test_Y_prediction_probs,
                                        THRESHOLD=0.5)
    logging.info("New evaluation scores:")
    logging.info(f"F1: {F1}")
    logging.info(f"Precision: {P}")
    logging.info(f"Recall: {R}")
    logging.info(f"True Positive: {TP}")
    logging.info(f"False Positive: {FP}")
    logging.info(f"False Negative: {FN}")
    results["F1"] = F1
    results["P"] = P
    results["R"] = R
    results["TP"] = TP
    results["FP"] = FP
    results["FN"] = FN
    N = TP + FN
    results["N"] = N

    # Save the model and features in pickle file
    model_and_features_save_file = os.path.join(args.output_dir,
                                                "model_and_features.pkl")
    logging.info(
        f"Saving LR model and features at {model_and_features_save_file}")
    save_in_pickle((lr, feature2i, i2feature), model_and_features_save_file)

    # Save model_config and results
    model_config_file = os.path.join(args.output_dir, "model_config.json")
    results_file = os.path.join(args.output_dir, "results.json")
    logging.info(f"Saving model config at {model_config_file}")
    save_in_json(model_config, model_config_file)
    logging.info(f"Saving results at {results_file}")
    save_in_json(results, results_file)