def main(): # Read all the data instances task_instances_dict, tag_statistics, question_keys_and_tags = load_from_pickle( args.data_file) data, subtasks_list = get_multitask_instances_for_valid_tasks( task_instances_dict, tag_statistics) if args.retrain: logging.info("Creating and training the model from 'bert-base-cased' ") # Create the save_directory if not exists make_dir_if_not_exists(args.save_directory) # Initialize tokenizer and model with pretrained weights tokenizer = BertTokenizer.from_pretrained('bert-base-cased') config = BertConfig.from_pretrained('bert-base-cased') config.subtasks = subtasks_list # print(config) model = MultiTaskBertForCovidEntityClassification.from_pretrained( 'bert-base-cased', config=config) # Add new tokens in tokenizer new_special_tokens_dict = { "additional_special_tokens": ["<E>", "</E>", "<URL>", "@USER"] } # new_special_tokens_dict = {"additional_special_tokens": ["<E>", "</E>"]} tokenizer.add_special_tokens(new_special_tokens_dict) # Add the new embeddings in the weights print("Embeddings type:", model.bert.embeddings.word_embeddings.weight.data.type()) print("Embeddings shape:", model.bert.embeddings.word_embeddings.weight.data.size()) embedding_size = model.bert.embeddings.word_embeddings.weight.size(1) new_embeddings = torch.FloatTensor( len(new_special_tokens_dict["additional_special_tokens"]), embedding_size).uniform_(-0.1, 0.1) # new_embeddings = torch.FloatTensor(2, embedding_size).uniform_(-0.1, 0.1) print("new_embeddings shape:", new_embeddings.size()) new_embedding_weight = torch.cat( (model.bert.embeddings.word_embeddings.weight.data, new_embeddings), 0) model.bert.embeddings.word_embeddings.weight.data = new_embedding_weight print("Embeddings shape:", model.bert.embeddings.word_embeddings.weight.data.size()) # Update model config vocab size model.config.vocab_size = model.config.vocab_size + len( new_special_tokens_dict["additional_special_tokens"]) else: # Load the tokenizer and model from the save_directory tokenizer = BertTokenizer.from_pretrained(args.save_directory) model = MultiTaskBertForCovidEntityClassification.from_pretrained( args.save_directory) # print(model.state_dict().keys()) # TODO save and load the subtask classifier weights separately # Load from individual state dicts for subtask in model.subtasks: model.classifiers[subtask].load_state_dict( torch.load( os.path.join(args.save_directory, f"{subtask}_classifier.bin"))) # print(model.config) # exit() model.to(device) # Explicitly move the classifiers to device for subtask, classifier in model.classifiers.items(): classifier.to(device) entity_start_token_id = tokenizer.convert_tokens_to_ids(["<E>"])[0] logging.info( f"Task dataset for task: {args.task} loaded from {args.data_file}.") model_config = dict() results = dict() # Split the data into train, dev and test and shuffle the train segment train_data, dev_data, test_data = split_multitask_instances_in_train_dev_test( data) random.shuffle(train_data) # shuffle happens in-place logging.info("Train Data:") total_train_size, pos_subtasks_train_size, neg_subtasks_train_size = log_multitask_data_statistics( train_data, model.subtasks) logging.info("Dev Data:") total_dev_size, pos_subtasks_dev_size, neg_subtasks_dev_size = log_multitask_data_statistics( dev_data, model.subtasks) logging.info("Test Data:") total_test_size, pos_subtasks_test_size, neg_subtasks_test_size = log_multitask_data_statistics( test_data, model.subtasks) logging.info("\n") model_config["train_data"] = { "size": total_train_size, "pos": 
pos_subtasks_train_size, "neg": neg_subtasks_train_size } model_config["dev_data"] = { "size": total_dev_size, "pos": pos_subtasks_dev_size, "neg": neg_subtasks_dev_size } model_config["test_data"] = { "size": total_test_size, "pos": pos_subtasks_test_size, "neg": neg_subtasks_test_size } # Extract subtasks data for dev and test dev_subtasks_data = split_data_based_on_subtasks(dev_data, model.subtasks) test_subtasks_data = split_data_based_on_subtasks(test_data, model.subtasks) # Load the instances into pytorch dataset train_dataset = COVID19TaskDataset(train_data) dev_dataset = COVID19TaskDataset(dev_data) test_dataset = COVID19TaskDataset(test_data) logging.info("Loaded the datasets into Pytorch datasets") tokenize_collator = TokenizeCollator(tokenizer, model.subtasks, entity_start_token_id) train_dataloader = DataLoader(train_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=True, num_workers=0, collate_fn=tokenize_collator) dev_dataloader = DataLoader(dev_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=tokenize_collator) test_dataloader = DataLoader(test_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=tokenize_collator) logging.info("Created train and test dataloaders with batch aggregation") # Only retrain if needed if args.retrain: print('DO RETRAIN') ################################################################################################## # NOTE: Training Tutorial Reference # https://mccormickml.com/2019/07/22/BERT-fine-tuning/#41-bertforsequenceclassification ################################################################################################## # Create an optimizer training schedule for the BERT text classification model # NOTE: AdamW is a class from the huggingface library (as opposed to pytorch) # I believe the 'W' stands for 'Weight Decay fix" # Recommended Schedule for BERT fine-tuning as per the paper # Batch size: 16, 32 # Learning rate (Adam): 5e-5, 3e-5, 2e-5 # Number of epochs: 2, 3, 4 optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8) logging.info("Created model optimizer") # Number of training epochs. The BERT authors recommend between 2 and 4. # We chose to run for 4, but we'll see later that this may be over-fitting the # training data. epochs = args.n_epochs # Total number of training steps is [number of batches] x [number of epochs]. # (Note that this is not the same as the number of training samples). total_steps = len(train_dataloader) * epochs # Create the learning rate scheduler. # NOTE: num_warmup_steps = 0 is the Default value in run_glue.py scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=total_steps) # We'll store a number of quantities such as training and validation loss, # validation accuracy, and timings. training_stats = [] logging.info(f"Initiating training loop for {args.n_epochs} epochs...") # Measure the total training time for the whole run. total_start_time = time.time() # Find the accumulation steps accumulation_steps = args.batch_size / POSSIBLE_BATCH_SIZE # Loss trajectory for epochs epoch_train_loss = list() # Dev validation trajectory dev_subtasks_validation_statistics = { subtask: list() for subtask in model.subtasks } for epoch in range(epochs): pbar = tqdm(train_dataloader) logging.info(f"Initiating Epoch {epoch+1}:") # Reset the total loss for each epoch. 
total_train_loss = 0 train_loss_trajectory = list() # Reset timer for each epoch start_time = time.time() model.train() dev_log_frequency = 5 n_steps = len(train_dataloader) dev_steps = int(n_steps / dev_log_frequency) for step, batch in enumerate(pbar): # Upload labels of each subtask to device for subtask in model.subtasks: subtask_labels = batch["gold_labels"][subtask] subtask_labels = subtask_labels.to(device) # print("HAHAHAHAH:", subtask_labels.is_cuda) batch["gold_labels"][subtask] = subtask_labels # print("HAHAHAHAH:", batch["gold_labels"][subtask].is_cuda) # Forward input_dict = { "input_ids": batch["input_ids"].to(device), "entity_start_positions": batch["entity_start_positions"].to(device), "labels": batch["gold_labels"] } input_ids = batch["input_ids"] entity_start_positions = batch["entity_start_positions"] gold_labels = batch["gold_labels"] batch_data = batch["batch_data"] loss, logits = model(**input_dict) # loss = loss / accumulation_steps # Accumulate loss total_train_loss += loss.item() # Backward: compute gradients loss.backward() if (step + 1) % accumulation_steps == 0: # Calculate elapsed time in minutes and print loss on the tqdm bar elapsed = format_time(time.time() - start_time) avg_train_loss = total_train_loss / (step + 1) # keep track of changing avg_train_loss train_loss_trajectory.append(avg_train_loss) pbar.set_description( f"Epoch:{epoch+1}|Batch:{step}/{len(train_dataloader)}|Time:{elapsed}|Avg. Loss:{avg_train_loss:.4f}|Loss:{loss.item():.4f}" ) # Clip the norm of the gradients to 1.0. # This is to help prevent the "exploding gradients" problem. torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # Update parameters optimizer.step() # Clean the model's previous gradients model.zero_grad() # Reset gradients tensors # Update the learning rate. scheduler.step() pbar.update() if (step + 1) % dev_steps == 0: # Perform validation with the model and log the performance logging.info("Running Validation...") # Put the model in evaluation mode--the dropout layers behave differently # during evaluation. 
model.eval() dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset( dev_dataloader, model, device, args.task + "_dev", True) for subtask in model.subtasks: dev_subtask_data = dev_subtasks_data[subtask] dev_subtask_prediction_scores = dev_prediction_scores[ subtask] dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN( dev_subtask_data, dev_subtask_prediction_scores) logging.info( f"Subtask:{subtask:>15}\tN={dev_TP + dev_FN}\tF1={dev_F1}\tP={dev_P}\tR={dev_R}\tTP={dev_TP}\tFP={dev_FP}\tFN={dev_FN}" ) dev_subtasks_validation_statistics[subtask].append( (epoch + 1, step + 1, dev_TP + dev_FN, dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN)) # logging.info("DEBUG:Validation on Test") # dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset(test_dataloader, model, device, args.task + "_dev", True) # for subtask in model.subtasks: # dev_subtask_data = test_subtasks_data[subtask] # dev_subtask_prediction_scores = dev_prediction_scores[subtask] # dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN(dev_subtask_data, dev_subtask_prediction_scores) # logging.info(f"Subtask:{subtask:>15}\tN={dev_TP + dev_FN}\tF1={dev_F1}\tP={dev_P}\tR={dev_R}\tTP={dev_TP}\tFP={dev_FP}\tFN={dev_FN}") # dev_subtasks_validation_statistics[subtask].append((epoch + 1, step + 1, dev_TP + dev_FN, dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN)) # Put the model back in train setting model.train() # Calculate the average loss over all of the batches. avg_train_loss = total_train_loss / len(train_dataloader) training_time = format_time(time.time() - start_time) # Record all statistics from this epoch. training_stats.append({ 'epoch': epoch + 1, 'Training Loss': avg_train_loss, 'Training Time': training_time }) # Save the loss trajectory epoch_train_loss.append(train_loss_trajectory) logging.info( f"Training complete with total Train time:{format_time(time.time()- total_start_time)}" ) log_list(training_stats) # Save the model and the Tokenizer here: logging.info( f"Saving the model and tokenizer in {args.save_directory}") model.save_pretrained(args.save_directory) # Save each subtask classifiers weights to individual state dicts for subtask, classifier in model.classifiers.items(): classifier_save_file = os.path.join(args.save_directory, f"{subtask}_classifier.bin") logging.info( f"Saving the model's {subtask} classifier weights at {classifier_save_file}" ) torch.save(classifier.state_dict(), classifier_save_file) tokenizer.save_pretrained(args.save_directory) # Plot the train loss trajectory in a plot train_loss_trajectory_plot_file = os.path.join( args.output_dir, "train_loss_trajectory.png") logging.info( f"Saving the Train loss trajectory at {train_loss_trajectory_plot_file}" ) plot_train_loss(epoch_train_loss, train_loss_trajectory_plot_file) # TODO: Plot the validation performance # Save dev_subtasks_validation_statistics else: logging.info("No training needed. 
Directly going to evaluation!") # Save the model name in the model_config file model_config["model"] = "MultiTaskBertForCovidEntityClassification" model_config["epochs"] = args.n_epochs # Find best threshold for each subtask based on dev set performance thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] test_predicted_labels, test_prediction_scores, test_gold_labels = make_predictions_on_dataset( test_dataloader, model, device, args.task, True) dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset( dev_dataloader, model, device, args.task + "_dev", True) best_test_thresholds = {subtask: 0.5 for subtask in model.subtasks} best_dev_thresholds = {subtask: 0.5 for subtask in model.subtasks} best_test_F1s = {subtask: 0.0 for subtask in model.subtasks} best_dev_F1s = {subtask: 0.0 for subtask in model.subtasks} test_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks} dev_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks} # for subtask in model.subtasks: # test_subtask_data = test_subtasks_data[subtask] # test_subtask_prediction_scores = test_prediction_scores[subtask] # for t in thresholds: # test_F1, test_P, test_R, test_TP, test_FP, test_FN = get_TP_FP_FN(test_subtask_data, test_subtask_prediction_scores, THRESHOLD=t) # test_subtasks_t_F1_P_Rs[subtask].append((t, test_F1, test_P, test_R, test_TP + test_FN, test_TP, test_FP, test_FN)) # if test_F1 > best_test_F1s[subtask]: # best_test_thresholds[subtask] = t # best_test_F1s[subtask] = test_F1 # logging.info(f"Subtask:{subtask:>15}") # log_list(test_subtasks_t_F1_P_Rs[subtask]) # logging.info(f"Best Test Threshold for subtask: {best_test_thresholds[subtask]}\t Best test F1: {best_test_F1s[subtask]}") for subtask in model.subtasks: dev_subtask_data = dev_subtasks_data[subtask] dev_subtask_prediction_scores = dev_prediction_scores[subtask] for t in thresholds: dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN( dev_subtask_data, dev_subtask_prediction_scores, THRESHOLD=t) dev_subtasks_t_F1_P_Rs[subtask].append( (t, dev_F1, dev_P, dev_R, dev_TP + dev_FN, dev_TP, dev_FP, dev_FN)) if dev_F1 > best_dev_F1s[subtask]: best_dev_thresholds[subtask] = t best_dev_F1s[subtask] = dev_F1 logging.info(f"Subtask:{subtask:>15}") log_list(dev_subtasks_t_F1_P_Rs[subtask]) logging.info( f"Best Dev Threshold for subtask: {best_dev_thresholds[subtask]}\t Best dev F1: {best_dev_F1s[subtask]}" ) # Save the best dev threshold and dev_F1 in results dict results["best_dev_threshold"] = best_dev_thresholds results["best_dev_F1s"] = best_dev_F1s results["dev_t_F1_P_Rs"] = dev_subtasks_t_F1_P_Rs # Evaluate on Test logging.info("Testing on test dataset") # test_predicted_labels, test_prediction_scores, test_gold_labels = make_predictions_on_dataset(test_dataloader, model, device, args.task) predicted_labels, prediction_scores, gold_labels = make_predictions_on_dataset( test_dataloader, model, device, args.task) # Test for subtask in model.subtasks: logging.info(f"Testing the trained classifier on subtask: {subtask}") # print(len(test_dataloader)) # print(len(prediction_scores[subtask])) # print(len(test_subtasks_data[subtask])) results[subtask] = dict() cm = metrics.confusion_matrix(gold_labels[subtask], predicted_labels[subtask]) classification_report = metrics.classification_report( gold_labels[subtask], predicted_labels[subtask], output_dict=True) logging.info(cm) logging.info( metrics.classification_report(gold_labels[subtask], predicted_labels[subtask])) results[subtask]["CM"] = 
cm.tolist( ) # Storing it as list of lists instead of numpy.ndarray results[subtask]["Classification Report"] = classification_report # SQuAD style EM and F1 evaluation for all test cases and for positive test cases (i.e. for cases where annotators had a gold annotation) EM_score, F1_score, total = get_raw_scores(test_subtasks_data[subtask], prediction_scores[subtask]) logging.info("Word overlap based SQuAD evaluation style metrics:") logging.info(f"Total number of cases: {total}") logging.info(f"EM_score: {EM_score}") logging.info(f"F1_score: {F1_score}") results[subtask]["SQuAD_EM"] = EM_score results[subtask]["SQuAD_F1"] = F1_score results[subtask]["SQuAD_total"] = total pos_EM_score, pos_F1_score, pos_total = get_raw_scores( test_subtasks_data[subtask], prediction_scores[subtask], positive_only=True) logging.info(f"Total number of Positive cases: {pos_total}") logging.info(f"Pos. EM_score: {pos_EM_score}") logging.info(f"Pos. F1_score: {pos_F1_score}") results[subtask]["SQuAD_Pos. EM"] = pos_EM_score results[subtask]["SQuAD_Pos. F1"] = pos_F1_score results[subtask]["SQuAD_Pos. EM_F1_total"] = pos_total # New evaluation suggested by Alan F1, P, R, TP, FP, FN = get_TP_FP_FN( test_subtasks_data[subtask], prediction_scores[subtask], THRESHOLD=best_dev_thresholds[subtask]) logging.info("New evaluation scores:") logging.info(f"F1: {F1}") logging.info(f"Precision: {P}") logging.info(f"Recall: {R}") logging.info(f"True Positive: {TP}") logging.info(f"False Positive: {FP}") logging.info(f"False Negative: {FN}") results[subtask]["F1"] = F1 results[subtask]["P"] = P results[subtask]["R"] = R results[subtask]["TP"] = TP results[subtask]["FP"] = FP results[subtask]["FN"] = FN N = TP + FN results[subtask]["N"] = N # # Top predictions in the Test case # prediction_scores[subtask] = np.array(prediction_scores[subtask]) # sorted_prediction_ids = np.argsort(-prediction_scores[subtask]) # K = 200 # logging.info("Top {} predictions:".format(K)) # logging.info("\t".join(["Tweet", "BERT model input", "candidate chunk", "prediction score", "predicted label", "gold label", "gold chunks"])) # for i in range(K): # instance_id = sorted_prediction_ids[i] # # text :: candidate_chunk :: candidate_chunk_id :: chunk_start_text_id :: chunk_end_text_id :: tokenized_tweet :: tokenized_tweet_with_masked_q_token :: tagged_chunks :: question_label # tweet = test_subtasks_data[subtask][instance_id][0].replace("\n", " ") # chunk = test_subtasks_data[subtask][instance_id][1] # tokenized_tweet_with_masked_chunk = test_subtasks_data[subtask][instance_id][6] # if chunk in ["AUTHOR OF THE TWEET", "NEAR AUTHOR OF THE TWEET"]: # # First element of the text will be considered as AUTHOR OF THE TWEET or NEAR AUTHOR OF THE TWEET # bert_model_input_text = tokenized_tweet_with_masked_chunk.replace(Q_TOKEN, "<E> </E>") # # print(tokenized_tweet_with_masked_chunk) # # print(bert_model_input_text) # # exit() # else: # bert_model_input_text = tokenized_tweet_with_masked_chunk.replace(Q_TOKEN, "<E> " + chunk + " </E>") # list_to_print = [tweet, bert_model_input_text, chunk, str(prediction_scores[subtask][instance_id]), str(predicted_labels[subtask][instance_id]), str(test_subtasks_data[subtask][instance_id][-1]), str(test_subtasks_data[subtask][instance_id][-2])] # logging.info("\t".join(list_to_print)) # Save model_config and results model_config_file = os.path.join(args.output_dir, "model_config.json") results_file = os.path.join(args.output_dir, "results.json") logging.info(f"Saving model config at {model_config_file}") 
save_in_json(model_config, model_config_file) logging.info(f"Saving results at {results_file}") save_in_json(results, results_file)
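# ---------------------------------------------------------------------------
# Hedged sketch of the get_TP_FP_FN(...) helper that the validation loop and
# the threshold search above rely on. This is an illustration of the expected
# behaviour, not the repository's implementation: it assumes each instance in
# `subtask_data` exposes its gold annotation in its last field and that
# `prediction_scores` is a parallel list of probabilities.
def get_TP_FP_FN_sketch(subtask_data, prediction_scores, THRESHOLD=0.5):
    TP = FP = FN = 0
    for instance, score in zip(subtask_data, prediction_scores):
        gold_is_positive = bool(instance[-1])       # assumed gold-label field
        predicted_positive = score >= THRESHOLD
        if predicted_positive and gold_is_positive:
            TP += 1
        elif predicted_positive and not gold_is_positive:
            FP += 1
        elif gold_is_positive:
            FN += 1
    P = TP / (TP + FP) if (TP + FP) > 0 else 0.0
    R = TP / (TP + FN) if (TP + FN) > 0 else 0.0
    F1 = 2 * P * R / (P + R) if (P + R) > 0 else 0.0
    return F1, P, R, TP, FP, FN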
def main(): # Read all the data instances task_instances_dict, tag_statistics, question_keys_and_tags = load_from_pickle( args.data_file) data, subtasks_list = get_multitask_instances_for_valid_tasks( task_instances_dict, tag_statistics) if args.retrain: logging.info("Creating and training the model from 'bert-base-cased' ") # Create the save_directory if not exists make_dir_if_not_exists(args.save_directory) # Initialize tokenizer and model with pretrained weights tokenizer = BertTokenizer.from_pretrained('bert-base-cased') config = BertConfig.from_pretrained('bert-base-cased') config.subtasks = subtasks_list # print(config) model = MultiTaskBertForCovidEntityClassification.from_pretrained( 'bert-base-cased', config=config) # Add new tokens in tokenizer new_special_tokens_dict = { "additional_special_tokens": ["<E>", "</E>", "<URL>", "@USER"] } # new_special_tokens_dict = {"additional_special_tokens": ["<E>", "</E>"]} tokenizer.add_special_tokens(new_special_tokens_dict) # Add the new embeddings in the weights print("Embeddings type:", model.bert.embeddings.word_embeddings.weight.data.type()) print("Embeddings shape:", model.bert.embeddings.word_embeddings.weight.data.size()) embedding_size = model.bert.embeddings.word_embeddings.weight.size(1) new_embeddings = torch.FloatTensor( len(new_special_tokens_dict["additional_special_tokens"]), embedding_size).uniform_(-0.1, 0.1) # new_embeddings = torch.FloatTensor(2, embedding_size).uniform_(-0.1, 0.1) print("new_embeddings shape:", new_embeddings.size()) new_embedding_weight = torch.cat( (model.bert.embeddings.word_embeddings.weight.data, new_embeddings), 0) model.bert.embeddings.word_embeddings.weight.data = new_embedding_weight print("Embeddings shape:", model.bert.embeddings.word_embeddings.weight.data.size()) # Update model config vocab size model.config.vocab_size = model.config.vocab_size + len( new_special_tokens_dict["additional_special_tokens"]) else: # Load the tokenizer and model from the save_directory tokenizer = BertTokenizer.from_pretrained(args.save_directory) model = MultiTaskBertForCovidEntityClassification.from_pretrained( args.save_directory) # print(model.state_dict().keys()) # TODO save and load the subtask classifier weights separately # Load from individual state dicts for subtask in model.subtasks: model.classifiers[subtask].load_state_dict( torch.load( os.path.join(args.save_directory, f"{subtask}_classifier.bin"))) # print(model.config) # exit() model.to(device) # Explicitly move the classifiers to device for subtask, classifier in model.classifiers.items(): classifier.to(device) entity_start_token_id = tokenizer.convert_tokens_to_ids(["<E>"])[0] logging.info( f"Task dataset for task: {args.task} loaded from {args.data_file}.") model_config = dict() results = dict() test_data = data logging.info("Test Data:") total_test_size, pos_subtasks_test_size, neg_subtasks_test_size = log_multitask_data_statistics( test_data, model.subtasks) logging.info("\n") # model_config["train_data"] = {"size":total_train_size, "pos":pos_subtasks_train_size, "neg":neg_subtasks_train_size} # model_config["dev_data"] = {"size":total_dev_size, "pos":pos_subtasks_dev_size, "neg":neg_subtasks_dev_size} model_config["test_data"] = { "size": total_test_size, "pos": pos_subtasks_test_size, "neg": neg_subtasks_test_size } # Extract subtasks data for dev and test #dev_subtasks_data = split_data_based_on_subtasks(dev_data, model.subtasks) test_subtasks_data = split_data_based_on_subtasks(test_data, model.subtasks) # Load the instances into pytorch 
dataset # train_dataset = COVID19TaskDataset(train_data) # dev_dataset = COVID19TaskDataset(dev_data) test_dataset = COVID19TaskDataset(test_data) logging.info("Loaded the datasets into Pytorch datasets") tokenize_collator = TokenizeCollator(tokenizer, model.subtasks, entity_start_token_id) # train_dataloader = DataLoader(train_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=True, num_workers=0, collate_fn=tokenize_collator) # dev_dataloader = DataLoader(dev_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=tokenize_collator) test_dataloader = DataLoader(test_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=tokenize_collator) logging.info("Created train and test dataloaders with batch aggregation") # Save the model name in the model_config file model_config["model"] = "MultiTaskBertForCovidEntityClassification" model_config["epochs"] = args.n_epochs # Find best threshold for each subtask based on dev set performance thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] test_predicted_labels, test_prediction_scores, test_gold_labels = make_predictions_on_dataset( test_dataloader, model, device, args.task, True) #dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset(dev_dataloader, model, device, args.task + "_dev", True) best_test_thresholds = {subtask: 0.5 for subtask in model.subtasks} best_dev_thresholds = {subtask: 0.5 for subtask in model.subtasks} best_test_F1s = {subtask: 0.0 for subtask in model.subtasks} best_dev_F1s = {subtask: 0.0 for subtask in model.subtasks} test_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks} dev_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks} # Evaluate on Test logging.info("Testing on test dataset") # test_predicted_labels, test_prediction_scores, test_gold_labels = make_predictions_on_dataset(test_dataloader, model, device, args.task) predicted_labels, prediction_scores, gold_labels = make_predictions_on_dataset( test_dataloader, model, device, args.task) # Test for subtask in model.subtasks: logging.info(f"Testing the trained classifier on subtask: {subtask}") # print(len(test_dataloader)) # print(len(prediction_scores[subtask])) # print(len(test_subtasks_data[subtask])) results[subtask] = dict() cm = metrics.confusion_matrix(gold_labels[subtask], predicted_labels[subtask]) classification_report = metrics.classification_report( gold_labels[subtask], predicted_labels[subtask], output_dict=True) logging.info(cm) logging.info( metrics.classification_report(gold_labels[subtask], predicted_labels[subtask])) results[subtask]["CM"] = cm.tolist( ) # Storing it as list of lists instead of numpy.ndarray results[subtask]["Classification Report"] = classification_report # SQuAD style EM and F1 evaluation for all test cases and for positive test cases (i.e. for cases where annotators had a gold annotation) EM_score, F1_score, total = get_raw_scores(test_subtasks_data[subtask], prediction_scores[subtask]) logging.info("Word overlap based SQuAD evaluation style metrics:") logging.info(f"Total number of cases: {total}") logging.info(f"EM_score: {EM_score}") logging.info(f"F1_score: {F1_score}") results[subtask]["SQuAD_EM"] = EM_score results[subtask]["SQuAD_F1"] = F1_score results[subtask]["SQuAD_total"] = total pos_EM_score, pos_F1_score, pos_total = get_raw_scores( test_subtasks_data[subtask], prediction_scores[subtask], positive_only=True) logging.info(f"Total number of Positive cases: {pos_total}") logging.info(f"Pos. 
EM_score: {pos_EM_score}") logging.info(f"Pos. F1_score: {pos_F1_score}") results[subtask]["SQuAD_Pos. EM"] = pos_EM_score results[subtask]["SQuAD_Pos. F1"] = pos_F1_score results[subtask]["SQuAD_Pos. EM_F1_total"] = pos_total # New evaluation suggested by Alan F1, P, R, TP, FP, FN = get_TP_FP_FN( test_subtasks_data[subtask], prediction_scores[subtask], THRESHOLD=best_dev_thresholds[subtask]) logging.info("New evaluation scores:") logging.info(f"F1: {F1}") logging.info(f"Precision: {P}") logging.info(f"Recall: {R}") logging.info(f"True Positive: {TP}") logging.info(f"False Positive: {FP}") logging.info(f"False Negative: {FN}") results[subtask]["F1"] = F1 results[subtask]["P"] = P results[subtask]["R"] = R results[subtask]["TP"] = TP results[subtask]["FP"] = FP results[subtask]["FN"] = FN N = TP + FN results[subtask]["N"] = N # # Top predictions in the Test case # prediction_scores[subtask] = np.array(prediction_scores[subtask]) # sorted_prediction_ids = np.argsort(-prediction_scores[subtask]) # K = 200 # logging.info("Top {} predictions:".format(K)) # logging.info("\t".join(["Tweet", "BERT model input", "candidate chunk", "prediction score", "predicted label", "gold label", "gold chunks"])) # for i in range(K): # instance_id = sorted_prediction_ids[i] # # text :: candidate_chunk :: candidate_chunk_id :: chunk_start_text_id :: chunk_end_text_id :: tokenized_tweet :: tokenized_tweet_with_masked_q_token :: tagged_chunks :: question_label # tweet = test_subtasks_data[subtask][instance_id][0].replace("\n", " ") # chunk = test_subtasks_data[subtask][instance_id][1] # tokenized_tweet_with_masked_chunk = test_subtasks_data[subtask][instance_id][6] # if chunk in ["AUTHOR OF THE TWEET", "NEAR AUTHOR OF THE TWEET"]: # # First element of the text will be considered as AUTHOR OF THE TWEET or NEAR AUTHOR OF THE TWEET # bert_model_input_text = tokenized_tweet_with_masked_chunk.replace(Q_TOKEN, "<E> </E>") # # print(tokenized_tweet_with_masked_chunk) # # print(bert_model_input_text) # # exit() # else: # bert_model_input_text = tokenized_tweet_with_masked_chunk.replace(Q_TOKEN, "<E> " + chunk + " </E>") # list_to_print = [tweet, bert_model_input_text, chunk, str(prediction_scores[subtask][instance_id]), str(predicted_labels[subtask][instance_id]), str(test_subtasks_data[subtask][instance_id][-1]), str(test_subtasks_data[subtask][instance_id][-2])] # logging.info("\t".join(list_to_print)) # Save model_config and results model_config_file = os.path.join(args.output_dir, "model_config.json") results_file = os.path.join(args.output_dir, "results.json") logging.info(f"Saving model config at {model_config_file}") save_in_json(model_config, model_config_file) logging.info(f"Saving results at {results_file}") save_in_json(results, results_file)
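# ---------------------------------------------------------------------------
# Minimal sketch of what make_predictions_on_dataset(...) is assumed to return
# for the evaluation code above: per-subtask predicted labels, positive-class
# scores, and gold labels. The batch keys mirror the TokenizeCollator usage
# earlier; the per-subtask logits dict, the two-class softmax, and the 0.5
# cut-off are assumptions, not the original implementation.
import torch

def make_predictions_on_dataset_sketch(dataloader, model, device, task_name,
                                        verbose=False):
    predicted_labels = {subtask: [] for subtask in model.subtasks}
    prediction_scores = {subtask: [] for subtask in model.subtasks}
    gold_labels = {subtask: [] for subtask in model.subtasks}
    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            input_dict = {
                "input_ids": batch["input_ids"].to(device),
                "entity_start_positions": batch["entity_start_positions"].to(device),
                "labels": {st: batch["gold_labels"][st].to(device)
                           for st in model.subtasks},
            }
            _, logits = model(**input_dict)   # assumed (loss, per-subtask logits)
            for subtask in model.subtasks:
                scores = torch.softmax(logits[subtask], dim=1)[:, 1]  # P(label = 1)
                prediction_scores[subtask].extend(scores.cpu().tolist())
                predicted_labels[subtask].extend(
                    (scores >= 0.5).long().cpu().tolist())
                gold_labels[subtask].extend(
                    batch["gold_labels"][subtask].cpu().tolist())
    return predicted_labels, prediction_scores, gold_labels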
def main(): task_instances_dict, tag_statistics, question_keys_and_tags = load_from_pickle( args.data_file) data = extract_instances_for_current_subtask(task_instances_dict, args.sub_task) logging.info( f"Task dataset for task: {args.task} loaded from {args.data_file}.") model_config = dict() results = dict() # Split the data into train, dev and test and shuffle the train segment train_data, dev_data, test_data = split_instances_in_train_dev_test(data) random.shuffle(train_data) # shuffle happens in-place logging.info("Train Data:") total_train_size, pos_train_size, neg_train_size = log_data_statistics( train_data) logging.info("Dev Data:") total_dev_size, pos_dev_size, neg_dev_size = log_data_statistics(dev_data) logging.info("Test Data:") total_test_size, pos_test_size, neg_test_size = log_data_statistics( test_data) logging.info("\n") model_config["train_data"] = { "size": total_train_size, "pos": pos_train_size, "neg": neg_train_size } model_config["dev_data"] = { "size": total_dev_size, "pos": pos_dev_size, "neg": neg_dev_size } model_config["test_data"] = { "size": total_test_size, "pos": pos_test_size, "neg": neg_test_size } # Extract n-gram features from the train data # Returned ngrams will be dict of dict # TODO: update the feature extractor feature2i, i2feature = create_ngram_features_from(train_data) logging.info( f"Total number of features extracted from train = {len(feature2i)}, {len(i2feature)}" ) model_config["features"] = {"size": len(feature2i)} # Extract Feature vectors and labels from train and test data train_X, train_Y = convert_data_to_feature_vector_and_labels( train_data, feature2i) dev_X, dev_Y = convert_data_to_feature_vector_and_labels( dev_data, feature2i) test_X, test_Y = convert_data_to_feature_vector_and_labels( test_data, feature2i) logging.info( f"Train Data Features = {train_X.shape} and Labels = {len(train_Y)}") logging.info( f"Dev Data Features = {dev_X.shape} and Labels = {len(dev_Y)}") logging.info( f"Test Data Features = {test_X.shape} and Labels = {len(test_Y)}") model_config["train_data"]["features_shape"] = train_X.shape model_config["train_data"]["labels_shape"] = len(train_Y) model_config["dev_data"]["features_shape"] = dev_X.shape model_config["dev_data"]["labels_shape"] = len(dev_Y) model_config["test_data"]["features_shape"] = test_X.shape model_config["test_data"]["labels_shape"] = len(test_Y) # Train logistic regression classifier logging.info("Training the Logistic Regression classifier") lr = LogisticRegression(solver='lbfgs') lr.fit(train_X, train_Y) model_config["model"] = "LogisticRegression(solver='lbfgs')" # Find best threshold based on dev set performance thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] dev_prediction_probs = lr.predict_proba(dev_X)[:, 1] dev_t_F1_P_Rs = list() best_threshold_based_on_F1 = 0.5 best_dev_F1 = 0.0 for t in thresholds: dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN( dev_data, dev_prediction_probs, THRESHOLD=t) dev_t_F1_P_Rs.append( (t, dev_F1, dev_P, dev_R, dev_TP + dev_FN, dev_TP, dev_FP, dev_FN)) if dev_F1 > best_dev_F1: best_threshold_based_on_F1 = t best_dev_F1 = dev_F1 log_list(dev_t_F1_P_Rs) logging.info( f"Best Threshold: {best_threshold_based_on_F1}\t Best dev F1: {best_dev_F1}" ) # Save the best dev threshold and dev_F1 in results dict results["best_dev_threshold"] = best_threshold_based_on_F1 results["best_dev_F1"] = best_dev_F1 results["dev_t_F1_P_Rs"] = dev_t_F1_P_Rs # y_pred = (clf.predict_proba(X_test)[:,1] >= 0.3).astype(bool) # Test logging.info("Testing the 
trained classifier") predictions = lr.predict(test_X) probs = lr.predict_proba(test_X) test_Y_prediction_probs = probs[:, 1] cm = metrics.confusion_matrix(test_Y, predictions) classification_report = metrics.classification_report(test_Y, predictions, output_dict=True) logging.info(cm) logging.info(metrics.classification_report(test_Y, predictions)) results["CM"] = cm.tolist( ) # Storing it as list of lists instead of numpy.ndarray results["Classification Report"] = classification_report # SQuAD style EM and F1 evaluation for all test cases and for positive test cases (i.e. for cases where annotators had a gold annotation) EM_score, F1_score, total = get_raw_scores(test_data, test_Y_prediction_probs) logging.info("Word overlap based SQuAD evaluation style metrics:") logging.info(f"Total number of cases: {total}") logging.info(f"EM_score: {EM_score}") logging.info(f"F1_score: {F1_score}") results["SQuAD_EM"] = EM_score results["SQuAD_F1"] = F1_score results["SQuAD_total"] = total pos_EM_score, pos_F1_score, pos_total = get_raw_scores( test_data, test_Y_prediction_probs, positive_only=True) logging.info(f"Total number of Positive cases: {pos_total}") logging.info(f"Pos. EM_score: {pos_EM_score}") logging.info(f"Pos. F1_score: {pos_F1_score}") results["SQuAD_Pos. EM"] = pos_EM_score results["SQuAD_Pos. F1"] = pos_F1_score results["SQuAD_Pos. EM_F1_total"] = pos_total # New evaluation suggested by Alan F1, P, R, TP, FP, FN = get_TP_FP_FN(test_data, test_Y_prediction_probs, THRESHOLD=best_threshold_based_on_F1) logging.info("New evaluation scores:") logging.info(f"F1: {F1}") logging.info(f"Precision: {P}") logging.info(f"Recall: {R}") logging.info(f"True Positive: {TP}") logging.info(f"False Positive: {FP}") logging.info(f"False Negative: {FN}") results["F1"] = F1 results["P"] = P results["R"] = R results["TP"] = TP results["FP"] = FP results["FN"] = FN N = TP + FN results["N"] = N # Top predictions in the Test case sorted_prediction_ids = np.argsort(-test_Y_prediction_probs) K = 30 logging.info("Top {} predictions:".format(K)) for i in range(K): instance_id = sorted_prediction_ids[i] # text :: candidate_chunk :: candidate_chunk_id :: chunk_start_text_id :: chunk_end_text_id :: tokenized_tweet :: tokenized_tweet_with_masked_q_token :: tagged_chunks :: question_label list_to_print = [ test_data[instance_id][0], test_data[instance_id][6], test_data[instance_id][1], str(test_Y_prediction_probs[instance_id]), str(test_Y[instance_id]), str(test_data[instance_id][-1]), str(test_data[instance_id][-2]) ] logging.info("\t".join(list_to_print)) # Top feature analysis coefs = lr.coef_[0] K = 10 sorted_feature_ids = np.argsort(-coefs) logging.info("Top {} features:".format(K)) for i in range(K): feature_id = sorted_feature_ids[i] logging.info(f"{i2feature[feature_id]}\t{coefs[feature_id]}") # Plot the precision recall curve save_figure_file = os.path.join(args.output_dir, "Precision Recall Curve.png") logging.info(f"Saving precision recall curve at {save_figure_file}") disp = plot_precision_recall_curve(lr, test_X, test_Y) disp.ax_.set_title('2-class Precision-Recall curve') disp.ax_.figure.savefig(save_figure_file) # Save the model and features in pickle file model_and_features_save_file = os.path.join(args.output_dir, "model_and_features.pkl") logging.info( f"Saving LR model and features at {model_and_features_save_file}") save_in_pickle((lr, feature2i, i2feature), model_and_features_save_file) # Save model_config and results model_config_file = os.path.join(args.output_dir, "model_config.json") 
results_file = os.path.join(args.output_dir, "results.json") logging.info(f"Saving model config at {model_config_file}") save_in_json(model_config, model_config_file) logging.info(f"Saving results at {results_file}") save_in_json(results, results_file)
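# ---------------------------------------------------------------------------
# Rough sketch of the two n-gram helpers that the logistic-regression baseline
# above depends on. This is assumed behaviour rather than the repository's
# code: it reads the tokenized tweet from field 6 and the binary question
# label from the last field, matching the field layout documented in the
# top-prediction comment above, and builds binary unigram features.
from collections import Counter
import numpy as np

def create_ngram_features_from_sketch(train_data, min_count=2):
    counts = Counter()
    for instance in train_data:
        counts.update(instance[6].split())          # assumed tokenized-tweet field
    vocab = sorted(tok for tok, c in counts.items() if c >= min_count)
    feature2i = {tok: i for i, tok in enumerate(vocab)}
    i2feature = {i: tok for tok, i in feature2i.items()}
    return feature2i, i2feature

def convert_data_to_feature_vector_and_labels_sketch(data, feature2i):
    X = np.zeros((len(data), len(feature2i)), dtype=np.float32)
    Y = []
    for row, instance in enumerate(data):
        for tok in instance[6].split():
            if tok in feature2i:
                X[row, feature2i[tok]] = 1.0        # binary presence feature
        Y.append(int(instance[-1]))                 # assumed binary gold label
    return X, Y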
def __call__(self, features, start_logits, end_logits, token, f):
    # Group examples that share the same question id so they are scored together
    problem_features = {}
    start_problem_logit = {}
    end_problem_logit = {}
    for num, (feature, start_logit, end_logit) in enumerate(
            zip(features, start_logits, end_logits)):
        if feature.id not in problem_features.keys():
            problem_features[feature.id] = [feature]
            start_problem_logit[feature.id] = [start_logit]
            end_problem_logit[feature.id] = [end_logit]
        else:
            problem_features[feature.id].append(feature)
            start_problem_logit[feature.id].append(start_logit)
            end_problem_logit[feature.id].append(end_logit)
    for key in tqdm(problem_features.keys()):
        features = problem_features[key]
        start_logits = start_problem_logit[key]
        end_logits = end_problem_logit[key]
        answer_text = features[0].answer
        context = features[0].context
        score_null = 100000
        prelim_predictions = []
        # Following the original code's comments, this tracks the minimum start+end null score
        min_null_score_index = 0  # segment index with the smallest null score
        null_start_logit = 0
        null_end_logit = 0  # these record the start/end logits at that minimum
        for index, (feature, start_logit, end_logit) in enumerate(
                list(zip(features, start_logits, end_logits))):
            start_index = get_nbest(start_logit, 20)
            end_index = get_nbest(end_logit, 20)
            for start in start_index:
                for end in end_index:
                    if start >= len(feature.token):
                        continue
                    if end >= len(feature.token):
                        continue
                    if start not in feature.token_to_orgs:
                        # these two checks skip positions that cannot be mapped back to the original text
                        continue
                    if end not in feature.token_to_orgs:
                        continue
                    if not feature.is_max_context.get(start, False):
                        # keep the span only if this segment gives it maximum context
                        # (the original code is very careful about this)
                        continue
                    if start > end:
                        continue
                    prelim_predictions.append([
                        index, start, end, start_logit[start], end_logit[end]
                    ])
                    # collect every candidate span for this question to prepare for the ranking below
        prelim_predictions = sorted(prelim_predictions,
                                    key=lambda x: (x[3] + x[4]),
                                    reverse=True)
        seen_predictions = {}
        n_best = []
        for each in prelim_predictions:
            if len(n_best) >= 20:
                break
            if each[1] > 0:
                feature = features[each[0]]
                org_doc_start = feature.token_to_orgs[each[1]]
                orig_doc_end = feature.token_to_orgs[each[2]]
                tok_text = feature.token[each[1]:each[2] + 1]
                orig_tokens = context[org_doc_start:orig_doc_end + 1]
                tok_text = token.convert_tokens_to_string(tok_text)
                orig_text = token.convert_tokens_to_string(orig_tokens)
                tok_text = " ".join(tok_text.lower().strip().split())
                orig_text = " ".join(orig_text.lower().strip().split())
                # haven't reviewed get_final_text in detail yet; reusing the original implementation
                final_text = get_final_text(tok_text, orig_text, True, False, token)
                if final_text in seen_predictions:
                    continue
                seen_predictions[final_text] = True  # deduplicate repeated predictions
            else:
                final_text = ""
                seen_predictions[""] = True
            n_best.append([final_text, each[3], each[4]])
        if not n_best:
            n_best.append(["empty", 0.0, 0.0])
        total_score = []
        best_non_null_entry = None
        for entry in n_best:
            total_score.append(entry[1] + entry[2])
            if not best_non_null_entry and entry[0]:
                best_non_null_entry = entry
        probs = _compute_softmax(total_score)
        nbest_json = []
        predictions = []
        f1 = 0
        exact = 0
        predictions = n_best[0][0]
        f1, exact = get_raw_scores(answer_text, predictions, token)
        f.write("answer:" + str(answer_text) + "\n")
        f.write("predictions:" + str(predictions) + "\n")
        f.write("f1:" + str(f1) + "\n")
        f.write("exact:" + str(exact) + "\n")
        self.f1.append(f1)
        self.extract.append(exact)
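# ---------------------------------------------------------------------------
# Hedged sketches of the two small helpers used above. get_nbest() is assumed
# to return the indices of the n largest logits; _compute_softmax() follows
# the standard SQuAD utility. Neither is copied from the original source.
import math

def get_nbest_sketch(logits, n_best_size):
    # Indices of the n_best_size highest-scoring positions, best first.
    indexed = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
    return [idx for idx, _ in indexed[:n_best_size]]

def _compute_softmax_sketch(scores):
    # Numerically stable softmax over a plain Python list of scores.
    if not scores:
        return []
    max_score = max(scores)
    exps = [math.exp(s - max_score) for s in scores]
    total = sum(exps)
    return [e / total for e in exps]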
def write_predictions(all_examples, all_features, all_results, n_best_size,
                      output_prediction_file, output_nbest_file,
                      output_null_log_odds_file, orig_data, config):
    print("writing predictions...")
    # print("Writing predictions to: %s" % (output_prediction_file))
    example_index_to_features = collections.defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)
    unique_id_to_result = {}
    for result in all_results:
        unique_id_to_result[result.unique_id] = result
    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    scores_diff_json = collections.OrderedDict()
    for (example_index, example) in enumerate(all_examples):
        features = example_index_to_features[example_index]
        prelim_predictions = []
        # keep track of the minimum score of null start+end of position 0
        score_null = 1000000  # large and positive
        for (feature_index, feature) in enumerate(features):
            result = unique_id_to_result[feature.unique_id]
            cur_null_score = result.cls_logits
            # if we could have irrelevant answers, get the min score of irrelevant
            score_null = min(score_null, cur_null_score)
            for i in range(config.start_n_top):
                for j in range(config.end_n_top):
                    start_log_prob = result.start_top_log_probs[i]
                    start_index = result.start_top_index[i]
                    j_index = i * config.end_n_top + j
                    end_log_prob = result.end_top_log_probs[j_index]
                    end_index = result.end_top_index[j_index]
                    # We could hypothetically create invalid predictions, e.g., predict
                    # that the start of the span is in the question. We throw out all
                    # invalid predictions.
                    if start_index >= feature.paragraph_len - 1:
                        continue
                    if end_index >= feature.paragraph_len - 1:
                        continue
                    if not feature.token_is_max_context.get(start_index, False):
                        continue
                    if end_index < start_index:
                        continue
                    prelim_predictions.append(
                        _PrelimPrediction(feature_index=feature_index,
                                          start_index=start_index,
                                          end_index=end_index,
                                          start_log_prob=start_log_prob,
                                          end_log_prob=end_log_prob))
        prelim_predictions = sorted(
            prelim_predictions,
            key=lambda x: (x.start_log_prob + x.end_log_prob),
            reverse=True)
        seen_predictions = {}
        nbest = []
        for pred in prelim_predictions:
            if len(nbest) >= n_best_size:
                break
            feature = features[pred.feature_index]
            tok_start_to_orig_index = feature.tok_start_to_orig_index
            tok_end_to_orig_index = feature.tok_end_to_orig_index
            start_orig_pos = tok_start_to_orig_index[pred.start_index]
            end_orig_pos = tok_end_to_orig_index[pred.end_index]
            paragraph_text = example.paragraph_text
            final_text = paragraph_text[start_orig_pos:end_orig_pos + 1].strip()
            if final_text in seen_predictions:
                continue
            seen_predictions[final_text] = True
            nbest.append(
                _NbestPrediction(text=final_text,
                                 start_log_prob=pred.start_log_prob,
                                 end_log_prob=pred.end_log_prob))
        # In very rare edge cases we could have no valid predictions. So we
        # just create a nonce prediction in this case to avoid failure.
        if not nbest:
            nbest.append(
                _NbestPrediction(text="", start_log_prob=-1e6, end_log_prob=-1e6))
        total_scores = []
        best_non_null_entry = None
        for entry in nbest:
            total_scores.append(entry.start_log_prob + entry.end_log_prob)
            if not best_non_null_entry:
                best_non_null_entry = entry
        probs = _compute_softmax(total_scores)
        nbest_json = []
        for (i, entry) in enumerate(nbest):
            output = collections.OrderedDict()
            output["text"] = entry.text
            output["probability"] = probs[i]
            output["start_log_prob"] = entry.start_log_prob
            output["end_log_prob"] = entry.end_log_prob
            nbest_json.append(output)
        assert len(nbest_json) >= 1
        assert best_non_null_entry is not None
        score_diff = score_null
        scores_diff_json[example.qas_id] = score_diff
        # note(zhiliny): always predict best_non_null_entry
        # and the evaluation script will search for the best threshold
        all_predictions[example.qas_id] = best_non_null_entry.text
        all_nbest_json[example.qas_id] = nbest_json
    with tf.io.gfile.GFile(output_prediction_file, "w") as writer:
        writer.write(json.dumps(all_predictions, indent=4) + "\n")
    with tf.io.gfile.GFile(output_nbest_file, "w") as writer:
        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
    with tf.io.gfile.GFile(output_null_log_odds_file, "w") as writer:
        writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
    qid_to_has_ans = utils.make_qid_to_has_ans(orig_data)
    exact_raw, f1_raw = utils.get_raw_scores(orig_data, all_predictions)
    out_eval = {}
    utils.find_all_best_thresh_v2(out_eval, all_predictions, exact_raw, f1_raw,
                                  scores_diff_json, qid_to_has_ans)
    return out_eval
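# ---------------------------------------------------------------------------
# The two lightweight containers write_predictions() builds. The field names
# follow directly from the keyword arguments used above and match the usual
# XLNet/SQuAD utilities; they are restated here for completeness in case the
# original module defines them elsewhere.
import collections

_PrelimPrediction = collections.namedtuple(
    "PrelimPrediction",
    ["feature_index", "start_index", "end_index", "start_log_prob", "end_log_prob"])

_NbestPrediction = collections.namedtuple(
    "NbestPrediction", ["text", "start_log_prob", "end_log_prob"])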
def main(): # Read all the data instances task_instances_dict, tag_statistics, question_keys_and_tags = load_from_pickle( args.data_file) data, subtasks_list = get_multitask_instances_for_valid_tasks( task_instances_dict, tag_statistics) data = add_marker_for_loss_ignore( data, 1.0 if args.loss_for_no_consensus else 0.0) if args.retrain: if args.large_bert: model_name = "bert-large-cased" elif args.covid_bert: model_name = "digitalepidemiologylab/covid-twitter-bert" else: model_name = "bert-base-cased" logging.info("Creating and training the model from '" + model_name + "'") # Create the save_directory if not exists make_dir_if_not_exists(args.save_directory) # Initialize tokenizer and model with pretrained weights tokenizer = BertTokenizer.from_pretrained(model_name) config = BertConfig.from_pretrained(model_name) config.subtasks = subtasks_list model = MultiTaskBertForCovidEntityClassification.from_pretrained( model_name, config=config) # Add new tokens in tokenizer new_special_tokens_dict = { "additional_special_tokens": ["<E>", "</E>", "<URL>", "@USER"] } tokenizer.add_special_tokens(new_special_tokens_dict) # Add the new embeddings in the weights print("Embeddings type:", model.bert.embeddings.word_embeddings.weight.data.type()) print("Embeddings shape:", model.bert.embeddings.word_embeddings.weight.data.size()) embedding_size = model.bert.embeddings.word_embeddings.weight.size(1) new_embeddings = torch.FloatTensor( len(new_special_tokens_dict["additional_special_tokens"]), embedding_size).uniform_(-0.1, 0.1) # new_embeddings = torch.FloatTensor(2, embedding_size).uniform_(-0.1, 0.1) print("new_embeddings shape:", new_embeddings.size()) new_embedding_weight = torch.cat( (model.bert.embeddings.word_embeddings.weight.data, new_embeddings), 0) model.bert.embeddings.word_embeddings.weight.data = new_embedding_weight print("Embeddings shape:", model.bert.embeddings.word_embeddings.weight.data.size()) # Update model config vocab size model.config.vocab_size = model.config.vocab_size + len( new_special_tokens_dict["additional_special_tokens"]) else: # Load the tokenizer and model from the save_directory tokenizer = BertTokenizer.from_pretrained(args.save_directory) model = MultiTaskBertForCovidEntityClassification.from_pretrained( args.save_directory) # Load from individual state dicts for subtask in model.subtasks: model.classifiers[subtask].load_state_dict( torch.load( os.path.join(args.save_directory, f"{subtask}_classifier.bin"))) model.to(device) if args.wandb: wandb.watch(model) # Explicitly move the classifiers to device for subtask, classifier in model.classifiers.items(): classifier.to(device) for subtask, classifier in model.context_vectors.items(): classifier.to(device) entity_start_token_id = tokenizer.convert_tokens_to_ids(["<E>"])[0] entity_end_token_id = tokenizer.convert_tokens_to_ids(["</E>"])[0] logging.info( f"Task dataset for task: {args.task} loaded from {args.data_file}.") model_config = dict() results = dict() # Split the data into train, dev and test and shuffle the train segment train_data, dev_data = split_multitask_instances_in_train_dev(data) random.shuffle(train_data) # shuffle happens in-place logging.info("Train Data:") total_train_size, pos_subtasks_train_size, neg_subtasks_train_size = log_multitask_data_statistics( train_data, model.subtasks) logging.info("Dev Data:") total_dev_size, pos_subtasks_dev_size, neg_subtasks_dev_size = log_multitask_data_statistics( dev_data, model.subtasks) #logging.info("Test Data:") #total_test_size, pos_subtasks_test_size, 
neg_subtasks_test_size = log_multitask_data_statistics(test_data, model.subtasks) logging.info("\n") model_config["train_data"] = { "size": total_train_size, "pos": pos_subtasks_train_size, "neg": neg_subtasks_train_size } model_config["dev_data"] = { "size": total_dev_size, "pos": pos_subtasks_dev_size, "neg": neg_subtasks_dev_size } #model_config["test_data"] = {"size":total_test_size, "pos":pos_subtasks_test_size, "neg":neg_subtasks_test_size} # Extract subtasks data for dev and test train_subtasks_data = split_data_based_on_subtasks(train_data, model.subtasks) dev_subtasks_data = split_data_based_on_subtasks(dev_data, model.subtasks) #test_subtasks_data = split_data_based_on_subtasks(test_data, model.subtasks) # Load the instances into pytorch dataset train_dataset = COVID19TaskDataset(train_data) dev_dataset = COVID19TaskDataset(dev_data) #test_dataset = COVID19TaskDataset(test_data) logging.info("Loaded the datasets into Pytorch datasets") tokenize_collator = TokenizeCollator(tokenizer, model.subtasks, entity_start_token_id, entity_end_token_id) train_dataloader = DataLoader(train_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=True, num_workers=0, collate_fn=tokenize_collator) dev_dataloader = DataLoader(dev_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=tokenize_collator) #test_dataloader = DataLoader(test_dataset, batch_size=POSSIBLE_BATCH_SIZE, shuffle=False, num_workers=0, collate_fn=tokenize_collator) logging.info("Created train and test dataloaders with batch aggregation") # Only retrain if needed if args.retrain: optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8) logging.info("Created model optimizer") #if args.sentence_level_classify: # args.n_epochs += 2 epochs = args.n_epochs # Total number of training steps is [number of batches] x [number of epochs]. total_steps = len(train_dataloader) * epochs # Create the learning rate scheduler. # NOTE: num_warmup_steps = 0 is the Default value in run_glue.py scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=total_steps) # We'll store a number of quantities such as training and validation loss, validation accuracy, and timings. training_stats = [] print("\n\n\n ====== Training for task", args.task, "=============\n\n\n") logging.info(f"Initiating training loop for {args.n_epochs} epochs...") print(model.state_dict().keys()) total_start_time = time.time() # Find the accumulation steps accumulation_steps = args.batch_size / POSSIBLE_BATCH_SIZE # Dev validation trajectory epoch_train_loss = list() train_subtasks_validation_statistics = { subtask: list() for subtask in model.subtasks } dev_subtasks_validation_statistics = { subtask: list() for subtask in model.subtasks } best_dev_F1 = 0 for epoch in range(epochs): logging.info(f"Initiating Epoch {epoch+1}:") # Reset the total loss for each epoch. 
total_train_loss = 0 train_loss_trajectory = list() # Reset timer for each epoch start_time = time.time() model.train() dev_log_frequency = 5 n_steps = len(train_dataloader) dev_steps = int(n_steps / dev_log_frequency) for step, batch in enumerate(train_dataloader): # Upload labels of each subtask to device for subtask in model.subtasks: subtask_labels = batch["gold_labels"][subtask] subtask_labels = subtask_labels.to(device) batch["gold_labels"][subtask] = subtask_labels batch["label_ignore_loss"][subtask] = batch[ "label_ignore_loss"][subtask].to(device) # Forward input_dict = { "input_ids": batch["input_ids"].to(device), "entity_start_positions": batch["entity_start_positions"].to(device), "entity_end_positions": batch["entity_end_positions"].to(device), "labels": batch["gold_labels"], "label_weight": batch["label_ignore_loss"] } input_ids = batch["input_ids"] entity_start_positions = batch["entity_start_positions"] gold_labels = batch["gold_labels"] batch_data = batch["batch_data"] loss, logits = model(**input_dict) # Accumulate loss total_train_loss += loss.item() # Backward: compute gradients loss.backward() if (step + 1) % accumulation_steps == 0: # Calculate elapsed time in minutes and print loss on the tqdm bar elapsed = format_time(time.time() - start_time) avg_train_loss = total_train_loss / (step + 1) # keep track of changing avg_train_loss train_loss_trajectory.append(avg_train_loss) if (step + 1) % (accumulation_steps * 20) == 0: print( f"Epoch:{epoch+1}|Batch:{step}/{len(train_dataloader)}|Time:{elapsed}|Avg. Loss:{avg_train_loss:.4f}|Loss:{loss.item():.4f}" ) torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) optimizer.step() # Clean the model's previous gradients model.zero_grad() scheduler.step() # Calculate the average loss over all of the batches. avg_train_loss = total_train_loss / len(train_dataloader) # Perform validation with the model and log the performance print("\n") logging.info("Running Validation...") # Put the model in evaluation mode--the dropout layers behave differently during evaluation. 
model.eval() dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset( dev_dataloader, model, device, args.task + "_dev", True) wandb_log_dict = {"Train Loss": avg_train_loss} print("Dev Set:") collect_TP_FP_FN = {"TP": 0, "FP": 0, "FN": 0} for subtask in model.subtasks: dev_subtask_data = dev_subtasks_data[subtask] dev_subtask_prediction_scores = dev_prediction_scores[subtask] dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN( dev_subtask_data, dev_subtask_prediction_scores, task=subtask) if subtask not in IGNORE_TASKS: collect_TP_FP_FN["TP"] += dev_TP collect_TP_FP_FN["FP"] += dev_FP collect_TP_FP_FN["FN"] += dev_FN else: print("IGNORE: ", end="") print( f"Subtask:{subtask:>15}\tN={dev_TP + dev_FN}\tF1={dev_F1}\tP={dev_P}\tR={dev_R}\tTP={dev_TP}\tFP={dev_FP}\tFN={dev_FN}" ) dev_subtasks_validation_statistics[subtask].append( (epoch + 1, step + 1, dev_TP + dev_FN, dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN)) wandb_log_dict["Dev_ " + subtask + "_F1"] = dev_F1 wandb_log_dict["Dev_ " + subtask + "_P"] = dev_P wandb_log_dict["Dev_ " + subtask + "_R"] = dev_R dev_macro_P = collect_TP_FP_FN["TP"] / (collect_TP_FP_FN["TP"] + collect_TP_FP_FN["FP"]) dev_macro_R = collect_TP_FP_FN["TP"] / (collect_TP_FP_FN["TP"] + collect_TP_FP_FN["FN"]) dev_macro_F1 = (2 * dev_macro_P * dev_macro_R) / (dev_macro_P + dev_macro_R) print(collect_TP_FP_FN) print("dev_macro_P:", dev_macro_P, "\ndev_macro_R:", dev_macro_R, "\ndev_macro_F1:", dev_macro_F1, "\n") wandb_log_dict["Dev_macro_F1"] = dev_macro_F1 wandb_log_dict["Dev_macro_P"] = dev_macro_P wandb_log_dict["Dev_macro_R"] = dev_macro_R if args.wandb: wandb.log(wandb_log_dict) if dev_macro_F1 > best_dev_F1: best_dev_F1 = dev_macro_F1 print("NEW BEST F1:", best_dev_F1, " Saving checkpoint now.") torch.save(model.state_dict(), args.output_dir + "/ckpt.pth") #print(model.state_dict().keys()) #model.save_pretrained(args.save_directory) model.train() training_time = format_time(time.time() - start_time) # Record all statistics from this epoch. training_stats.append({ 'epoch': epoch + 1, 'Training Loss': avg_train_loss, 'Training Time': training_time }) # Save the loss trajectory epoch_train_loss.append(train_loss_trajectory) print("\n\n") logging.info( f"Training complete with total Train time:{format_time(time.time()- total_start_time)}" ) log_list(training_stats) model.load_state_dict(torch.load(args.output_dir + "/ckpt.pth")) model.eval() # Save the model and the Tokenizer here: #logging.info(f"Saving the model and tokenizer in {args.save_directory}") #model.save_pretrained(args.save_directory) # Save each subtask classifiers weights to individual state dicts #for subtask, classifier in model.classifiers.items(): # classifier_save_file = os.path.join(args.save_directory, f"{subtask}_classifier.bin") # logging.info(f"Saving the model's {subtask} classifier weights at {classifier_save_file}") # torch.save(classifier.state_dict(), classifier_save_file) #tokenizer.save_pretrained(args.save_directory) # Plot the train loss trajectory in a plot #train_loss_trajectory_plot_file = os.path.join(args.output_dir, "train_loss_trajectory.png") #logging.info(f"Saving the Train loss trajectory at {train_loss_trajectory_plot_file}") #print(epoch_train_loss) # TODO: Plot the validation performance # Save dev_subtasks_validation_statistics else: raise logging.info("No training needed. 
Directly going to evaluation!") # Save the model name in the model_config file model_config["model"] = "MultiTaskBertForCovidEntityClassification" model_config["epochs"] = args.n_epochs # Find best threshold for each subtask based on dev set performance thresholds = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] #test_predicted_labels, test_prediction_scores, test_gold_labels = make_predictions_on_dataset(test_dataloader, model, device, args.task, True) dev_predicted_labels, dev_prediction_scores, dev_gold_labels = make_predictions_on_dataset( dev_dataloader, model, device, args.task + "_dev", True) best_test_thresholds = {subtask: 0.5 for subtask in model.subtasks} best_dev_thresholds = {subtask: 0.5 for subtask in model.subtasks} best_test_F1s = {subtask: 0.0 for subtask in model.subtasks} best_dev_F1s = {subtask: 0.0 for subtask in model.subtasks} #test_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks} dev_subtasks_t_F1_P_Rs = {subtask: list() for subtask in model.subtasks} for subtask in model.subtasks: dev_subtask_data = dev_subtasks_data[subtask] dev_subtask_prediction_scores = dev_prediction_scores[subtask] for t in thresholds: dev_F1, dev_P, dev_R, dev_TP, dev_FP, dev_FN = get_TP_FP_FN( dev_subtask_data, dev_subtask_prediction_scores, THRESHOLD=t, task=subtask) dev_subtasks_t_F1_P_Rs[subtask].append( (t, dev_F1, dev_P, dev_R, dev_TP + dev_FN, dev_TP, dev_FP, dev_FN)) if dev_F1 > best_dev_F1s[subtask]: best_dev_thresholds[subtask] = t best_dev_F1s[subtask] = dev_F1 logging.info(f"Subtask:{subtask:>15}") log_list(dev_subtasks_t_F1_P_Rs[subtask]) logging.info( f"Best Dev Threshold for subtask: {best_dev_thresholds[subtask]}\t Best dev F1: {best_dev_F1s[subtask]}" ) # Save the best dev threshold and dev_F1 in results dict results["best_dev_threshold"] = best_dev_thresholds results["best_dev_F1s"] = best_dev_F1s results["dev_t_F1_P_Rs"] = dev_subtasks_t_F1_P_Rs # Evaluate on Test logging.info("Testing on eval dataset") predicted_labels, prediction_scores, gold_labels = make_predictions_on_dataset( dev_dataloader, model, device, args.task) # Test for subtask in model.subtasks: logging.info(f"\nTesting the trained classifier on subtask: {subtask}") results[subtask] = dict() cm = metrics.confusion_matrix(gold_labels[subtask], predicted_labels[subtask]) classification_report = metrics.classification_report( gold_labels[subtask], predicted_labels[subtask], output_dict=True) logging.info(cm) logging.info( metrics.classification_report(gold_labels[subtask], predicted_labels[subtask])) results[subtask]["CM"] = cm.tolist( ) # Storing it as list of lists instead of numpy.ndarray results[subtask]["Classification Report"] = classification_report # SQuAD style EM and F1 evaluation for all test cases and for positive test cases (i.e. for cases where annotators had a gold annotation) EM_score, F1_score, total = get_raw_scores(dev_subtasks_data[subtask], prediction_scores[subtask]) logging.info("Word overlap based SQuAD evaluation style metrics:") logging.info(f"Total number of cases: {total}") logging.info(f"EM_score: {EM_score}") logging.info(f"F1_score: {F1_score}") results[subtask]["SQuAD_EM"] = EM_score results[subtask]["SQuAD_F1"] = F1_score results[subtask]["SQuAD_total"] = total pos_EM_score, pos_F1_score, pos_total = get_raw_scores( dev_subtasks_data[subtask], prediction_scores[subtask], positive_only=True) logging.info(f"Total number of Positive cases: {pos_total}") logging.info(f"Pos. EM_score: {pos_EM_score}") logging.info(f"Pos. 
F1_score: {pos_F1_score}") results[subtask]["SQuAD_Pos. EM"] = pos_EM_score results[subtask]["SQuAD_Pos. F1"] = pos_F1_score results[subtask]["SQuAD_Pos. EM_F1_total"] = pos_total # New evaluation suggested by Alan F1, P, R, TP, FP, FN = get_TP_FP_FN( dev_subtasks_data[subtask], prediction_scores[subtask], THRESHOLD=best_dev_thresholds[subtask], task=subtask) logging.info("New evaluation scores:") logging.info(f"F1: {F1}") logging.info(f"Precision: {P}") logging.info(f"Recall: {R}") logging.info(f"True Positive: {TP}") logging.info(f"False Positive: {FP}") logging.info(f"False Negative: {FN}") results[subtask]["F1"] = F1 results[subtask]["P"] = P results[subtask]["R"] = R results[subtask]["TP"] = TP results[subtask]["FP"] = FP results[subtask]["FN"] = FN N = TP + FN results[subtask]["N"] = N # Save model_config and results model_config_file = os.path.join(args.output_dir, "model_config.json") results_file = os.path.join(args.output_dir, "results.json") logging.info(f"Saving model config at {model_config_file}") save_in_json(model_config, model_config_file) logging.info(f"Saving results at {results_file}") save_in_json(results, results_file)
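# ---------------------------------------------------------------------------
# Illustrative sketch (an assumption about the model internals, not the
# original forward pass) of how entity_start_positions can be used to pull
# the <E> token representation out of BERT's sequence output before the
# per-subtask classifiers are applied. It assumes entity_start_positions is a
# (batch, 2) tensor of [row_index, token_position] pairs, mirroring how the
# positions are passed in the input_dict above.
import torch

def gather_entity_representations_sketch(sequence_output, entity_start_positions):
    # sequence_output: (batch, seq_len, hidden)
    # returns: (batch, hidden) -- the hidden state at each example's <E> token
    return sequence_output[entity_start_positions[:, 0],
                           entity_start_positions[:, 1], :]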