def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2):
    nll_meter = util.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            cc_idxs = cc_idxs.to(device)
            qc_idxs = qc_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cc_idxs, qc_idxs, cw_idxs, qw_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict,
                                           ids.tolist(),
                                           starts.tolist(),
                                           ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg),
                    ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict
def evaluate(model, data_loader, device, eval_file):
    nll_meter = util.AverageMeter()

    model.eval()
    pred_dict = dict()
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        model.module.set_eval_data(gold_dict)  # pass eval_data as model state
        for cw_idxs, cc_idxs, \
                qw_idxs, qc_idxs, \
                start_idxs, end_idxs, \
                counts, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            cc_idxs = cc_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            qc_idxs = qc_idxs.to(device)
            start_idxs = start_idxs.to(device)
            end_idxs = end_idxs.to(device)
            counts = counts.to(device)
            ids = ids.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            output_dict = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs, ids,
                                start_idxs, end_idxs, counts)
            loss = output_dict['loss']
            nll_meter.update(loss.item(), batch_size)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)
            # Possibly wrong: I think this overwrites the predictions every time
            pred_dict.update(output_dict["predictions"])

        model.module.set_eval_data(None)

    model.train()

    eval_dict = eval_dicts(gold_dict, pred_dict)
    results_list = [('Loss', nll_meter.avg),
                    ('F1', eval_dict['F1']),
                    ('EM', eval_dict['EM'])]
    eval_dict = OrderedDict(results_list)

    return eval_dict, pred_dict
def evaluate_bert(args):
    eval_file_path: str = args.eval_file_path
    with open(eval_file_path, 'r') as fh:
        gold_dict = invert_golden(json_load(fh))

    result_file_path: str = args.result_file_path
    with open(result_file_path, 'r') as fh:
        pred_dict = json_load(fh)

    use_squad_v2: bool = True
    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)

    if args.write_predictions_csv:
        path, filename = os.path.split(eval_file_path)
        filename = ".".join(filename.split(".")[:-1])
        sub_path = os.path.join(path, filename) + ".csv"
        print("Saving results to: %s" % sub_path)
        with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
            csv_writer = csv.writer(csv_fh, delimiter=',')
            csv_writer.writerow(['Id', 'Predicted'])
            for uuid in sorted(pred_dict):
                csv_writer.writerow([uuid, pred_dict[uuid]])

    return results
def test_model():
    args = get_setup_args()
    word_vectors = util.torch_from_json(args.word_emb_file)
    with open(args.char2idx_file, "r") as f:
        char2idx = json_load(f)
    model = QANet(word_vectors, char2idx)

    cw_idxs = torch.randint(2, 1000, (64, 374))
    cc_idxs = torch.randint(2, 50, (64, 374, 200))
    qw_idxs = torch.randint(2, 1000, (64, 70))
    qc_idxs = torch.randint(2, 50, (64, 70, 200))
    cw_idxs[:, 0] = 1
    cw_idxs[3, -1] = 0
    qw_idxs[:, 0] = 1
    qw_idxs[3, -1] = 0

    out = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
    print(out)
def inference(model):
    # Get data loader
    record_file = vars(args)[f'{args.split}_record_file']
    dataset = SQuAD(record_file, args.use_squad_v2)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Evaluate
    nll_meter = util.AverageMeter()
    # pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    eval_file = vars(args)[f'{args.split}_eval_file']
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            cc_idxs = cc_idxs.to(device)
            qc_idxs = qc_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                      ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            # pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    return sub_dict
def test_input_embedding():
    args = get_setup_args()
    d_char = 200
    word_dropout = 0.1
    char_dropout = 0.05
    with open(args.char2idx_file, "r") as f:
        char2idx = json_load(f)
    hidden_size = 500
    highway_dropout = 0.1
    word_vectors = util.torch_from_json(args.word_emb_file)

    input_embedding = InputEmbedding(word_vectors, d_char, char2idx, hidden_size,
                                     word_dropout, char_dropout, highway_dropout)
    word_inputs = torch.tensor([[1, 2, 0], [1, 2, 4]], dtype=torch.long)
    char_inputs = torch.tensor([[[1, 2, 2, 0], [1, 3, 2, 3], [0, 0, 0, 0]],
                                [[1, 5, 2, 0], [1, 3, 6, 3], [3, 4, 2, 1]]],
                               dtype=torch.long)
    emb = input_embedding(word_inputs, char_inputs)

    # Dump the embedding for inspection; a context manager ensures the file is closed.
    with open('input_emb.pickle', 'wb') as pickle_out:
        pickle.dump(emb, pickle_out)

    assert emb.size() == (2, 3, 500)
    return emb
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2):
    # NOTE: the original body was a stub full of TODOs (undefined `featues`, `y`,
    # `loss_fn`). The forward/loss/prediction steps below mirror the other
    # evaluate() variants in this collection; the model signature that also takes
    # the dependency-path tensors is an assumption, and max_len / use_squad_v2 are
    # added as parameters because the stub referenced use_squad_v2 without defining it.
    nll_meter = util.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cw_dep_paths, cc_idxs, qw_idxs, qw_dep_paths, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            cw_dep_paths = cw_dep_paths.to(device)
            cc_idxs = cc_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            qw_dep_paths = qw_dep_paths.to(device)
            qc_idxs = qc_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cw_idxs, cw_dep_paths, cc_idxs,
                                   qw_idxs, qw_dep_paths, qc_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict,
                                           ids.tolist(),
                                           starts.tolist(),
                                           ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg),
                    ('F1', results['F1']),
                    ('EM', results['EM'])]
    results = OrderedDict(results_list)

    return results, pred_dict
from ujson import dump
from ujson import load as json_load

import numpy as np

with open('data/word2idx.json', 'r') as fh:
    word2idx = json_load(fh)
with open('data/char2idx.json', 'r') as fh:
    char2idx = json_load(fh)
with open('data/char_emb.json', 'r') as fh:
    char_emb = json_load(fh)

# Build one character-level vector per word by summing the embeddings of its characters.
char_emb_for_word = {}
for word in word2idx:
    if word not in char_emb_for_word:
        wid = word2idx[word]
        emb = None
        for c in word:
            if emb is None:
                emb = np.array(char_emb[char2idx[c]])
            else:
                emb += np.array(char_emb[char2idx[c]])
        # ujson cannot serialize numpy arrays, so store a plain list
        # (or None for words that yielded no character vector).
        char_emb_for_word[int(wid)] = emb.tolist() if emb is not None else None

with open("data/char_emb_for_word.json", "w") as ujson_file:
    dump(char_emb_for_word, ujson_file)
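# The script above stores one summed character vector per word index. Below is a
# hedged sketch of how the resulting file could be reloaded into a dense matrix,
# e.g. to back an auxiliary embedding lookup; the zero-row fallback for empty
# entries and the torch conversion are assumptions, not part of the original pipeline.
import numpy as np
import torch
from ujson import load as json_load

with open('data/char_emb_for_word.json', 'r') as fh:
    char_emb_for_word = json_load(fh)  # {word_idx (as str): list[float] or None}

# Infer the embedding size from the first non-empty entry.
emb_dim = next(len(vec) for vec in char_emb_for_word.values() if vec)
num_words = max(int(idx) for idx in char_emb_for_word) + 1

# Words that produced no usable character vector keep an all-zero row.
matrix = np.zeros((num_words, emb_dim), dtype=np.float32)
for idx, vec in char_emb_for_word.items():
    if vec:
        matrix[int(idx)] = vec

# Could then initialize e.g. torch.nn.Embedding.from_pretrained(char_word_vectors).
char_word_vectors = torch.from_numpy(matrix)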
def main(args): # Set up logging args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False) log = util.get_logger(args.save_dir, args.name) log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') device, gpu_ids = util.get_available_devices() args.batch_size *= max(1, len(gpu_ids)) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) # Get model log.info('Building model...') if args.model == 'bidaf': model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size) elif args.model == 'bidafextra': model = BiDAFExtra(word_vectors=word_vectors, args=args) elif args.model == 'fusionnet': model = FusionNet(word_vectors=word_vectors, args=args) model = nn.DataParallel(model, gpu_ids) log.info(f'Loading checkpoint from {args.load_path}...') model = util.load_model(model, args.load_path, gpu_ids, return_step=False) model = model.to(device) model.eval() # Get data loader log.info('Building dataset...') record_file = vars(args)[f'{args.split}_record_file'] dataset = SQuAD(record_file, args) data_loader = data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # print("*"*80) # print(len(dataset.question_idxs)) # for question_idx in dataset.question_idxs: # print(question_idx) # print("*" * 80) # print(self.question_idxs[question_idx]) # self.question_idxs[idx] # print("data_loader: ",data_loader) # Evaluate log.info(f'Evaluating on {args.split} split...') nll_meter = util.AverageMeter() pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission eval_file = vars(args)[f'{args.split}_eval_file'] with open(eval_file, 'r') as fh: gold_dict = json_load(fh) # create statistics # print("*"*80) # print(len(gold_dict)) # print(gold_dict['1']['question']) count_questions_type = defaultdict(lambda: 0) audit_trail_from_question_type = defaultdict(lambda: []) list_of_interrogative_pronouns = [ "what", "whose", "why", "which", "where", "when", "how", "who", "whom" ] for index in range(1, len(gold_dict)): # transform the question in lower case to simplify the analysis, thus losing the benefit of the capital letters # possibly indicating the position of the interrogative pronoun in the sentence. 
question_lower_case = gold_dict[str(index)]['question'].lower() list_question_lower_case_with_punctuation = question_lower_case.translate( {ord(i): " " for i in "'"}).split() # question_lower_case = [] for item in list_question_lower_case_with_punctuation: question_lower_case.append( item.translate({ord(i): "" for i in ",.<>!@£$%^&*()_-+=?"})) # defining a variable for the first word first_word_question_lower_case = question_lower_case[0] # defining variable for the second word second_word_question_lower_case = question_lower_case[1] # defining variable for the first and second word combined_first_and_second_words = first_word_question_lower_case + " " + second_word_question_lower_case #printing on the screen test for debugging purpose # Analyzing the sentence if first_word_question_lower_case in list_of_interrogative_pronouns: count_questions_type[first_word_question_lower_case] += 1 audit_trail_from_question_type[ first_word_question_lower_case].append(str(index)) # composed question starting by in elif first_word_question_lower_case == "in": if second_word_question_lower_case in list_of_interrogative_pronouns and second_word_question_lower_case != "whose": count_questions_type[combined_first_and_second_words] += 1 audit_trail_from_question_type[ combined_first_and_second_words].append(str(index)) else: pronoun = find_first_interrogative_pronoun( list_of_interrogative_pronouns, question_lower_case) count_questions_type[pronoun] += 1 audit_trail_from_question_type[pronoun].append(str(index)) # composed question starting by by elif first_word_question_lower_case == "by": if second_word_question_lower_case in list_of_interrogative_pronouns \ and second_word_question_lower_case !="whom"\ and second_word_question_lower_case !="which"\ and second_word_question_lower_case !="when"\ and second_word_question_lower_case !="how": count_questions_type[combined_first_and_second_words] += 1 audit_trail_from_question_type[ combined_first_and_second_words].append(str(index)) else: pronoun = find_first_interrogative_pronoun( list_of_interrogative_pronouns, question_lower_case) count_questions_type[pronoun] += 1 audit_trail_from_question_type[pronoun].append(str(index)) else: pronoun = find_first_interrogative_pronoun( list_of_interrogative_pronouns, question_lower_case) #if pronoun =="": # print(">>", question_lower_case) # print("@@@", gold_dict[str(index)]['question']) count_questions_type[pronoun] += 1 audit_trail_from_question_type[pronoun].append(str(index)) # if pronoun =="": # print(">>", question_lower_case.split()) # print() #if first_word_question_lower_case == "if": # print(">>", question_lower_case.split()) # print(count_questions_type) # if gold_dict[str(index)]['question'].lower().split()[0] == "in": # print(gold_dict[str(index)]['question']) reverse_dict_by_value = OrderedDict( sorted(count_questions_type.items(), key=lambda x: x[1])) # print(count_questions_type) total_questions = sum(count_questions_type.values()) # print(reverse_dict) #for k, v in reverse_dict_by_value.items(): # print( "%s: %s and in percentage: %s" % (k, v, 100*v/total_questions)) #print(audit_trail_from_question_type) # exit() with torch.no_grad(), \ tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, cw_pos, cw_ner, cw_freq, cqw_extra, y1, y2, ids in data_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) # Forward if args.model == 'bidaf': log_p1, log_p2 = model(cw_idxs, qw_idxs) else: log_p1, log_p2 = model(cw_idxs, 
qw_idxs, cw_pos, cw_ner, cw_freq, cqw_extra) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) nll_meter.update(loss.item(), batch_size) # Get F1 and EM scores p1, p2 = log_p1.exp(), log_p2.exp() starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2) # Log info progress_bar.update(batch_size) if args.split != 'test': # No labels for the test set, so NLL would be invalid progress_bar.set_postfix(NLL=nll_meter.avg) idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), args.use_squad_v2) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) # Log results (except for test set, since it does not come with labels) if args.split != 'test': results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2) # Printing information for questions without interrogative pronouns """" print("len(gold_dict): ", len(gold_dict)) print("len(pred_dict): ", len(pred_dict)) print("Is gold_dict.keys() identical to pred_dict.keys(): ", gold_dict.keys()==pred_dict.keys()) if gold_dict.keys()!=pred_dict.keys(): for key in gold_dict.keys(): if key not in pred_dict.keys(): print("key ", key, " missing in pred_dict.keys(") """ results_list = [('NLL', nll_meter.avg), ('F1', results['F1']), ('EM', results['EM'])] if args.use_squad_v2: results_list.append(('AvNA', results['AvNA'])) results = OrderedDict(results_list) # Computing the F1 score for each type of question # # audit_trail_from_question_type[pronoun].append(str(index)) # create a list of the types of questions by extracting the keys from the dict audit_trail_from_question_type types_of_questions = list(audit_trail_from_question_type.keys()) gold_dict_per_type_of_questions = defaultdict(lambda: []) pred_dict_per_type_of_questions = {} gold_dict_per_type_of_questions_start = {} pred_dict_per_type_of_questions_start = {} gold_dict_per_type_of_questions_middle = {} pred_dict_per_type_of_questions_middle = {} gold_dict_per_type_of_questions_end = {} pred_dict_per_type_of_questions_end = {} for type_of_questions in types_of_questions: #gold_pred = {key: value for key, value in gold_dict.items() if key in audit_trail_from_question_type[type_of_questions]} #lst_pred = {key: value for key, value in pred_dict.items() if key in audit_trail_from_question_type[type_of_questions]} # Create two dictionnaries for each type of sentence for gold_dict_per_type_of_questions and pred_dict_per_type_of_questions gold_dict_per_type_of_questions[type_of_questions] = { key: value for key, value in gold_dict.items() if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } pred_dict_per_type_of_questions[type_of_questions] = { key: value for key, value in pred_dict.items() if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } # print(type_of_questions," F1 score: ", util.eval_dicts(gold_dict_per_type_of_questions[type_of_questions], pred_dict_per_type_of_questions[type_of_questions], args.use_squad_v2)['F1']) gold_dict_per_type_of_questions_start[type_of_questions] = { key: value for key, value in gold_dict.items() if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } pred_dict_per_type_of_questions_start[type_of_questions] = { key: value for key, value in pred_dict.items() if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } gold_dict_per_type_of_questions_middle[type_of_questions] = { key: value for key, value in gold_dict.items() if key in 
audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } pred_dict_per_type_of_questions_middle[type_of_questions] = { key: value for key, value in pred_dict.items() if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } gold_dict_per_type_of_questions_end[type_of_questions] = { key: value for key, value in gold_dict.items() if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } pred_dict_per_type_of_questions_end[type_of_questions] = { key: value for key, value in pred_dict.items() if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } for key, value in gold_dict.items(): #if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys(): if key in audit_trail_from_question_type[ type_of_questions] and type_of_questions != "" and key in pred_dict_per_type_of_questions[ type_of_questions]: """ print("type_of_questions: ",type_of_questions) print("key: ", key) print("question: ", value["question"]) sub_index = value["question"].lower().find(type_of_questions) print("sub_index: ",sub_index) test_fc = value["question"].lower().find(type_of_questions) print("present type of the var: ",type(test_fc)) #print("question: ", value["question"][str(key)]) print("length of the question: ", len(value["question"])) print('Position of the interrogative pronoun in the question:', ) """ # Create two dictionnaries for each type of sentence based at the start of the sentence if value["question"].lower().find( type_of_questions) == 1 or value["question"].lower( ).find(type_of_questions) == 0: #print("BEGINNING") if type_of_questions != "": try: del gold_dict_per_type_of_questions_middle[ type_of_questions][key] except KeyError: pass try: del pred_dict_per_type_of_questions_middle[ type_of_questions][key] except KeyError: pass try: del gold_dict_per_type_of_questions_end[ type_of_questions][key] except KeyError: pass try: del pred_dict_per_type_of_questions_end[ type_of_questions][key] except KeyError: pass #pred_dict_per_type_of_questions_start[type_of_questions] = {key: pred_dict[key] for key in # gold_dict_per_type_of_questions_start[ # type_of_questions].keys()} elif value["question"].lower( ).find(type_of_questions) >= len( value["question"]) - len(type_of_questions) - 5: #print("END") if type_of_questions != "": try: del gold_dict_per_type_of_questions_middle[ type_of_questions][key] except KeyError: pass try: del pred_dict_per_type_of_questions_middle[ type_of_questions][key] except KeyError: pass try: del gold_dict_per_type_of_questions_start[ type_of_questions][key] except KeyError: pass try: del pred_dict_per_type_of_questions_start[ type_of_questions][key] except KeyError: pass #print("type_of_questions: ",type_of_questions) #sub_index = value["question"].lower().find(type_of_questions) #print("sub_index: ", sub_index) #print("len(value['question']) - len(type_of_questions) - 2: ", len(value["question"])-len(type_of_questions)-2) #start_string = len(value["question"])-len(type_of_questions)-6 #end_string = len(value["question"])-1 #print("extract at the end: ", value["question"][start_string:end_string]) else: #print("MIDDLE") if type_of_questions != "": try: del gold_dict_per_type_of_questions_start[ type_of_questions][key] except KeyError: pass try: del pred_dict_per_type_of_questions_start[ type_of_questions][key] except KeyError: pass try: del gold_dict_per_type_of_questions_end[ type_of_questions][key] except KeyError: pass try: del 
pred_dict_per_type_of_questions_end[ type_of_questions][key] except KeyError: pass pass """ if type_of_questions != "": gold_dict_per_type_of_questions_start[type_of_questions] = {key: value for key, value in gold_dict.items() if (key in audit_trail_from_question_type[type_of_questions] \ and (value["question"].lower().find(type_of_questions) <= 1) \ and key in pred_dict_per_type_of_questions[type_of_questions]) } """ """ for key in gold_dict_per_type_of_questions_start[type_of_questions].keys(): print("key:: ", key ) print("type(key):: ", type(key) ) print("pred_dict[,key,] : ", pred_dict[key]) print("@@@@@@@@@@@@@@@@@@@@@@@@") pred_dict_per_type_of_questions_start[type_of_questions] = {key: pred_dict[key] for key in gold_dict_per_type_of_questions_start[type_of_questions].keys()} #pred_dict_per_type_of_questions_start[type_of_questions] = {key: value for key, value in pred_dict.items() if key in list(gold_dict_per_type_of_questions_start[type_of_questions].keys()) } # Create two dictionnaries for each type of sentence based at the end of the sentence gold_dict_per_type_of_questions_end[type_of_questions] = {key: value for key, value in gold_dict.items() if key in audit_trail_from_question_type[type_of_questions] \ and value["question"].lower().find(type_of_questions) >= len(value["question"])-len(type_of_questions)-2 \ and key in pred_dict_per_type_of_questions[type_of_questions]} pred_dict_per_type_of_questions_end[type_of_questions] = {key: pred_dict[key] for key in list(gold_dict_per_type_of_questions_end[type_of_questions].keys())} #print("*"*80) # Create two dictionnaries for each type of sentence based at the middle of the sentencecount_questions_type gold_dict_per_type_of_questions_middle[type_of_questions] = {key: value for key, value in gold_dict.items() if key not in list(gold_dict_per_type_of_questions_start[type_of_questions].keys()) \ and key not in list(gold_dict_per_type_of_questions_end[type_of_questions].keys())} pred_dict_per_type_of_questions_middle[type_of_questions] = {key: pred_dict[key] for key in list(gold_dict_per_type_of_questions_end[type_of_questions].keys())} else: gold_dict_per_type_of_questions_start[""] = gold_dict_per_type_of_questions[""] pred_dict_per_type_of_questions_start[""] = pred_dict_per_type_of_questions[""] gold_dict_per_type_of_questions_end[""] = gold_dict_per_type_of_questions[""] pred_dict_per_type_of_questions_end[""] = pred_dict_per_type_of_questions[""] gold_dict_per_type_of_questions_middle[""] = gold_dict_per_type_of_questions[""] pred_dict_per_type_of_questions_middle[""] = pred_dict_per_type_of_questions[""] """ positions_in_question = ["beginning", "middle", "end"] # print(type_of_questions," F1 score: ", util.eval_dicts(gold_dict_per_type_of_questions[type_of_questions], pred_dict_per_type_of_questions[type_of_questions], args.use_squad_v2)['F1']) list_beginning = [ util.eval_dicts( gold_dict_per_type_of_questions_start[type_of_questions], pred_dict_per_type_of_questions_start[type_of_questions], args.use_squad_v2)['F1'] for type_of_questions in types_of_questions ] list_middle = [ util.eval_dicts( gold_dict_per_type_of_questions_middle[type_of_questions], pred_dict_per_type_of_questions_middle[type_of_questions], args.use_squad_v2)['F1'] for type_of_questions in types_of_questions ] list_end = [ util.eval_dicts( gold_dict_per_type_of_questions_end[type_of_questions], pred_dict_per_type_of_questions_end[type_of_questions], args.use_squad_v2)['F1'] for type_of_questions in types_of_questions ] #for type_of_questions in 
types_of_questions: # print("gold_dict_per_type_of_questions_start[type_of_questions]: ",gold_dict_per_type_of_questions_start[type_of_questions]) # print("pred_dict_per_type_of_questions[type_of_questions]: ",pred_dict_per_type_of_questions[type_of_questions]) F1 = np.array([list_beginning, list_middle, list_end]) m, n = F1.shape value_to_ignore = [] for i in range(m): for j in range(n): if F1[i, j] == "NA" or F1[i, j] == 0: value_to_ignore.append((i, j)) print("value to ignore: ", value_to_ignore) #F1 = np.array([[0, 0, 0, 0, 0.1, 0, 0, 0, 0, 0, 0, 0, 0], # [0, 0, 0, 0, 0.1, 0, 0, 0, 0, 0, 0, 0, 0], # [0, 0, 0, 0, 0.1, 0, 0, 0, 0, 0, 0, 0, 0]]) data_label = copy.deepcopy(F1) for row in data_label: for column_idx in range(len(row)): if row[column_idx] == "NA": row[column_idx] = "" # print question without interrogative pronoun required for the second part of the analysis: for key, value in gold_dict.items(): if key in audit_trail_from_question_type[ ""] and key in pred_dict.keys(): print("question: ", gold_dict_per_type_of_questions['']) print("golden answers: ", ) print("prediction: ", pred_dict[key]) print() fig, ax = plt.subplots() types_of_questions[types_of_questions.index( "")] = "Implicit question without interrogative pronoun" im, cbar = heatmap(F1, positions_in_question, types_of_questions, ax=ax, \ cmap="YlGn", cbarlabel="F1 scores") texts = annotate_heatmap(im, data=data_label, valfmt="{x:.1f}", ignore=value_to_ignore) fig.tight_layout() plt.show() # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'{args.split.title()} {results_str}') # Log to TensorBoard tbx = SummaryWriter(args.save_dir) util.visualize(tbx, pred_dict=pred_dict, eval_path=eval_file, step=0, split=args.split, num_visuals=args.num_visuals) # Write submission file sub_path = join(args.save_dir, args.split + '_' + args.sub_file) log.info(f'Writing submission file to {sub_path}...') with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh: csv_writer = csv.writer(csv_fh, delimiter=',') csv_writer.writerow(['Id', 'Predicted']) for uuid in sorted(sub_dict): csv_writer.writerow([uuid, sub_dict[uuid]])
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2,
             model_name, gpu_ids):
    nll_meter = util.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, cpos_idxs, cner_idxs, cw_ems, cw_tfs, \
                qw_idxs, qc_idxs, qpos_idxs, qner_idxs, qw_ems, qw_tfs, \
                y1, y2, ids in data_loader:  # NEW
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            if 'baseline' in model_name:
                log_p1, log_p2 = model(cw_idxs, qw_idxs)
            elif model_name == 'BiDAF_char':
                # Additional setup for forward
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
            elif model_name in ('BiDAF_tag', 'BiDAF_tag_unfrozen',
                                'BiDAF_tag_loss', 'BiDAF_tag_unfrozen_loss'):
                # Additional setup for forward
                cc_idxs = cc_idxs.to(device)
                cpos_idxs = cpos_idxs.to(device)
                cner_idxs = cner_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                qpos_idxs = qpos_idxs.to(device)
                qner_idxs = qner_idxs.to(device)
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs,
                                       cpos_idxs, qpos_idxs, cner_idxs, qner_idxs)
            elif model_name in ('BiDAF_tag_ext', 'BiDAF_tag_ext_unfrozen'):
                # Additional setup for forward
                cc_idxs = cc_idxs.to(device)
                cpos_idxs = cpos_idxs.to(device)
                cner_idxs = cner_idxs.to(device)
                cw_ems = cw_ems.to(device)
                cw_tfs = cw_tfs.to(device)
                qc_idxs = qc_idxs.to(device)
                qpos_idxs = qpos_idxs.to(device)
                qner_idxs = qner_idxs.to(device)
                qw_ems = qw_ems.to(device)
                qw_tfs = qw_tfs.to(device)
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs,
                                       cpos_idxs, qpos_idxs, cner_idxs, qner_idxs,
                                       cw_ems, qw_ems, cw_tfs, qw_tfs)
            elif model_name == 'coattn':
                max_c_len = cw_idxs.size(1)
                max_q_len = qw_idxs.size(1)

                c_len = []
                q_len = []
                for i in range(cw_idxs.size(0)):
                    if len((cw_idxs[i] == 0).nonzero()) != 0:
                        c_len_i = (cw_idxs[i] == 0).nonzero()[0].item()
                    else:
                        c_len_i = cw_idxs.size(1)
                    if len((qw_idxs[i] == 0).nonzero()) != 0:
                        q_len_i = (qw_idxs[i] == 0).nonzero()[0].item()
                    else:
                        q_len_i = qw_idxs.size(1)
                    c_len.append(int(c_len_i))
                    q_len.append(int(q_len_i))
                c_len = torch.Tensor(c_len).int()
                q_len = torch.Tensor(q_len).int()

                num_examples = int(cw_idxs.size(0) / len(gpu_ids))
                log_p1, log_p2 = model(max_c_len, max_q_len, cw_idxs, qw_idxs,
                                       c_len, q_len, num_examples, True, False)
            else:
                # default: run baseline
                log_p1, log_p2 = model(cw_idxs, qw_idxs)

            y1, y2 = y1.to(device), y2.to(device)
            # if model_name == 'coattn':
            #     loss = nn.CrossEntropyLoss()(log_p1, y1) + nn.CrossEntropyLoss()(log_p2, y2)
            # else:
            #     loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            # if model_name != 'coattn':
            #     p1, p2 = log_p1.exp(), log_p2.exp()
            # else:
            #     p1, p2 = log_p1, log_p2
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict,
                                           ids.tolist(),
                                           starts.tolist(),
                                           ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg),
                    ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict
def load(path):
    from ujson import load as json_load
    with open(path, 'r') as pf:
        return GridSearch(json_load(pf))
def train_QaNet(args):
    device, args.gpu_ids = util.get_available_devices()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    word_mat = util.torch_from_json(args.word_emb_file)
    char_mat = util.torch_from_json(args.char_emb_file)
    with open(args.dev_eval_file, 'r') as fh:
        dev_eval_file = json_load(fh)

    print("Building model...")
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_dataset = data.DataLoader(train_dataset,
                                    batch_size=args.batch_size,
                                    shuffle=True,
                                    num_workers=args.num_workers,
                                    collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_dataset = data.DataLoader(dev_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    lr = args.lr
    base_lr = 1
    lr_warm_up_num = args.lr_warm_up_num

    model = QaNet(word_mat, char_mat, args.connector_dim, args.glove_dim,
                  args.char_dim, args.drop_prob, args.dropout_char,
                  args.num_heads, args.c_len, args.q_len).to(device)
    ema = util.EMA(model, args.ema_decay)

    parameters = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = optim.Adam(lr=base_lr, betas=(0.9, 0.999), eps=1e-7,
                           weight_decay=5e-8, params=parameters)
    # Logarithmic learning-rate warm-up for the first lr_warm_up_num steps.
    cr = lr / math.log2(lr_warm_up_num)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log2(ee + 1) if ee < lr_warm_up_num else lr)

    best_f1 = 0
    best_em = 0
    patience = 0
    unused = False
    for epoch in range(args.num_epochs):
        train(model, optimizer, scheduler, train_dataset, dev_dataset,
              dev_eval_file, epoch, ema, device)
        ema.assign(model)
        metrics = test(model, dev_dataset, dev_eval_file,
                       (epoch + 1) * len(train_dataset))
        dev_f1 = metrics["f1"]
        dev_em = metrics["exact_match"]
        if dev_f1 < best_f1 and dev_em < best_em:
            patience += 1
            if patience > args.early_stop:
                break
        else:
            patience = 0
            best_f1 = max(best_f1, dev_f1)
            best_em = max(best_em, dev_em)
        fn = os.path.join(args.save_dir, "model.pt")
        torch.save(model, fn)
        ema.resume(model)
def main(args): args.save_dir = util.get_save_dir(args.save_dir, "exp1_training", training=False) log = get_logger(args.logging_dir, "exp1_training") log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') device, gpu_ids = util.get_available_devices() args.batch_size *= max(1, len(gpu_ids)) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) # Get model log.info('Building model...') model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size) model = nn.DataParallel(model, gpu_ids) log.info(f'Loading checkpoint from {args.load_path}...') model = util.load_model(model, c.load_path, gpu_ids, return_step=False) model = model.to(device) model.eval() # Get data loader log.info('Building dataset...') dataset = SQuAD(args.test_record_file, True) data_loader = data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Evaluate log.info(f'Evaluating on {args.datasplit} split...') nll_meter = util.AverageMeter() pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission with open(args.test_eval_file, 'r') as fh: gold_dict = json_load(fh) with torch.no_grad(), \ tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) # Forward log_p1, log_p2 = model(cw_idxs, qw_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) nll_meter.update(loss.item(), batch_size) # Get F1 and EM scores p1, p2 = log_p1.exp(), log_p2.exp() starts, ends = util.discretize(p1, p2, c.max_ans_len, True) # Log info progress_bar.update(batch_size) # Not using the unlabeled test set # if args.split != 'test': # # No labels for the test set, so NLL would be invalid # progress_bar.set_postfix(NLL=nll_meter.avg) idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), True) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) # Log results (except for test set, since it does not come with labels) results = util.eval_dicts(gold_dict, pred_dict, True) results_list = [('NLL', nll_meter.avg), ('F1', results['F1']), ('EM', results['EM'])] results_list.append(('AvNA', results['AvNA'])) results = OrderedDict(results_list) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'{args.datasplit} {results_str}') # Log to TensorBoard tbx = SummaryWriter(c.save_dir) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.test_eval_file, step=0, split=args.datasplit, num_visuals=args.num_visuals)
def main(args): # Set up logging args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False) log = util.get_logger(args.save_dir, args.name) log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') device, gpu_ids = util.get_available_devices() args.batch_size *= max(1, len(gpu_ids)) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) ch_vectors = util.torch_from_json(args.char_emb_file) # Get model log.info('Building model...') model = BiDAF(word_vectors=word_vectors, ch_vectors=ch_vectors, hidden_size=args.hidden_size) model = nn.DataParallel(model, gpu_ids) log.info(f'Loading checkpoint from {args.load_path}...') model = util.load_model(model, args.load_path, gpu_ids, return_step=False) model = model.to(device) model.eval() # Get data loader log.info('Building dataset...') record_file = vars(args)[f'{args.split}_record_file'] dataset = SQuAD(record_file, args.use_squad_v2) data_loader = data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Evaluate log.info(f'Evaluating on {args.split} split...') nll_meter = util.AverageMeter() pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission eval_file = vars(args)[f'{args.split}_eval_file'] with open(eval_file, 'r') as fh: gold_dict = json_load(fh) with torch.no_grad(), \ tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) batch_size = cw_idxs.size(0) # Forward log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) nll_meter.update(loss.item(), batch_size) # Get F1 and EM scores p1, p2 = log_p1.exp(), log_p2.exp() starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2) # Log info progress_bar.update(batch_size) if args.split != 'test': # No labels for the test set, so NLL would be invalid progress_bar.set_postfix(NLL=nll_meter.avg) idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), args.use_squad_v2) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) # Log results (except for test set, since it does not come with labels) if args.split != 'test': results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2) results_list = [('NLL', nll_meter.avg), ('F1', results['F1']), ('EM', results['EM'])] if args.use_squad_v2: results_list.append(('AvNA', results['AvNA'])) results = OrderedDict(results_list) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'{args.split.title()} {results_str}') # Log to TensorBoard tbx = SummaryWriter(args.save_dir) util.visualize(tbx, pred_dict=pred_dict, eval_path=eval_file, step=0, split=args.split, num_visuals=args.num_visuals) # Write submission file sub_path = join(args.save_dir, args.split + '_' + args.sub_file) log.info(f'Writing submission file to {sub_path}...') with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh: csv_writer = csv.writer(csv_fh, delimiter=',') csv_writer.writerow(['Id', 'Predicted']) for uuid in sorted(sub_dict): csv_writer.writerow([uuid, sub_dict[uuid]])
def main(args): # Set up logging args.save_dir = utilz.get_save_dir(args.save_dir, args.name, training=False) log = utilz.get_logger(args.save_dir, args.name) log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') device, gpu_ids = utilz.get_available_devices() args.batch_size *= max(1, len(gpu_ids)) # Get embeddings log.info('Loading embeddings...') word_vectors = utilz.torch_from_json(args.word_emb_file) # Get model log.info('Building model...') #model = BiDAF(word_vectors=word_vectors, # hidden_size=args.hidden_size) ## QANet # load word vectors wv_tensor = torch.FloatTensor( np.array(pickle_load_large_file('./data/processed/SQuAD/word_emb.pkl'), dtype=np.float32)) cv_tensor = torch.FloatTensor( np.array(pickle_load_large_file('./data/processed/SQuAD/char_emb.pkl'), dtype=np.float32)) wv_word2ix = pickle_load_large_file('./data/processed/SQuAD/word_dict.pkl') # construct model model = QANet(wv_tensor, cv_tensor, 400, 50, 128, num_head=8, train_cemb=False, pad=wv_word2ix["<PAD>"]) ## QANet End # model = nn.DataParallel(model, gpu_ids) log.info(f'Loading checkpoint from {args.load_path}...') model = utilz.load_model(model, args.load_path, gpu_ids, return_step=False) #model = model.to(device) #ema = EMA(0.9999) #ema.assign(model) model = model.to(device) model.eval() # Get data loader log.info('Building dataset...') record_file = vars(args)[f'{args.split}_record_file'] dataset = SQuAD(record_file, args.use_squad_v2) data_loader = data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Evaluate log.info(f'Evaluating on {args.split} split...') nll_meter = utilz.AverageMeter() pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission eval_file = vars(args)[f'{args.split}_eval_file'] with open(eval_file, 'r') as fh: gold_dict = json_load(fh) with torch.no_grad(), \ tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) batch_size = cw_idxs.size(0) # Forward p1, p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs) p1 = F.softmax(p1, dim=1) p2 = F.softmax(p2, dim=1) y1, y2 = y1.to(device), y2.to(device) loss1 = torch.nn.CrossEntropyLoss()(p1, y1) loss2 = torch.nn.CrossEntropyLoss()(p2, y2) loss = torch.mean(loss1 + loss2) nll_meter.update(loss.item(), batch_size) #starts, ends = utilz.discretize(p1, p2, args.max_ans_len, args.use_squad_v2) #outer = torch.matmul(p1.unsqueeze(2), p2.unsqueeze(1)) #for j in range(outer.size()[0]): # outer[j] = torch.triu(outer[j]) # outer[j] = torch.tril(outer[j], self.args.ans_limit) #a1, _ = torch.max(outer, dim=2) #a2, _ = torch.max(outer, dim=1) #ymin = torch.argmax(a1, dim=1) #iymax = torch.argmax(a2, dim=1) #idx2pred, uuid2pred = utilz.convert_tokens(gold_dict, ids.tolist(), ymin.tolist(), ymax.tolist(),args.use_squad_v2) #idx2pred = {} #uuid2pred = {} #for qid, p1, p2 in zip(ids.tolist(), starts.tolist(), ends.tolist()): # context = gold_dict[str(qid)]["context"] # spans = gold_dict[str(qid)]["spans"] # uuid = gold_dict[str(qid)]["uuid"] # if args.use_squad_v2 and (p1 == 0 or p2 == 0): # idx2pred[str(qid)] = '' # uuid2pred[uuid] = '' # else: # p1, p2 = p1-1, p2-1 # start_idx = spans[p1][0] # end_idx = spans[p2][1] # idx2pred[str(qid)] = context[start_idx: end_idx] # uuid2pred[uuid] = context[start_idx: end_idx] # Get F1 and EM scores starts, ends = utilz.discretize(p1, 
p2, args.max_ans_len, args.use_squad_v2) # Log info progress_bar.update(batch_size) if args.split != 'test': # No labels for the test set, so NLL would be invalid progress_bar.set_postfix(NLL=nll_meter.avg) idx2pred, uuid2pred = utilz.convert_tokens(gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), args.use_squad_v2) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) # Log results (except for test set, since it does not come with labels) if args.split != 'test': results = utilz.eval_dicts(gold_dict, pred_dict, args.use_squad_v2) results_list = [('NLL', nll_meter.avg), ('F1', results['F1']), ('EM', results['EM'])] if args.use_squad_v2: results_list.append(('AvNA', results['AvNA'])) results = OrderedDict(results_list) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'{args.split.title()} {results_str}') # Log to TensorBoard tbx = SummaryWriter(args.save_dir) utilz.visualize(tbx, pred_dict=pred_dict, eval_path=eval_file, step=0, split=args.split, num_visuals=args.num_visuals) # Write submission file sub_path = join(args.save_dir, args.split + '_' + args.sub_file) log.info(f'Writing submission file to {sub_path}...') with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh: csv_writer = csv.writer(csv_fh, delimiter=',') csv_writer.writerow(['Id', 'Predicted']) for uuid in sorted(sub_dict): csv_writer.writerow([uuid, sub_dict[uuid]])
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2,
             model_name="", a1=0.5, a2=0.5):
    meter = util.AverageMeter()

    # setup losses
    bceLoss = nn.BCELoss()
    ceLoss = nn.CrossEntropyLoss()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            cc_idxs = cc_idxs.to(device)
            qc_idxs = qc_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            y1, y2 = y1.to(device), y2.to(device)
            if model_name == 'sketchy':
                yi = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                # Answerability target: 1 if the example has an answer, else 0;
                # .float() keeps the target on the same device as yi.
                loss = bceLoss(yi, torch.where(y1 == 0, 0, 1).float())
                meter.update(loss.item(), batch_size)
                starts, ends = [[0 if yi[i] == 0 else 1 for i, y in enumerate(y1)],
                                [0 if yi[i] == 0 else 2 for i, y in enumerate(y2)]]
            elif model_name == 'intensive':
                yi, log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                loss = a1 * bceLoss(yi, torch.where(y1 == 0, 0, 1).float()) \
                    + a2 * (F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2))
                # loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                meter.update(loss.item(), batch_size)

                # Get F1 and EM scores
                p1 = log_p1.exp()
                p2 = log_p2.exp()
                # print(p1[0,:])
                # print(p1)
                # print(p2[0,:])
                # print(p2)
                starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)
                starts, ends = starts.tolist(), ends.tolist()
            elif model_name == 'retro':
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                meter.update(loss.item(), batch_size)
                p1, p2 = log_p1.exp(), log_p2.exp()
                starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)
                starts, ends = starts.tolist(), ends.tolist()
            else:
                raise ValueError('invalid --model_name, sketchy or intensive required')

            print("starts: ", starts, "Truth", y1)
            print("ends: ", ends, "Truth: ", y2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(loss_calc=meter.avg)

            preds, _ = util.convert_tokens(gold_dict,
                                           ids.tolist(),
                                           starts,
                                           ends,
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('Loss', meter.avg),
                    ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict
def main(args):
    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    # log = util.get_logger(args.save_dir, args.name)
    # log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    print('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    print('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=args.hidden_size)
    model = nn.DataParallel(model, gpu_ids)
    print(f'Loading checkpoint from {args.load_path}...')
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    print('Building dataset...')
    # record_file = vars(args)[f'{args.split}_record_file']
    dataset = SQuAD("./data/my_test.npz", args.use_squad_v2)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Evaluate
    print(f'Evaluating on {args.split} split...')
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    # eval_file = vars(args)[f'{args.split}_eval_file']
    with open("./data/my_test_eval.json", 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            print("viewing the dataset")
            print(cw_idxs, cc_idxs, qw_idxs, qc_idxs)

            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            # if args.split != 'test':
            #     # No labels for the test set, so NLL would be invalid
            #     progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                      ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    print("my evaluation ....")
    for el in pred_dict:
        print(el, pred_dict[el])
    for el in sub_dict:
        print(el, sub_dict[el])
def get_bpe(args):
    bpe = BPE()
    with open(args.bpe_file, "r") as file:
        bpe.load_state_dict(json_load(file))
    add_special_tokens(args)
    return bpe
from args import get_train_args
from collections import OrderedDict
from json import dumps
from models import BiDAF
from tensorboardX import SummaryWriter
from tqdm import tqdm
from ujson import load as json_load
from util import collate_fn, SQuAD

starter_gold_file = "/Users/zhangyue/Desktop/starter_gold.json"
starter_pred_file = "/Users/zhangyue/Desktop/starter_pred.json"
bert_pred_file = "/Users/zhangyue/Desktop/All/18-19_Winter/CS224N/Project/squad_cs224n/bert_train/bert_eval/predictions.json"

with open(starter_gold_file, 'r') as fh:
    starter_gold_dict = json_load(fh)
with open(starter_pred_file, 'r') as fh:
    starter_pred_dict = json_load(fh)
with open(bert_pred_file, 'r') as fh:
    bert_pred_file = json_load(fh)

bert_gold_dict = {}
gold_dict = {}
pred_dict = {}
sub_dict = {}

# Use uuid as dictionary key to construct the gold_dict for bert
for key, value in starter_gold_dict.items():
    bert_gold_dict[value['uuid']] = value
def main(args): # Load TF-IDF from pickle scorer = TFIDF([]) scorer.get_from_pickle() # Set up logging args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False) log = util.get_logger(args.save_dir, args.name) log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True))) device, gpu_ids = util.get_available_devices() args.batch_size *= max(1, len(gpu_ids)) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) # Get data loader log.info('Building dataset...') record_file = vars(args)['{}_record_file'.format(args.split)] dataset = SQuAD(record_file, args.use_squad_v2) data_loader = data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Get model log.info('Building model...') model = BiDAF(word_vectors=word_vectors, char_vocab_size= 1376, hidden_size=args.hidden_size) model = nn.DataParallel(model, gpu_ids) log.info('Loading checkpoint from {}...'.format(args.load_path)) model = util.load_model(model, args.load_path, gpu_ids, return_step=False) model = model.to(device) model.eval() # Evaluate log.info('Evaluating on {} split...'.format(args.split)) nll_meter = util.AverageMeter() pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission eval_file = vars(args)['{}_eval_file'.format(args.split)] with open(eval_file, 'r') as fh: gold_dict = json_load(fh) with torch.no_grad(), \ tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) # Forward log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs,qc_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) nll_meter.update(loss.item(), batch_size) # Get F1 and EM scores p1, p2 = log_p1.exp(), log_p2.exp() starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2) # Log info progress_bar.update(batch_size) if args.split != 'test': # No labels for the test set, so NLL would be invalid progress_bar.set_postfix(NLL=nll_meter.avg) idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), args.use_squad_v2) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) if (args.use_tfidf): # Apply TF-IDF filtering to pred_dict tf_idf_threshold = 2 tf_idf_common_threshold = 1 for key, value in pred_dict.items(): if value != "": tf_idf_score = scorer.normalized_additive_idf_ignore_common_words( value, threshold_frequency=tf_idf_common_threshold) if tf_idf_score < tf_idf_threshold: pred_dict[key] = '' pass # print ("pred_dict: {}, pruned".format(tf_idf_score)) else: pass # print ("pred_dict: {}, kept".format(tf_idf_score)) # Log results (except for test set, since it does not come with labels) if args.split != 'test': results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2) results_list = [('NLL', nll_meter.avg), ('F1', results['F1']), ('EM', results['EM'])] if args.use_squad_v2: results_list.append(('AvNA', results['AvNA'])) results = OrderedDict(results_list) # Log to console results_str = ', '.join('{}: {:05.2f}'.format(k, v) for k, v in results.items()) log.info('{} {}'.format(args.split.title(), results_str)) # Log to TensorBoard tbx = SummaryWriter(args.save_dir) util.visualize(tbx, pred_dict=pred_dict, eval_path=eval_file, step=0, split=args.split, num_visuals=args.num_visuals) # Write submission file sub_path = 
join(args.save_dir, args.split + '_' + args.sub_file) log.info('Writing submission file to {}...'.format(sub_path)) with open(sub_path, 'w') as csv_fh: csv_writer = csv.writer(csv_fh, delimiter=',') csv_writer.writerow(['Id', 'Predicted']) for uuid in sorted(sub_dict): csv_writer.writerow([uuid, sub_dict[uuid]])
def load_eval_file(args, eval_file):
    from ujson import load as json_load
    with open(preprocessed_path(eval_file, args.data_dir, args.dataset), 'r') as fh:
        return json_load(fh)
def eval_results(all_examples, all_features, all_results, eval_gold_file, n_best_size, max_answer_length, do_lower_case, verbose_logging, version_2_with_negative, null_score_diff_threshold): """Write final predictions to the json file and log-odds of null if needed.""" example_index_to_features = collections.defaultdict(list) for feature in all_features: example_index_to_features[feature.example_index].append(feature) unique_id_to_result = {} for result in all_results: unique_id_to_result[result.unique_id] = result _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name "PrelimPrediction", [ "feature_index", "start_index", "end_index", "start_logit", "end_logit" ]) all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() for (example_index, example) in enumerate(all_examples): features = example_index_to_features[example_index] prelim_predictions = [] # keep track of the minimum score of null start+end of position 0 score_null = 1000000 # large and positive min_null_feature_index = 0 # the paragraph slice with min mull score null_start_logit = 0 # the start logit at the slice with min null score null_end_logit = 0 # the end logit at the slice with min null score for (feature_index, feature) in enumerate(features): result = unique_id_to_result[feature.unique_id] start_indexes = _get_best_indexes(result.start_logits, n_best_size) end_indexes = _get_best_indexes(result.end_logits, n_best_size) # if we could have irrelevant answers, get the min score of irrelevant if version_2_with_negative: feature_null_score = result.start_logits[ 0] + result.end_logits[0] if feature_null_score < score_null: score_null = feature_null_score min_null_feature_index = feature_index null_start_logit = result.start_logits[0] null_end_logit = result.end_logits[0] for start_index in start_indexes: for end_index in end_indexes: # We could hypothetically create invalid predictions, e.g., predict # that the start of the span is in the question. We throw out all # invalid predictions. 
if start_index >= len(feature.tokens): continue if end_index >= len(feature.tokens): continue if start_index not in feature.token_to_orig_map: continue if end_index not in feature.token_to_orig_map: continue if not feature.token_is_max_context.get( start_index, False): continue if end_index < start_index: continue length = end_index - start_index + 1 if length > max_answer_length: continue prelim_predictions.append( _PrelimPrediction( feature_index=feature_index, start_index=start_index, end_index=end_index, start_logit=result.start_logits[start_index], end_logit=result.end_logits[end_index])) if version_2_with_negative: prelim_predictions.append( _PrelimPrediction(feature_index=min_null_feature_index, start_index=0, end_index=0, start_logit=null_start_logit, end_logit=null_end_logit)) prelim_predictions = sorted(prelim_predictions, key=lambda x: (x.start_logit + x.end_logit), reverse=True) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name "NbestPrediction", ["text", "start_logit", "end_logit"]) seen_predictions = {} nbest = [] for pred in prelim_predictions: if len(nbest) >= n_best_size: break feature = features[pred.feature_index] if pred.start_index > 0: # this is a non-null prediction tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] tok_text = " ".join(tok_tokens) # De-tokenize WordPieces that have been split off. tok_text = tok_text.replace(" ##", "") tok_text = tok_text.replace("##", "") # Clean whitespace tok_text = tok_text.strip() tok_text = " ".join(tok_text.split()) orig_text = " ".join(orig_tokens) final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) if final_text in seen_predictions: continue seen_predictions[final_text] = True else: final_text = "" seen_predictions[final_text] = True nbest.append( _NbestPrediction(text=final_text, start_logit=pred.start_logit, end_logit=pred.end_logit)) # if we didn't include the empty option in the n-best, include it if version_2_with_negative: if "" not in seen_predictions: nbest.append( _NbestPrediction(text="", start_logit=null_start_logit, end_logit=null_end_logit)) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. 
if not nbest: nbest.append( _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) assert len(nbest) >= 1 total_scores = [] best_non_null_entry = None for entry in nbest: total_scores.append(entry.start_logit + entry.end_logit) if not best_non_null_entry: if entry.text: best_non_null_entry = entry probs = _compute_softmax(total_scores) nbest_json = [] for (i, entry) in enumerate(nbest): output = collections.OrderedDict() output["text"] = entry.text output["probability"] = probs[i] output["start_logit"] = entry.start_logit output["end_logit"] = entry.end_logit nbest_json.append(output) assert len(nbest_json) >= 1 if not version_2_with_negative: all_predictions[example.qas_id] = nbest_json[0]["text"] else: # predict "" iff the null score - the score of best non-null > threshold score_diff = score_null - best_non_null_entry.start_logit - ( best_non_null_entry.end_logit) scores_diff_json[example.qas_id] = score_diff if score_diff > null_score_diff_threshold: all_predictions[example.qas_id] = "" else: all_predictions[example.qas_id] = best_non_null_entry.text all_nbest_json[example.qas_id] = nbest_json with open(eval_gold_file, "r") as fh: starter_gold_dict = json_load(fh) bert_gold_dict = {} gold_dict = {} pred_dict = {} # Use uuid as dictionary key to construct the gold_dict for bert for key, value in starter_gold_dict.items(): bert_gold_dict[value['uuid']] = value # Filter out bert_pred_file bert_pred_file = all_predictions counter = 1 for key, value in bert_pred_file.items(): if key in bert_gold_dict.keys(): pred_dict[str(counter)] = value gold_dict[str(counter)] = bert_gold_dict[key] counter += 1 results = eval_dicts(gold_dict, pred_dict, no_answer=True) return results
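# The n-best loop above turns summed start/end logits into probabilities via
# `_compute_softmax`, whose implementation is not shown in this excerpt. Below is a
# minimal, numerically stable sketch of such a helper; the name `softmax_scores` and
# the standalone form are illustrative assumptions, not the project's actual code.
import math

def softmax_scores(scores):
    """Return softmax probabilities for a list of raw scores."""
    if not scores:
        return []
    max_score = max(scores)                      # subtract the max for numerical stability
    exps = [math.exp(s - max_score) for s in scores]
    total = sum(exps)
    return [e / total for e in exps]

# Example: three candidate spans with summed start+end logits
print(softmax_scores([7.2, 5.9, 1.3]))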
def predict(args, cw_idxs, qn_idxs):
    # Set up logging and devices
    log = util.get_logger(args.save_dir, args.name)
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size)
    model = nn.DataParallel(model, gpu_ids)
    log.info('Loading checkpoint from {}...'.format(args.load_path))
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Forward
    y_pred = model(cw_idxs, qn_idxs)
    return y_pred


if __name__ == '__main__':
    context = input("Enter the context: ")
    question = input("Enter a question: ")
    data = [context, question]
    with open(".data/word2idx.json", 'r') as fh:
        word2idx_dict = json_load(fh)
    with open(".data/char2idx.json", 'r') as fh:
        char2idx_dict = json_load(fh)
    is_test = True
    context_idxs, context_char_idxs, ques_idxs, ques_char_idxs = convert_to_features(
        get_test_args(), data, word2idx_dict, char2idx_dict, is_test)
    predict(get_test_args(), context_idxs, ques_idxs)
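# `convert_to_features` is not shown in this excerpt. As a rough illustration only,
# word-level indices for the interactive input above could be produced along these
# lines, assuming the padding-id 0 / OOV-id 1 convention implied elsewhere in the repo.
# `words_to_idxs` is a hypothetical helper, not part of the codebase.
import torch

def words_to_idxs(tokens, word2idx, max_len, pad_id=0, oov_id=1):
    idxs = [word2idx.get(tok, oov_id) for tok in tokens][:max_len]
    idxs += [pad_id] * (max_len - len(idxs))
    return torch.tensor(idxs, dtype=torch.long).unsqueeze(0)  # add a batch dimension

# Example with a toy vocabulary
vocab = {'the': 2, 'sky': 3, 'is': 4, 'blue': 5}
print(words_to_idxs('why is the sky blue'.split(), vocab, max_len=8))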
def main(args): # Set up args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False) log = util.get_logger(args.save_dir, args.name) device, gpu_ids = util.get_available_devices() # Get embeddings log.info('Loading embeddings...') with open('data/meta.msgpack', 'rb') as f: meta = msgpack.load(f, encoding='utf8') embedding = torch.Tensor(meta['embedding']) opt = vars(args) opt['pretrained_words'] = True opt['vocab_size'] = embedding.size(0) opt['embedding_dim'] = embedding.size(1) opt['pos_size'] = len(meta['vocab_tag']) opt['ner_size'] = len(meta['vocab_ent']) # Get model log.info('Building model...') model = DRQA(opt, embedding=embedding) model = nn.DataParallel(model, gpu_ids) log.info(f'Loading checkpoint from {args.load_path}...') model = util.load_model(model, args.load_path, gpu_ids, return_step=False) model = model.to(device) model.eval() BatchGen.pos_size = opt['pos_size'] BatchGen.ner_size = opt['ner_size'] # Get data loader log.info('Building dataset...') with open(opt['data_file'], 'rb') as f: data = msgpack.load(f, encoding='utf8') test = data['test'] # Evaluate log.info(f'Evaluating on test split...') batches = BatchGen(test, batch_size=args.batch_size, evaluation=True, is_test=True, gpu=args.cuda) nll_meter = util.AverageMeter() pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission eval_file = './data/test_eval.json' with open(eval_file, 'r') as fh: gold_dict = json_load(fh) with torch.no_grad(), \ tqdm(total=len(test)) as progress_bar: model.eval() for i, batch in enumerate(batches): # Setup for forward inputs = [e.to(device) for e in batch[:7]] ids = batch[-1] # Forward with torch.no_grad(): score_s, score_e = model(*inputs) p1, p2 = score_s, score_e starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2) # Log info progress_bar.update(args.batch_size) idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), args.use_squad_v2) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) # Log results (except for test set, since it does not come with labels) if args.split != 'test': results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2) results_list = [('NLL', nll_meter.avg), ('F1', results['F1']), ('EM', results['EM'])] if args.use_squad_v2: results_list.append(('AvNA', results['AvNA'])) results = OrderedDict(results_list) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'{args.split.title()} {results_str}') # Log to TensorBoard tbx = SummaryWriter(args.save_dir) util.visualize(tbx, pred_dict=pred_dict, eval_path=eval_file, step=0, split=args.split, num_visuals=args.num_visuals) # Write submission file sub_path = join(args.save_dir, args.split + '_' + args.sub_file) log.info(f'Writing submission file to {sub_path}...') with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh: csv_writer = csv.writer(csv_fh, delimiter=',') csv_writer.writerow(['Id', 'Predicted']) for uuid in sorted(sub_dict): csv_writer.writerow([uuid, sub_dict[uuid]])
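# Several evaluation loops in this file call `util.discretize(p1, p2, max_len, use_squad_v2)`
# to turn per-token start/end probabilities into span predictions. The sketch below shows the
# idea: maximize p_start[i] * p_end[j] over legal pairs with i <= j and span length <= max_len.
# It is an illustration, not the course utility itself, and it ignores the no-answer handling
# that `use_squad_v2` adds.
import torch

def discretize_sketch(p_start, p_end, max_len):
    """Return (start_idxs, end_idxs) of the most probable legal span per example."""
    # Joint probability of every (start, end) pair: (batch, L, L)
    joint = p_start.unsqueeze(2) * p_end.unsqueeze(1)
    L = joint.size(1)
    legal = torch.triu(torch.ones(L, L, device=joint.device))   # end >= start
    legal = legal - torch.triu(legal, diagonal=max_len)         # span length <= max_len
    joint = joint * legal
    start_idxs = joint.max(dim=2)[0].argmax(dim=1)  # row containing the global maximum
    end_idxs = joint.max(dim=1)[0].argmax(dim=1)    # column containing the global maximum
    return start_idxs, end_idxs

# Toy usage with random start/end distributions
ps = torch.softmax(torch.randn(2, 6), dim=-1)
pe = torch.softmax(torch.randn(2, 6), dim=-1)
print(discretize_sketch(ps, pe, max_len=3))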
def preprocess_eval(eval_file):
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    return gold_dict
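# Every evaluation routine here tracks loss with `util.AverageMeter` via `update(val, n)`
# and `.avg`. A minimal stand-in consistent with that usage, though not necessarily the
# project's implementation:
class AverageMeterSketch:
    """Running average, matching how `nll_meter.update(val, n)` / `.avg` are used above."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n

    @property
    def avg(self):
        return self.sum / self.count if self.count else 0.0

# Example: two batches of sizes 4 and 1
m = AverageMeterSketch()
m.update(2.0, 4)
m.update(1.0, 1)
print(m.avg)  # 1.8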
def main(args): # Set up logging args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False) log = util.get_logger(args.save_dir, args.name) log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') device, gpu_ids = util.get_available_devices() args.batch_size *= max(1, len(gpu_ids)) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) char_vectors = util.torch_from_json(args.char_emb_file) # Get model log.info('Building model...') nbr_model = 0 if (args.load_path_baseline): model_baseline = Baseline(word_vectors=word_vectors, hidden_size=100) model_baseline = nn.DataParallel(model_baseline, gpu_ids) log.info(f'Loading checkpoint from {args.load_path_baseline}...') model_baseline = util.load_model(model_baseline, args.load_path_baseline, gpu_ids, return_step=False) model_baseline = model_baseline.to(device) model_baseline.eval() nll_meter_baseline = util.AverageMeter() nbr_model += 1 save_prob_baseline_start = [] save_prob_baseline_end = [] if (args.load_path_bidaf): model_bidaf = BiDAF(word_vectors=word_vectors, char_vectors=char_vectors, char_emb_dim=args.char_emb_dim, hidden_size=args.hidden_size) model_bidaf = nn.DataParallel(model_bidaf, gpu_ids) log.info(f'Loading checkpoint from {args.load_path_bidaf}...') model_bidaf = util.load_model(model_bidaf, args.load_path_bidaf, gpu_ids, return_step=False) model_bidaf = model_bidaf.to(device) model_bidaf.eval() nll_meter_bidaf = util.AverageMeter() nbr_model += 1 save_prob_bidaf_start = [] save_prob_bidaf_end = [] if (args.load_path_bidaf_fusion): model_bidaf_fu = BiDAF_fus(word_vectors=word_vectors, char_vectors=char_vectors, char_emb_dim=args.char_emb_dim, hidden_size=args.hidden_size) model_bidaf_fu = nn.DataParallel(model_bidaf_fu, gpu_ids) log.info(f'Loading checkpoint from {args.load_path_bidaf_fusion}...') model_bidaf_fu = util.load_model(model_bidaf_fu, args.load_path_bidaf_fusion, gpu_ids, return_step=False) model_bidaf_fu = model_bidaf_fu.to(device) model_bidaf_fu.eval() nll_meter_bidaf_fu = util.AverageMeter() nbr_model += 1 save_prob_bidaf_fu_start = [] save_prob_bidaf_fu_end = [] if (args.load_path_qanet): model_qanet = QANet(word_vectors=word_vectors, char_vectors=char_vectors, char_emb_dim=args.char_emb_dim, hidden_size=args.hidden_size, n_heads=args.n_heads, n_conv_emb_enc=args.n_conv_emb, n_conv_mod_enc=args.n_conv_mod, n_emb_enc_blocks=args.n_emb_blocks, n_mod_enc_blocks=args.n_mod_blocks, divisor_dim_kqv=args.divisor_dim_kqv) model_qanet = nn.DataParallel(model_qanet, gpu_ids) log.info(f'Loading checkpoint from {args.load_path_qanet}...') model_qanet = util.load_model(model_qanet, args.load_path_qanet, gpu_ids, return_step=False) model_qanet = model_qanet.to(device) model_qanet.eval() nll_meter_qanet = util.AverageMeter() nbr_model += 1 save_prob_qanet_start = [] save_prob_qanet_end = [] if (args.load_path_qanet_old): model_qanet_old = QANet_old(word_vectors=word_vectors, char_vectors=char_vectors, device=device, char_emb_dim=args.char_emb_dim, hidden_size=args.hidden_size, n_heads=args.n_heads, n_conv_emb_enc=args.n_conv_emb, n_conv_mod_enc=args.n_conv_mod, n_emb_enc_blocks=args.n_emb_blocks, n_mod_enc_blocks=args.n_mod_blocks) model_qanet_old = nn.DataParallel(model_qanet_old, gpu_ids) log.info(f'Loading checkpoint from {args.load_path_qanet_old}...') model_qanet_old = util.load_model(model_qanet_old, args.load_path_qanet_old, gpu_ids, return_step=False) model_qanet_old = model_qanet_old.to(device) model_qanet_old.eval() nll_meter_qanet_old = 
util.AverageMeter() nbr_model += 1 save_prob_qanet_old_start = [] save_prob_qanet_old_end = [] if (args.load_path_qanet_inde): model_qanet_inde = QANet_independant_encoder( word_vectors=word_vectors, char_vectors=char_vectors, char_emb_dim=args.char_emb_dim, hidden_size=args.hidden_size, n_heads=args.n_heads, n_conv_emb_enc=args.n_conv_emb, n_conv_mod_enc=args.n_conv_mod, n_emb_enc_blocks=args.n_emb_blocks, n_mod_enc_blocks=args.n_mod_blocks, divisor_dim_kqv=args.divisor_dim_kqv) model_qanet_inde = nn.DataParallel(model_qanet_inde, gpu_ids) log.info(f'Loading checkpoint from {args.load_path_qanet_inde}...') model_qanet_inde = util.load_model(model_qanet_inde, args.load_path_qanet_inde, gpu_ids, return_step=False) model_qanet_inde = model_qanet_inde.to(device) model_qanet_inde.eval() nll_meter_qanet_inde = util.AverageMeter() nbr_model += 1 save_prob_qanet_inde_start = [] save_prob_qanet_inde_end = [] if (args.load_path_qanet_s_e): model_qanet_s_e = QANet_S_E(word_vectors=word_vectors, char_vectors=char_vectors, char_emb_dim=args.char_emb_dim, hidden_size=args.hidden_size, n_heads=args.n_heads, n_conv_emb_enc=args.n_conv_emb, n_conv_mod_enc=args.n_conv_mod, n_emb_enc_blocks=args.n_emb_blocks, n_mod_enc_blocks=args.n_mod_blocks, divisor_dim_kqv=args.divisor_dim_kqv) model_qanet_s_e = nn.DataParallel(model_qanet_s_e, gpu_ids) log.info(f'Loading checkpoint from {args.load_path_qanet_s_e}...') model_qanet_s_e = util.load_model(model_qanet_s_e, args.load_path_qanet_s_e, gpu_ids, return_step=False) model_qanet_s_e = model_qanet_s_e.to(device) model_qanet_s_e.eval() nll_meter_qanet_s_e = util.AverageMeter() nbr_model += 1 save_prob_qanet_s_e_start = [] save_prob_qanet_s_e_end = [] # Get data loader log.info('Building dataset...') record_file = vars(args)[f'{args.split}_record_file'] dataset = SQuAD(record_file, args.use_squad_v2) data_loader = data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Evaluate log.info(f'Evaluating on {args.split} split...') pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission eval_file = vars(args)[f'{args.split}_eval_file'] with open(eval_file, 'r') as fh: gold_dict = json_load(fh) with torch.no_grad(), \ tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) cc_idxs = cc_idxs.to(device) qc_idxs = qc_idxs.to(device) batch_size = cw_idxs.size(0) y1, y2 = y1.to(device), y2.to(device) l_p1, l_p2 = [], [] # Forward if (args.load_path_baseline): log_p1_baseline, log_p2_baseline = model_baseline( cw_idxs, cc_idxs) loss_baseline = F.nll_loss(log_p1_baseline, y1) + F.nll_loss( log_p2_baseline, y2) nll_meter_baseline.update(loss_baseline.item(), batch_size) l_p1 += [log_p1_baseline.exp()] l_p2 += [log_p2_baseline.exp()] if (args.save_probabilities): save_prob_baseline_start += [ log_p1_baseline.exp().detach().cpu().numpy() ] save_prob_baseline_end += [ log_p2_baseline.exp().detach().cpu().numpy() ] if (args.load_path_qanet): log_p1_qanet, log_p2_qanet = model_qanet( cw_idxs, cc_idxs, qw_idxs, qc_idxs) loss_qanet = F.nll_loss(log_p1_qanet, y1) + F.nll_loss( log_p2_qanet, y2) nll_meter_qanet.update(loss_qanet.item(), batch_size) # Get F1 and EM scores l_p1 += [log_p1_qanet.exp()] l_p2 += [log_p2_qanet.exp()] if (args.save_probabilities): save_prob_qanet_start += [ log_p1_qanet.exp().detach().cpu().numpy() ] save_prob_qanet_end += [ 
log_p2_qanet.exp().detach().cpu().numpy() ] if (args.load_path_qanet_old): log_p1_qanet_old, log_p2_qanet_old = model_qanet_old( cw_idxs, cc_idxs, qw_idxs, qc_idxs) loss_qanet_old = F.nll_loss(log_p1_qanet_old, y1) + F.nll_loss( log_p2_qanet_old, y2) nll_meter_qanet_old.update(loss_qanet_old.item(), batch_size) # Get F1 and EM scores l_p1 += [log_p1_qanet_old.exp()] l_p2 += [log_p2_qanet_old.exp()] if (args.save_probabilities): save_prob_qanet_old_start += [ log_p1_qanet_old.exp().detach().cpu().numpy() ] save_prob_qanet_old_end += [ log_p2_qanet_old.exp().detach().cpu().numpy() ] if (args.load_path_qanet_inde): log_p1_qanet_inde, log_p2_qanet_inde = model_qanet_inde( cw_idxs, cc_idxs, qw_idxs, qc_idxs) loss_qanet_inde = F.nll_loss( log_p1_qanet_inde, y1) + F.nll_loss(log_p2_qanet_inde, y2) nll_meter_qanet_inde.update(loss_qanet_inde.item(), batch_size) # Get F1 and EM scores l_p1 += [log_p1_qanet_inde.exp()] l_p2 += [log_p2_qanet_inde.exp()] if (args.save_probabilities): save_prob_qanet_inde_start += [ log_p1_qanet_inde.exp().detach().cpu().numpy() ] save_prob_qanet_inde_end += [ log_p2_qanet_inde.exp().detach().cpu().numpy() ] if (args.load_path_qanet_s_e): log_p1_qanet_s_e, log_p2_qanet_s_e = model_qanet_s_e( cw_idxs, cc_idxs, qw_idxs, qc_idxs) loss_qanet_s_e = F.nll_loss(log_p1_qanet_s_e, y1) + F.nll_loss( log_p2_qanet_s_e, y2) nll_meter_qanet_s_e.update(loss_qanet_s_e.item(), batch_size) # Get F1 and EM scores l_p1 += [log_p1_qanet_s_e.exp()] l_p2 += [log_p2_qanet_s_e.exp()] if (args.save_probabilities): save_prob_qanet_s_e_start += [ log_p1_qanet_s_e.exp().detach().cpu().numpy() ] save_prob_qanet_s_e_end += [ log_p2_qanet_s_e.exp().detach().cpu().numpy() ] if (args.load_path_bidaf): log_p1_bidaf, log_p2_bidaf = model_bidaf( cw_idxs, cc_idxs, qw_idxs, qc_idxs) loss_bidaf = F.nll_loss(log_p1_bidaf, y1) + F.nll_loss( log_p2_bidaf, y2) nll_meter_bidaf.update(loss_bidaf.item(), batch_size) l_p1 += [log_p1_bidaf.exp()] l_p2 += [log_p2_bidaf.exp()] if (args.save_probabilities): save_prob_bidaf_start += [ log_p1_bidaf.exp().detach().cpu().numpy() ] save_prob_bidaf_end += [ log_p2_bidaf.exp().detach().cpu().numpy() ] if (args.load_path_bidaf_fusion): log_p1_bidaf_fu, log_p2_bidaf_fu = model_bidaf_fu( cw_idxs, cc_idxs, qw_idxs, qc_idxs) loss_bidaf_fu = F.nll_loss(log_p1_bidaf_fu, y1) + F.nll_loss( log_p2_bidaf_fu, y2) nll_meter_bidaf_fu.update(loss_bidaf_fu.item(), batch_size) l_p1 += [log_p1_bidaf_fu.exp()] l_p2 += [log_p2_bidaf_fu.exp()] if (args.save_probabilities): save_prob_bidaf_fu_start += [ log_p1_bidaf_fu.exp().detach().cpu().numpy() ] save_prob_bidaf_fu_end += [ log_p2_bidaf_fu.exp().detach().cpu().numpy() ] p1, p2 = l_p1[0], l_p2[0] for i in range(1, nbr_model): p1 += l_p1[i] p2 += l_p2[i] p1 /= nbr_model p2 /= nbr_model starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2) # Log info progress_bar.update(batch_size) if args.split != 'test': # No labels for the test set, so NLL would be invalid if (args.load_path_qanet): progress_bar.set_postfix(NLL=nll_meter_qanet.avg) elif (args.load_path_bidaf): progress_bar.set_postfix(NLL=nll_meter_bidaf.avg) elif (args.load_path_bidaf_fusion): progress_bar.set_postfix(NLL=nll_meter_bidaf_fu.avg) elif (args.load_path_qanet_old): progress_bar.set_postfix(NLL=nll_meter_qanet_old.avg) elif (args.load_path_qanet_inde): progress_bar.set_postfix(NLL=nll_meter_qanet_inde.avg) elif (args.load_path_qanet_s_e): progress_bar.set_postfix(NLL=nll_meter_qanet_s_e.avg) else: progress_bar.set_postfix(NLL=nll_meter_baseline.avg) idx2pred, 
uuid2pred = util.convert_tokens(gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), args.use_squad_v2) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) if (args.save_probabilities): if (args.load_path_baseline): with open(args.save_dir + "/probs_start", "wb") as fp: #Pickling pickle.dump(save_prob_baseline_start, fp) with open(args.save_dir + "/probs_end", "wb") as fp: #Pickling pickle.dump(save_prob_baseline_end, fp) if (args.load_path_bidaf): with open(args.save_dir + "/probs_start", "wb") as fp: #Pickling pickle.dump(save_prob_bidaf_start, fp) with open(args.save_dir + "/probs_end", "wb") as fp: #Pickling pickle.dump(save_prob_bidaf_end, fp) if (args.load_path_bidaf_fusion): with open(args.save_dir + "/probs_start", "wb") as fp: #Pickling pickle.dump(save_prob_bidaf_fu_start, fp) with open(args.save_dir + "/probs_end", "wb") as fp: #Pickling pickle.dump(save_prob_bidaf_fu_end, fp) if (args.load_path_qanet): with open(args.save_dir + "/probs_start", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_start, fp) with open(args.save_dir + "/probs_end", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_end, fp) if (args.load_path_qanet_old): with open(args.save_dir + "/probs_start", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_old_start, fp) with open(args.save_dir + "/probs_end", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_old_end, fp) if (args.load_path_qanet_inde): with open(args.save_dir + "/probs_start", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_inde_start, fp) with open(args.save_dir + "/probs_end", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_inde_end, fp) if (args.load_path_qanet_s_e): with open(args.save_dir + "/probs_start", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_s_e_start, fp) with open(args.save_dir + "/probs_end", "wb") as fp: #Pickling pickle.dump(save_prob_qanet_s_e_end, fp) # Log results (except for test set, since it does not come with labels) if args.split != 'test': results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2) if (args.load_path_qanet): meter_avg = nll_meter_qanet.avg elif (args.load_path_bidaf): meter_avg = nll_meter_bidaf.avg elif (args.load_path_bidaf_fusion): meter_avg = nll_meter_bidaf_fu.avg elif (args.load_path_qanet_inde): meter_avg = nll_meter_qanet_inde.avg elif (args.load_path_qanet_s_e): meter_avg = nll_meter_qanet_s_e.avg elif (args.load_path_qanet_old): meter_avg = nll_meter_qanet_old.avg else: meter_avg = nll_meter_baseline.avg results_list = [('NLL', meter_avg), ('F1', results['F1']), ('EM', results['EM'])] if args.use_squad_v2: results_list.append(('AvNA', results['AvNA'])) results = OrderedDict(results_list) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'{args.split.title()} {results_str}') # Log to TensorBoard tbx = SummaryWriter(args.save_dir) util.visualize(tbx, pred_dict=pred_dict, eval_path=eval_file, step=0, split=args.split, num_visuals=args.num_visuals) # Write submission file sub_path = join(args.save_dir, args.split + '_' + args.sub_file) log.info(f'Writing submission file to {sub_path}...') with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh: csv_writer = csv.writer(csv_fh, delimiter=',') csv_writer.writerow(['Id', 'Predicted']) for uuid in sorted(sub_dict): csv_writer.writerow([uuid, sub_dict[uuid]])
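# The ensemble loop above sums each loaded model's exponentiated log-probabilities and
# divides by `nbr_model` before calling `util.discretize`. The same averaging step,
# isolated as a hypothetical helper for clarity:
import torch

def average_probs(log_p1_list, log_p2_list):
    # log_p*_list: per-model log-probabilities, each of shape (batch, seq_len)
    p1 = torch.stack([lp.exp() for lp in log_p1_list]).mean(dim=0)
    p2 = torch.stack([lp.exp() for lp in log_p2_list]).mean(dim=0)
    return p1, p2

# The averaged p1, p2 would then be passed to util.discretize exactly as above.
lp_a = torch.log_softmax(torch.randn(2, 5), dim=-1)
lp_b = torch.log_softmax(torch.randn(2, 5), dim=-1)
p1, p2 = average_probs([lp_a, lp_b], [lp_b, lp_a])
print(p1.shape, p2.shape)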
def test(args): # Set up logging log = util.get_logger(args.save_dir, args.name) log.info(f"Args: {dumps(vars(args), indent=4, sort_keys=True)}") device, gpu_ids = util.get_available_devices() args.batch_size *= max(1, len(gpu_ids)) # Get embeddings log.info("Loading embeddings...") word_vectors = util.torch_from_json(args.word_emb_file) # TODO: Hardcode padding_idx padding_idx = 0 # Get model log.info("Building model...") model = WordTransformerQA( dim=args.dim, n_heads=args.n_heads, ff_dim=args.ff_dim, activation=args.activation, dropout=args.dropout, attn_dropout=args.attn_dropout, act_dropout=args.act_dropout, n_layers=args.n_layers, max_positions=args.max_positions, word_vectors=word_vectors, ) model = nn.DataParallel(model, gpu_ids) log.info(f"Loading checkpoint from {args.load_path}...") model = util.load_model(model, args.load_path, gpu_ids, return_step=False) model = model.to(device) model.eval() # Get data loader log.info("Building dataset...") record_file = vars(args)[f"{args.split}_record_file"] dataset = SQuAD(record_file, args.use_squad_v2) data_loader = data.DataLoader( dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn, ) # Evaluate log.info(f"Evaluating on {args.split} split...") nll_meter = stats.AverageMeter() pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission eval_file = vars(args)[f"{args.split}_eval_file"] with open(eval_file, "r") as fh: gold_dict = json_load(fh) with torch.no_grad(), tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader: batch_size = cw_idxs.size(0) _, loss_val, scores = forward(cw_idxs, qw_idxs, y1, y2, padding_idx, args, device, model) nll_meter.update(loss_val, batch_size) # Get F1 and EM scores p1, p2 = model.module.get_prob(scores).split(1, dim=-1) p1, p2 = p1.squeeze(-1), p2.squeeze(-1) starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2) # Log info progress_bar.update(batch_size) if args.split != "test": # No labels for the test set, so NLL would be invalid progress_bar.set_postfix(NLL=nll_meter.avg) idx2pred, uuid2pred = util.convert_tokens( gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), args.use_squad_v2, ) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) # Log results (except for test set, since it does not come with labels) if args.split != "test": results = eval.eval_dicts(gold_dict, pred_dict, args.use_squad_v2) results_list = [ ("NLL", nll_meter.avg), ("F1", results["F1"]), ("EM", results["EM"]), ] if args.use_squad_v2: results_list.append(("AvNA", results["AvNA"])) results = OrderedDict(results_list) # Log to console results_str = ", ".join(f"{k}: {v:05.2f}" for k, v in results.items()) log.info(f"{args.split.title()} {results_str}") # Log to TensorBoard tbx = SummaryWriter(args.save_dir) util.visualize( tbx, pred_dict=pred_dict, eval_path=eval_file, step=0, split=args.split, num_visuals=args.num_visuals, ) # Write submission file if args.split == "dev": sub_path = join(args.save_dir, "val" + "_" + args.sub_file) else: sub_path = join(args.save_dir, args.split + "_" + args.sub_file) log.info(f"Writing submission file to {sub_path}...") eval.write_submission(sub_path, sub_dict)
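# `padding_idx` is hard-coded to 0 above (see the TODO). With that convention, a boolean
# attention mask for the transformer inputs can be derived directly from the index tensors;
# the helper below is illustrative only and is not part of the model code shown here.
import torch

def padding_mask(idxs, padding_idx=0):
    """True at real tokens, False at padding -- shape (batch, seq_len)."""
    return idxs != padding_idx

print(padding_mask(torch.tensor([[5, 7, 0, 0], [3, 0, 0, 0]])))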
def main(args): # Set up logging and devices args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True) log = util.get_logger(args.save_dir, args.name) tbx = SummaryWriter(args.save_dir) device, args.gpu_ids = util.get_available_devices() log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') args.batch_size *= max(1, len(args.gpu_ids)) # Set random seed log.info(f'Using random seed {args.seed}...') random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) # Get size of char vocab with open(args.char2idx_file, 'r') as fh: char_vocab_size = len(json_load(fh)) # Get model log.info('Building model...') model = QANet(word_vectors=word_vectors, hidden_size=args.hidden_size, char_vocab_size = char_vocab_size, char_emb_size = args.char_emb_size, word_char_emb_size = args.word_char_emb_size, drop_prob=args.drop_prob, num_blocks_embd = args.num_blocks_embd, num_conv_embd = args.num_conv_embd, kernel_size = args.kernel_size, num_heads = args.num_heads, num_blocks_model = args.num_blocks_model, num_conv_model = args.num_conv_model, dropout_char = args.dropout_char, dropout_word = args.dropout_word, survival_prob = args.survival_prob) model = nn.DataParallel(model, args.gpu_ids) if args.load_path: log.info(f'Loading checkpoint from {args.load_path}...') model, step = util.load_model(model, args.load_path, args.gpu_ids) else: step = 0 model = model.to(device) model.train() ema = util.EMA(model, args.ema_decay) # Get saver saver = util.CheckpointSaver(args.save_dir, max_checkpoints=args.max_checkpoints, metric_name=args.metric_name, maximize_metric=args.maximize_metric, log=log) # Get optimizer and scheduler #params = filter(lambda param: param.requires_grad, model.parameters()) optimizer = optim.Adam(lr=1, betas=(args.beta1, args.beta2), eps=args.adam_eps, weight_decay=args.l2_wd, params=model.parameters()) cr = args.lr / math.log2(args.warm_up) scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda ee: cr * math.log2(ee + 1) if ee < args.warm_up else args.lr) # Get data loader log.info('Building dataset...') train_dataset = SQuAD(args.train_record_file, args.use_squad_v2) train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, collate_fn=collate_fn) dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) dev_loader = data.DataLoader(dev_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Train log.info('Training...') steps_till_eval = args.eval_steps epoch = step // len(train_dataset) torch.autograd.set_detect_anomaly(True) while epoch != args.num_epochs: epoch += 1 log.info(f'Starting epoch {epoch}...') with torch.enable_grad(), \ tqdm(total=len(train_loader.dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) cc_idxs=cc_idxs.to(device) qc_idxs=qc_idxs.to(device) batch_size = cw_idxs.size(0) optimizer.zero_grad() # Forward log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) loss_val = loss.item() # Backward loss.backward() nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() scheduler.step(step // batch_size) ema(model, step // 
batch_size) # Log info step += batch_size progress_bar.update(batch_size) progress_bar.set_postfix(epoch=epoch, NLL=loss_val) tbx.add_scalar('train/NLL', loss_val, step) tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'], step) steps_till_eval -= batch_size if steps_till_eval <= 0: steps_till_eval = args.eval_steps # Evaluate and save checkpoint log.info(f'Evaluating at step {step}...') ema.assign(model) results, pred_dict = evaluate(model, dev_loader, device, args.dev_eval_file, args.max_ans_len, args.use_squad_v2) saver.save(step, model, results[args.metric_name], device) ema.resume(model) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'Dev {results_str}') # Log to TensorBoard log.info('Visualizing in TensorBoard...') for k, v in results.items(): tbx.add_scalar(f'dev/{k}', v, step) util.visualize(tbx, pred_dict=pred_dict, eval_path=args.dev_eval_file, step=step, split='dev', num_visuals=args.num_visuals)
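# The training script above sets the Adam base learning rate to 1 and lets the LambdaLR
# lambda return the effective rate: a logarithmic ramp cr * log2(step + 1) for the first
# `warm_up` steps, with cr = lr / log2(warm_up), then a constant lr. A standalone sketch
# of that schedule, with assumed example values for lr and warm_up:
import math

def qanet_lr(step, target_lr=0.001, warm_up=1000):
    cr = target_lr / math.log2(warm_up)
    return cr * math.log2(step + 1) if step < warm_up else target_lr

# Effective learning rate at a few points of the ramp
print([round(qanet_lr(s), 6) for s in (0, 10, 100, 999, 5000)])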
def main(args): # Set up logging args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False) log = util.get_logger(args.save_dir, args.name) device, gpu_ids = util.get_available_devices() args.batch_size *= max(1, len(gpu_ids)) log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) models = {} if args.use_ensemble: total_models = 0 for model_name in ['bidaf', 'bidafextra', 'fusionnet']: models_list = [] for model_file in glob.glob( f'{args.load_path}/{model_name}-*/{args.ensemble_models}'): # Get model log.info('Building model...') if model_name == 'bidaf': model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size) elif model_name == 'bidafextra': model = BiDAFExtra(word_vectors=word_vectors, args=args) elif model_name == 'fusionnet': model = FusionNet(word_vectors=word_vectors, args=args) model = nn.DataParallel(model, gpu_ids) log.info(f'Loading checkpoint from {model_file}...') model = util.load_model(model, model_file, gpu_ids, return_step=False) # Load each model on CPU (have plenty of RAM ...) model = model.cpu() model.eval() models_list.append(model) models[model_name] = models_list total_models += len(models_list) log.info(f'Using an ensemble of {total_models} models') else: device, gpu_ids = util.get_available_devices() # Get model log.info('Building model...') if args.model == 'bidaf': model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size) elif args.model == 'bidafextra': model = BiDAFExtra(word_vectors=word_vectors, args=args) elif args.model == 'fusionnet': model = FusionNet(word_vectors=word_vectors, args=args) model = nn.DataParallel(model, gpu_ids) log.info(f'Loading checkpoint from {args.load_path}...') model = util.load_model(model, args.load_path, gpu_ids, return_step=False) model = model.to(device) model.eval() models[args.model] = [model] # Get data loader log.info('Building dataset...') record_file = vars(args)[f'{args.split}_record_file'] dataset = SQuAD(record_file, args) data_loader = data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # Evaluate log.info(f'Evaluating on {args.split} split...') nll_meter = util.AverageMeter() pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission eval_file = vars(args)[f'{args.split}_eval_file'] with open(eval_file, 'r') as fh: gold_dict = json_load(fh) with torch.no_grad(), \ tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, cw_pos, cw_ner, cw_freq, cqw_extra, y1, y2, ids in data_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) p1s = [] p2s = [] for model_name in models: for model in models[model_name]: # Move model to GPU to evaluate model = model.to(device) # Forward if model_name == 'bidaf': log_p1, log_p2 = model.to(device)(cw_idxs, qw_idxs) else: log_p1, log_p2 = model.to(device)(cw_idxs, qw_idxs, cw_pos, cw_ner, cw_freq, cqw_extra) log_p1, log_p2 = log_p1.cpu(), log_p2.cpu() if not args.use_ensemble: y1, y2 = y1.to(device), y2.to(device) log_p1, log_p2 = log_p1.to(device), log_p2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) nll_meter.update(loss.item(), batch_size) # Move model back to CPU to release GPU memory model = model.cpu() # Get F1 and EM scores p1, p2 = log_p1.exp().unsqueeze( -1).cpu(), log_p2.exp().unsqueeze(-1).cpu() p1s.append(p1), 
p2s.append(p2) best_ps = torch.max( torch.cat([ torch.cat(p1s, -1).unsqueeze(-1), torch.cat(p2s, -1).unsqueeze(-1) ], -1), -2)[0] p1, p2 = best_ps[:, :, 0], best_ps[:, :, 1] starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2) # Log info progress_bar.update(batch_size) if args.split != 'test': # No labels for the test set, so NLL would be invalid progress_bar.set_postfix(NLL=nll_meter.avg) idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), args.use_squad_v2) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) # Log results (except for test set, since it does not come with labels) if args.split != 'test': results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2) results_list = [('NLL', nll_meter.avg), ('F1', results['F1']), ('EM', results['EM'])] if args.use_squad_v2: results_list.append(('AvNA', results['AvNA'])) results = OrderedDict(results_list) # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'{args.split.title()} {results_str}') # Log to TensorBoard tbx = SummaryWriter(args.save_dir) util.visualize(tbx, pred_dict=pred_dict, eval_path=eval_file, step=0, split=args.split, num_visuals=args.num_visuals) # Write submission file sub_path = join(args.save_dir, args.split + '_' + args.sub_file) log.info(f'Writing submission file to {sub_path}...') with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh: csv_writer = csv.writer(csv_fh, delimiter=',') csv_writer.writerow(['Id', 'Predicted']) for uuid in sorted(sub_dict): csv_writer.writerow([uuid, sub_dict[uuid]])
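# The ensemble above combines models with a positionwise max: for every token, keep the
# largest start probability and the largest end probability produced by any model, then
# discretize. Written as a small hypothetical helper (shapes simplified relative to the
# tensor juggling in the loop):
import torch

def max_ensemble(p1_list, p2_list):
    # p*_list: per-model probabilities, each of shape (batch, seq_len)
    p1 = torch.stack(p1_list, dim=0).max(dim=0)[0]
    p2 = torch.stack(p2_list, dim=0).max(dim=0)[0]
    return p1, p2

pa = torch.softmax(torch.randn(2, 6), dim=-1)
pb = torch.softmax(torch.randn(2, 6), dim=-1)
print(max_ensemble([pa, pb], [pb, pa]))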
def test(args): # Set up logging log = util.get_logger(args.save_dir, args.name) log.info(f"Args: {dumps(vars(args), indent=4, sort_keys=True)}") device, gpu_ids = util.get_available_devices() args.batch_size *= max(1, len(gpu_ids)) # Get embeddings log.info("Loading embeddings...") word_vectors = util.torch_from_json(args.word_emb_file) # Get model log.info("Building model...") model = BiDAF( word_vectors=word_vectors, hidden_size=args.hidden_size, use_glove=args.use_glove, ) model = nn.DataParallel(model, gpu_ids) log.info(f"Loading checkpoint from {args.load_path}...") model = util.load_model(model, args.load_path, gpu_ids, return_step=False) model = model.to(device) model.eval() # Get data loader log.info("Building dataset...") record_file = vars(args)[f"{args.split}_record_file"] dataset = SQuAD(record_file, args.use_squad_v2) data_loader = data.DataLoader( dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn, ) # Evaluate log.info(f"Evaluating on {args.split} split...") nll_meter = stats.AverageMeter() pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission eval_file = vars(args)[f"{args.split}_eval_file"] with open(eval_file, "r") as fh: gold_dict = json_load(fh) with torch.no_grad(), tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) # Forward log_p1, log_p2 = model(cw_idxs, qw_idxs) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) nll_meter.update(loss.item(), batch_size) # Get F1 and EM scores p1, p2 = log_p1.exp(), log_p2.exp() starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2) # Log info progress_bar.update(batch_size) if args.split != "test": # No labels for the test set, so NLL would be invalid progress_bar.set_postfix(NLL=nll_meter.avg) idx2pred, uuid2pred = util.convert_tokens( gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), args.use_squad_v2, ) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) # Log results (except for test set, since it does not come with labels) if args.split != "test": results = {"NLL": nll_meter.avg} results.update(eval.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)) # Log to console results_str = ", ".join(f"{k}: {v:05.2f}" for k, v in results.items()) log.info(f"{args.split.title()} {results_str}") # Log to TensorBoard tbx = SummaryWriter(args.save_dir) util.visualize( tbx, pred_dict=pred_dict, eval_path=eval_file, step=0, split=args.split, num_visuals=args.num_visuals, ) # Write submission file if args.split == "dev": sub_path = join(args.save_dir, "val" + "_" + args.sub_file) else: sub_path = join(args.save_dir, args.split + "_" + args.sub_file) log.info(f"Writing submission file to {sub_path}...") eval.write_submission(sub_path, sub_dict)
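# For reference, the F1 that `eval_dicts` / `util.eval_dicts` aggregate follows the official
# SQuAD token-overlap metric. A rough, self-contained sketch; normalization details may
# differ from the project's exact implementation.
import re
import string
from collections import Counter

def normalize(s):
    s = s.lower()
    s = ''.join(ch for ch in s if ch not in set(string.punctuation))
    s = re.sub(r'\b(a|an|the)\b', ' ', s)
    return ' '.join(s.split())

def f1_score(prediction, ground_truth):
    pred_toks = normalize(prediction).split()
    gold_toks = normalize(ground_truth).split()
    common = Counter(pred_toks) & Counter(gold_toks)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_toks)
    recall = num_same / len(gold_toks)
    return 2 * precision * recall / (precision + recall)

print(f1_score("the Danish king", "Danish King Cnut"))  # 0.8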