def forward(self, sketchy_prediction, intensive_prediction, log_p1, log_p2,
            max_len=15, use_squad_v2=True):
    """Blank out span predictions for examples judged unanswerable.

    An answerability score is built from two parts: a blend of the sketchy
    and intensive model predictions (weighted by ``self.beta``) and a span
    confidence score (weighted against the blend by ``self.lam``).  Every
    example whose combined score is less than or equal to ``self.ans`` has
    its whole row in both log-probability tensors overwritten with 0.

    Args:
        sketchy_prediction: Answerability estimate from the sketchy model.
            # assumes one value per example — TODO confirm against caller
        intensive_prediction: Answerability estimate from the intensive
            model, same layout as ``sketchy_prediction``.
        log_p1: Start log-probabilities, indexed as (example, position).
        log_p2: End log-probabilities, indexed as (example, position).
        max_len: Forwarded unchanged to ``discretize``.
        use_squad_v2: Forwarded unchanged to ``discretize``.

    Returns:
        Tuple ``(l_p1, l_p2)``: clones of ``log_p1`` / ``log_p2`` with the
        rows of low-scoring examples set to 0.  The inputs are not modified.
    """
    s_in = log_p1.exp()
    e_in = log_p2.exp()
    starts, ends = discretize(s_in, e_in, max_len, use_squad_v2)

    # Combine the answerability estimates of the sketchy and intensive models.
    pred_answerable = (self.beta * intensive_prediction
                       + (1 - self.beta) * sketchy_prediction)

    # Confidence of the intensive model in its chosen span: probability of the
    # selected (start, end) pair minus the probability of position 0 for both.
    # Computed with vectorised indexing on the inputs' own device rather than
    # a per-example Python loop pinned to 'cuda:0'.
    rows = torch.arange(s_in.shape[0], device=s_in.device)
    has = s_in[rows, starts] * e_in[rows, ends]
    null = s_in[:, 0] * e_in[:, 0]
    span_answerable = has - null

    # Combine model answerability with span confidence.
    answerable = (self.lam * pred_answerable
                  + (1 - self.lam) * span_answerable)

    # Work on copies so the caller's tensors stay untouched.
    l_p1 = log_p1.clone()
    l_p2 = log_p2.clone()
    rejected = answerable <= self.ans
    l_p1[rejected] = 0
    l_p2[rejected] = 0

    return l_p1, l_p2
def evaluate(args, model, data_loader, device, eval_file, max_len, use_squad_v2):
    """Score `model` on every batch of `data_loader`.

    The model is put in eval mode for the pass and switched back to train
    mode before returning.

    Args:
        args: Namespace; only ``args.model`` is read, to pick the call form.
        model: Callable returning ``(log_p1, log_p2)`` for a batch.
        data_loader: Yields 11-tuples of batch tensors (see unpacking below).
        device: Device the batch tensors are moved to.
        eval_file: Path of the JSON file holding the gold answers.
        max_len: Passed to ``util.discretize``.
        use_squad_v2: Passed to the ``util`` helpers; also adds 'AvNA'.

    Returns:
        ``(results, pred_dict)``: an OrderedDict with 'NLL', 'F1', 'EM'
        (plus 'AvNA' when `use_squad_v2`) and the accumulated predictions.
    """
    meter = util.AverageMeter()

    model.eval()
    predictions = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)

    with torch.no_grad(), tqdm(total=len(data_loader.dataset)) as progress_bar:
        for batch in data_loader:
            (cw_idxs, cc_idxs, qw_idxs, qc_idxs,
             cw_pos, cw_ner, cw_freq, cqw_extra,
             y1, y2, ids) = batch

            # Move everything a model may consume onto the target device.
            cw_idxs, qw_idxs = cw_idxs.to(device), qw_idxs.to(device)
            cw_pos, cw_ner = cw_pos.to(device), cw_ner.to(device)
            cw_freq, cqw_extra = cw_freq.to(device), cqw_extra.to(device)
            batch_size = cw_idxs.size(0)

            # 'bidaf' takes word indices only; any other model also receives
            # the extra per-token feature tensors.
            if args.model == 'bidaf':
                log_p1, log_p2 = model(cw_idxs, qw_idxs)
            else:
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cw_pos, cw_ner,
                                       cw_freq, cqw_extra)

            y1 = y1.to(device)
            y2 = y2.to(device)
            start_loss = F.nll_loss(log_p1, y1)
            end_loss = F.nll_loss(log_p2, y2)
            meter.update((start_loss + end_loss).item(), batch_size)

            # Turn probabilities into start/end indices.
            starts, ends = util.discretize(log_p1.exp(), log_p2.exp(),
                                           max_len, use_squad_v2)

            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=meter.avg)

            batch_preds, _ = util.convert_tokens(gold_dict,
                                                 ids.tolist(),
                                                 starts.tolist(),
                                                 ends.tolist(),
                                                 use_squad_v2)
            predictions.update(batch_preds)

    model.train()

    scores = util.eval_dicts(gold_dict, predictions, use_squad_v2)
    summary = OrderedDict()
    summary['NLL'] = meter.avg
    summary['F1'] = scores['F1']
    summary['EM'] = scores['EM']
    if use_squad_v2:
        summary['AvNA'] = scores['AvNA']

    return summary, predictions
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2, args):
    """Score `model` on `data_loader`, optionally feeding BERT embeddings.

    The model is put in eval mode for the pass and switched back to train
    mode before returning.

    Args:
        model: Callable returning ``(log_p1, log_p2)`` for a batch.
        data_loader: Yields 7-tuples of batch tensors (see unpacking below).
        device: Device the batch tensors are moved to; also passed to `model`.
        eval_file: Path of the JSON file holding the gold answers.
        max_len: Passed to ``util.discretize``.
        use_squad_v2: Passed to the ``util`` helpers; also adds 'AvNA'.
        args: Namespace providing ``para_limit``, ``ques_limit`` and
            ``model_type``.

    Returns:
        ``(results, pred_dict)``: an OrderedDict with 'NLL', 'F1', 'EM'
        (plus 'AvNA' when `use_squad_v2`) and the accumulated predictions.
    """
    meter = util.AverageMeter()

    model.eval()
    predictions = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)

    with torch.no_grad(), tqdm(total=len(data_loader.dataset)) as progress_bar:
        for batch in data_loader:
            cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids = batch

            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # BERT additions: length limits always go to the model; the
            # embeddings are only looked up for "bert" model types.
            max_context_len = args.para_limit
            max_question_len = args.ques_limit
            bert_dev_embeddings = None
            if "bert" in args.model_type:
                bert_dev_embeddings = get_embeddings("dev", ids,
                                                     args.para_limit,
                                                     args.ques_limit)

            log_p1, log_p2 = model(cw_idxs, qw_idxs, bert_dev_embeddings,
                                   max_context_len, max_question_len, device)

            y1 = y1.to(device)
            y2 = y2.to(device)
            start_loss = F.nll_loss(log_p1, y1)
            end_loss = F.nll_loss(log_p2, y2)
            meter.update((start_loss + end_loss).item(), batch_size)

            # Turn probabilities into start/end indices.
            starts, ends = util.discretize(log_p1.exp(), log_p2.exp(),
                                           max_len, use_squad_v2)

            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=meter.avg)

            batch_preds, _ = util.convert_tokens(gold_dict,
                                                 ids.tolist(),
                                                 starts.tolist(),
                                                 ends.tolist(),
                                                 use_squad_v2)
            predictions.update(batch_preds)

    model.train()

    scores = util.eval_dicts(gold_dict, predictions, use_squad_v2)
    summary = OrderedDict()
    summary['NLL'] = meter.avg
    summary['F1'] = scores['F1']
    summary['EM'] = scores['EM']
    if use_squad_v2:
        summary['AvNA'] = scores['AvNA']

    return summary, predictions
def test1():
    """Run ``util.discretize`` on one random 6-position example.

    Draws random start and end scores, normalises each with a softmax over
    the last dimension, and returns the indices chosen with ``max_len=3``
    and ``no_answer=True``.

    Returns:
        Tuple ``(start_ids, end_ids)`` as produced by ``util.discretize``.
    """
    batch, c_len = 1, 6

    # Draw start first, then end, so the random stream is consumed in the
    # same order as before (softmax itself draws no random numbers).
    start = torch.softmax(torch.randn(batch, c_len), -1)
    end = torch.softmax(torch.randn(batch, c_len), -1)

    start_ids, end_ids = util.discretize(start, end, max_len = 3, no_answer = True)
    return start_ids, end_ids
def get_reward_for_arm(row, arm):
    """Return the reward for pulling `arm` on the patient in `row`.

    The patient's 'Therapeutic Dose of Warfarin' is converted to float and
    bucketed by ``util.discretize``.  Arms are compared 1-based: the reward
    is 0 when ``arm - 1`` equals the bucket and -1 otherwise.

    Args:
        row: Mapping with 'Therapeutic Dose of Warfarin' and
            'PharmGKB Subject ID' entries.
        arm: The arm that was chosen.

    Returns:
        0 for the matching arm, -1 for any other.
    """
    dose = row['Therapeutic Dose of Warfarin']
    true_arm = util.discretize(float(dose))

    if arm - 1 == true_arm:
        reward = 0
    else:
        reward = -1

    message = 'ID {} - Arm_{} Therapeutic Dose of Warfarin = {}|{} => {}'.format(
        row['PharmGKB Subject ID'], arm, dose, true_arm, reward)
    log.debug(message)
    return reward
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2):
    """Score `model` on `data_loader`, printing the largest predicted indices.

    The model is put in eval mode for the pass and switched back to train
    mode before returning.

    NOTE(review): the forward pass and loss computation in this function had
    been lost inside redacted debug comments, leaving `log_p1`, `log_p2` and
    `loss` undefined.  They are restored here in the form the sibling
    evaluation routines use for word+char index inputs — confirm the model's
    call signature.

    Args:
        model: Callable returning ``(log_p1, log_p2)`` for a batch.
        data_loader: Yields 7-tuples of batch tensors (see unpacking below).
        device: Device the batch tensors are moved to.
        eval_file: Path of the JSON file holding the gold answers.
        max_len: Passed to ``util.discretize``.
        use_squad_v2: Passed to the ``util`` helpers; also adds 'AvNA'.

    Returns:
        ``(results, pred_dict)``: an OrderedDict with 'NLL', 'F1', 'EM'
        (plus 'AvNA' when `use_squad_v2`) and the accumulated predictions.
    """
    nll_meter = util.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)

    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            cc_idxs = cc_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            qc_idxs = qc_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward (restored, see NOTE in the docstring)
            log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            # Debug output: largest start / end index chosen in this batch.
            print('Max start idx score: ', torch.max(starts))
            print('Max End idx score: ', torch.max(ends))

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict,
                                           ids.tolist(),
                                           starts.tolist(),
                                           ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg),
                    ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2):
    """Score a memory-carrying model on every batch of `data_loader`.

    The model returns a `mems` value with each call, which is unpacked into
    the next call; the first call receives three empty tuples.  The model is
    put in eval mode for the pass and switched back to train mode before
    returning.

    Args:
        model: Callable returning ``(log_p1, log_p2, mems)`` for a batch.
        data_loader: Yields 7-tuples of batch tensors (see unpacking below).
        device: Device the batch tensors are moved to.
        eval_file: Path of the JSON file holding the gold answers.
        max_len: Passed to ``util.discretize``.
        use_squad_v2: Passed to the ``util`` helpers; also adds 'AvNA'.

    Returns:
        ``(results, pred_dict)``: an OrderedDict with 'NLL', 'F1', 'EM'
        (plus 'AvNA' when `use_squad_v2`) and the accumulated predictions.
    """
    meter = util.AverageMeter()
    loss_f = torch.nn.CrossEntropyLoss()

    model.eval()
    predictions = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)

    # Memory state threaded from one batch to the next.
    mems = (tuple(), tuple(), tuple())

    with torch.no_grad(), tqdm(total=len(data_loader.dataset)) as progress_bar:
        for batch in data_loader:
            cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids = batch

            cw_idxs, qw_idxs = cw_idxs.to(device), qw_idxs.to(device)
            cc_idxs, qc_idxs = cc_idxs.to(device), qc_idxs.to(device)
            batch_size = cw_idxs.size(0)

            log_p1, log_p2, mems = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs,
                                         *mems)

            y1 = y1.to(device)
            y2 = y2.to(device)
            # NOTE(review): CrossEntropyLoss applies log-softmax itself, yet
            # the outputs are exp()'d below as if they were log-probabilities
            # — confirm which of the two the model emits.
            combined = loss_f(log_p1, y1) + loss_f(log_p2, y2)
            loss = torch.mean(combined)
            meter.update(loss.item(), batch_size)

            # Turn probabilities into start/end indices.
            starts, ends = util.discretize(log_p1.exp(), log_p2.exp(),
                                           max_len, use_squad_v2)

            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=meter.avg)

            batch_preds, _ = util.convert_tokens(gold_dict,
                                                 ids.tolist(),
                                                 starts.tolist(),
                                                 ends.tolist(),
                                                 use_squad_v2)
            predictions.update(batch_preds)

    model.train()

    scores = util.eval_dicts(gold_dict, predictions, use_squad_v2)
    summary = OrderedDict()
    summary['NLL'] = meter.avg
    summary['F1'] = scores['F1']
    summary['EM'] = scores['EM']
    if use_squad_v2:
        summary['AvNA'] = scores['AvNA']

    return summary, predictions
def AverageProbs(args, p_start, p_end, id_quest, gold_dict):
    """Ensemble several models by averaging their probabilities, then score.

    For each batch, the start and end probabilities of all models are
    averaged with equal weight, discretised into spans, and converted into
    predictions.  The resulting metrics are printed and a submission CSV is
    written under ``../save/test/<args.split>/``.

    Args:
        args: Namespace providing ``max_ans_len``, ``use_squad_v2`` and
            ``split``.
        p_start: ``p_start[model][batch]`` is a NumPy array of start
            probabilities.
        p_end: Same layout as `p_start`, for end probabilities.
        id_quest: ``id_quest[batch]`` is the tensor of example ids.
        gold_dict: Gold answers, as expected by the ``util`` helpers.

    Returns:
        None.  Results are printed and written to disk.
    """
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}   # Predictions for submission

    nbr_model = len(p_start)
    nbr_batch = len(p_start[0])

    for j in range(nbr_batch):
        # Accumulate in fresh float64 buffers, then divide by the model count.
        p1 = np.zeros(p_start[0][j].shape)
        p2 = np.zeros(p_start[0][j].shape)
        for model_starts, model_ends in zip(p_start, p_end):
            p1 += model_starts[j]
            p2 += model_ends[j]
        p1 /= nbr_model
        p2 /= nbr_model

        ids = id_quest[j]

        p1 = torch.from_numpy(p1).float().cuda()
        p2 = torch.from_numpy(p2).float().cuda()
        ids = ids.cuda()

        starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                       args.use_squad_v2)

        idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                  ids.tolist(),
                                                  starts.tolist(),
                                                  ends.tolist(),
                                                  args.use_squad_v2)
        pred_dict.update(idx2pred)
        sub_dict.update(uuid2pred)

    results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
    results_list = [('F1', results['F1']),
                    ('EM', results['EM'])]
    if args.use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    # Log to console
    results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
    print(f'{args.split.title()} {results_str}')

    # Write submission file.  The split name comes from `args`, like every
    # other use in this function; the bare name `split` is not defined here.
    sub_path = join("../save/test/" + args.split + "/submission.csv")
    with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=',')
        csv_writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2):
    """Score a word-index-only model on every batch of `data_loader`.

    The model is put in eval mode for the pass and switched back to train
    mode before returning.

    Args:
        model: Callable returning ``(log_p1, log_p2)`` for
            ``(cw_idxs, qw_idxs)``.
        data_loader: Yields 7-tuples of batch tensors (see unpacking below).
        device: Device the batch tensors are moved to.
        eval_file: Path of the JSON file holding the gold answers.
        max_len: Passed to ``util.discretize``.
        use_squad_v2: Passed to the ``util`` helpers; also adds 'AvNA'.

    Returns:
        ``(results, pred_dict)``: an OrderedDict with 'NLL', 'F1', 'EM'
        (plus 'AvNA' when `use_squad_v2`) and the accumulated predictions.
    """
    meter = util.AverageMeter()  # running average of the batch losses

    model.eval()
    predictions = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)

    with torch.no_grad(), tqdm(total=len(data_loader.dataset)) as progress_bar:
        for batch in data_loader:
            cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids = batch

            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            log_p1, log_p2 = model(cw_idxs, qw_idxs)

            y1 = y1.to(device)
            y2 = y2.to(device)
            start_loss = F.nll_loss(log_p1, y1)
            end_loss = F.nll_loss(log_p2, y2)
            meter.update((start_loss + end_loss).item(), batch_size)

            # exp() of the log-probabilities, discretised into start and end
            # index tensors.
            starts, ends = util.discretize(log_p1.exp(), log_p2.exp(),
                                           max_len, use_squad_v2)

            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=meter.avg)

            # Convert the index predictions to tokens from the context.
            batch_preds, _ = util.convert_tokens(gold_dict,
                                                 ids.tolist(),
                                                 starts.tolist(),
                                                 ends.tolist(),
                                                 use_squad_v2)
            predictions.update(batch_preds)

    model.train()

    scores = util.eval_dicts(gold_dict, predictions, use_squad_v2)
    summary = OrderedDict()
    summary['NLL'] = meter.avg
    summary['F1'] = scores['F1']
    summary['EM'] = scores['EM']
    if use_squad_v2:
        summary['AvNA'] = scores['AvNA']

    return summary, predictions
def compute_arm_index(self, data):
    """Map each row of `data` to an arm index via a fixed linear dose model.

    A column of ones is prepended to the feature columns selected by
    ``column_mapping``, the result is multiplied by ``self.dose_params``,
    the product is squared, and ``util.discretize`` buckets it into arms.

    Args:
        data: 2-D tensor with one row per example; the columns listed in
            ``column_mapping.values()`` are used as model inputs.

    Returns:
        LongTensor of arm indexes on the same device as `data`.
    """
    # Leading column of ones pairs with the first entry of the parameters.
    ones = torch.ones((data.size(0), 1),
                      dtype=torch.float,
                      device=data.device)
    features = data[:, list(column_mapping.values())]
    # torch.cat keeps the inputs' device, so no extra .to() is needed (the
    # previous bare `dose_inputs.to(...)` discarded its result anyway).
    dose_inputs = torch.cat([ones, features], 1)

    dose_params_tensor = torch.FloatTensor(self.dose_params).unsqueeze(-1)
    dose_params_tensor = dose_params_tensor.to(data.device)

    dose = torch.matmul(dose_inputs, dose_params_tensor)
    # presumably the linear model predicts the square root of the dose —
    # TODO confirm against the source of `dose_params`
    dose = dose**2

    # Discretisation runs on CPU; the indexes then go back to data's device.
    arm_indexes = torch.LongTensor(util.discretize(dose.cpu()))
    return arm_indexes.to(data.device)
def process_sample(sample, model, gold_dict=None):
    """Run one batch through `model` and compute its loss.

    Reads the module-level ``device`` and ``args``.

    Args:
        sample: 7-tuple ``(cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids)``.
        model: Callable returning ``(log_p1, log_p2)`` for the four index
            tensors.
        gold_dict: Gold answers.  When truthy, predictions are produced as
            well; otherwise they are skipped.

    Returns:
        ``(loss, batch_size, preds)`` where `loss` is the summed start/end
        NLL tensor and `preds` is None when `gold_dict` is falsy.
    """
    cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids = sample
    batch_size = cw_idxs.size(0)

    inputs = [t.to(device) for t in (cw_idxs, cc_idxs, qw_idxs, qc_idxs)]
    log_p1, log_p2 = model(*inputs)

    y1 = y1.to(device)
    y2 = y2.to(device)
    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)

    # Without gold answers there is nothing to convert predictions against.
    if not gold_dict:
        return loss, batch_size, None

    starts, ends = util.discretize(log_p1.exp(), log_p2.exp(),
                                   args.max_ans_len, args.use_squad_v2)
    preds, _ = util.convert_tokens(gold_dict,
                                   ids.tolist(),
                                   starts.tolist(),
                                   ends.tolist(),
                                   args.use_squad_v2)
    return loss, batch_size, preds
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2):
    """Score a word+char model on every batch of `data_loader`.

    The model is put in eval mode for the pass and switched back to train
    mode before returning.  The whole pass runs under ``torch.no_grad()`` so
    no autograd graph is built during evaluation.

    Args:
        model: Callable returning ``(log_p1, log_p2)`` for
            ``(cw_idxs, cc_idxs, qw_idxs, qc_idxs)``.
        data_loader: Yields 7-tuples of batch tensors (see unpacking below).
        device: Device the batch tensors are moved to.
        eval_file: Path of the JSON file holding the gold answers.
        max_len: Passed to ``util.discretize``.
        use_squad_v2: Passed to the ``util`` helpers; also adds 'AvNA'.

    Returns:
        ``(results, pred_dict)``: an OrderedDict with 'NLL', 'F1', 'EM'
        (plus 'AvNA' when `use_squad_v2`) and the accumulated predictions.
    """
    # Local import keeps this block self-contained; harmless when the module
    # already imports torch.
    import torch

    nll_meter = util.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)

    # Gradients are never needed here; tracking them only wastes memory.
    with torch.no_grad():
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            cc_idxs = cc_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            qc_idxs = qc_idxs.to(device)
            batch_size = cc_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            preds, _ = util.convert_tokens(gold_dict,
                                           ids.tolist(),
                                           starts.tolist(),
                                           ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg),
                    ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict
def WeightedAverage(args, p_start, p_end, weigths, id_quest, gold_dict):
    """Ensemble several models by a weighted sum of probabilities, then score.

    For each batch, every model's start and end probabilities are scaled by
    that model's weight and summed (the sum is not renormalised), then
    discretised into spans and converted into predictions.  The resulting
    metrics are printed.

    Args:
        args: Namespace providing ``max_ans_len``, ``use_squad_v2`` and
            ``split``.
        p_start: ``p_start[model][batch]`` is a NumPy array of start
            probabilities.
        p_end: Same layout as `p_start`, for end probabilities.
        weigths: ``weigths[model]`` is the weight applied to that model.
        id_quest: ``id_quest[batch]`` is the tensor of example ids.
        gold_dict: Gold answers, as expected by the ``util`` helpers.

    Returns:
        None.  Results are printed to the console.
    """
    tensorboard_preds = {}  # Predictions for TensorBoard
    submission_preds = {}   # Predictions for submission

    nbr_batch = len(p_start[0])

    for j in range(nbr_batch):
        # Both accumulators take their shape from the first model's start
        # probabilities.
        shape = p_start[0][j].shape
        p1 = np.zeros(shape)
        p2 = np.zeros(shape)
        for m, model_starts in enumerate(p_start):
            w = weigths[m]
            p1 += w * model_starts[j]
            p2 += w * p_end[m][j]

        ids = id_quest[j]

        p1 = torch.from_numpy(p1).float().cuda()
        p2 = torch.from_numpy(p2).float().cuda()
        ids = ids.cuda()

        starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                       args.use_squad_v2)

        idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                  ids.tolist(),
                                                  starts.tolist(),
                                                  ends.tolist(),
                                                  args.use_squad_v2)
        tensorboard_preds.update(idx2pred)
        submission_preds.update(uuid2pred)

    scores = util.eval_dicts(gold_dict, tensorboard_preds, args.use_squad_v2)
    summary = OrderedDict()
    summary['F1'] = scores['F1']
    summary['EM'] = scores['EM']
    if args.use_squad_v2:
        summary['AvNA'] = scores['AvNA']

    # Log to console
    results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in summary.items())
    print(f'{args.split.title()} {results_str}')
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2,
             enable_EM, enable_posner):
    """Score a model that takes optional exact-match and POS/NER features.

    The model is put in eval mode for the pass and switched back to train
    mode before returning.  The loss weights target index 0 five times as
    heavily as every other index.

    Args:
        model: Callable returning ``(log_p1, log_p2)`` for the eight inputs
            passed below.
        data_loader: Yields 11-tuples of batch tensors (see unpacking below).
        device: Device the batch tensors are moved to.
        eval_file: Path of the JSON file holding the gold answers.
        max_len: Passed to ``util.discretize``.
        use_squad_v2: Passed to the ``util`` helpers; also adds 'AvNA'.
        enable_EM: When true, `cwf` and `lemma_indicators` are fed to the
            model; otherwise None is passed for both.
        enable_posner: When true, `c_posner` and `q_posner` are fed to the
            model; otherwise None is passed for both.

    Returns:
        ``(results, pred_dict)``: an OrderedDict with 'NLL', 'F1', 'EM'
        (plus 'AvNA' when `use_squad_v2`) and the accumulated predictions.
    """
    meter = util.AverageMeter()

    model.eval()
    predictions = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)

    with torch.no_grad(), tqdm(total=len(data_loader.dataset)) as progress_bar:
        for batch in data_loader:
            (cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids,
             cwf, lemma_indicators, c_posner, q_posner) = batch

            cc_idxs, qc_idxs = cc_idxs.to(device), qc_idxs.to(device)
            cw_idxs, qw_idxs = cw_idxs.to(device), qw_idxs.to(device)

            # Optional feature groups: moved to the device when enabled,
            # replaced by None when disabled.
            cwf = cwf.to(device) if enable_EM else None
            lemma_indicators = (lemma_indicators.to(device)
                                if enable_EM else None)
            c_posner = c_posner.to(device) if enable_posner else None
            q_posner = q_posner.to(device) if enable_posner else None

            batch_size = cw_idxs.size(0)

            log_p1, log_p2 = model(cc_idxs, qc_idxs, cw_idxs, qw_idxs, cwf,
                                   lemma_indicators, c_posner, q_posner)

            y1 = y1.to(device)
            y2 = y2.to(device)

            # One weight per output position, with index 0 weighted 5x.
            # presumably index 0 is the no-answer position — TODO confirm
            weight = torch.ones(log_p1.size(1), device=device)
            weight[0] = 5
            start_loss = F.nll_loss(log_p1, y1, weight=weight)
            end_loss = F.nll_loss(log_p2, y2, weight=weight)
            meter.update((start_loss + end_loss).item(), batch_size)

            # Turn probabilities into start/end indices.
            starts, ends = util.discretize(log_p1.exp(), log_p2.exp(),
                                           max_len, use_squad_v2)

            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=meter.avg)

            batch_preds, _ = util.convert_tokens(gold_dict,
                                                 ids.tolist(),
                                                 starts.tolist(),
                                                 ends.tolist(),
                                                 use_squad_v2)
            predictions.update(batch_preds)

    model.train()

    scores = util.eval_dicts(gold_dict, predictions, use_squad_v2)
    summary = OrderedDict()
    summary['NLL'] = meter.avg
    summary['F1'] = scores['F1']
    summary['EM'] = scores['EM']
    if use_squad_v2:
        summary['AvNA'] = scores['AvNA']

    return summary, predictions
def main(args):
    """Run a BiDAF checkpoint over the local ``my_test`` files and print
    every prediction.

    The dataset and gold-answer paths are hard-coded to
    ``./data/my_test.npz`` and ``./data/my_test_eval.json``; the record and
    eval file entries of `args` are not consulted.

    Args:
        args: Namespace providing ``save_dir``, ``name``, ``batch_size``,
            ``word_emb_file``, ``hidden_size``, ``load_path``,
            ``use_squad_v2``, ``num_workers``, ``split`` and
            ``max_ans_len``.  ``save_dir`` and ``batch_size`` are updated in
            place.
    """
    # Output directory (file logging is disabled in this variant).
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Embeddings
    print('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Model
    print('Building model...')
    model = nn.DataParallel(
        BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size),
        gpu_ids)
    print(f'Loading checkpoint from {args.load_path}...')
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Data loader over the fixed local test file
    print('Building dataset...')
    dataset = SQuAD("./data/my_test.npz", args.use_squad_v2)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Evaluation
    print(f'Evaluating on {args.split} split...')
    meter = util.AverageMeter()
    index_preds = {}  # Predictions for TensorBoard
    uuid_preds = {}   # Predictions for submission
    with open("./data/my_test_eval.json", 'r') as fh:
        gold_dict = json_load(fh)

    with torch.no_grad(), tqdm(total=len(dataset)) as progress_bar:
        for batch in data_loader:
            cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids = batch

            # Dump the raw index tensors of the batch for inspection.
            print("viewing the dataset")
            print(cw_idxs, cc_idxs, qw_idxs, qc_idxs)

            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            log_p1, log_p2 = model(cw_idxs, qw_idxs)

            y1 = y1.to(device)
            y2 = y2.to(device)
            start_loss = F.nll_loss(log_p1, y1)
            end_loss = F.nll_loss(log_p2, y2)
            meter.update((start_loss + end_loss).item(), batch_size)

            # Turn probabilities into start/end indices.
            starts, ends = util.discretize(log_p1.exp(), log_p2.exp(),
                                           args.max_ans_len,
                                           args.use_squad_v2)

            # The NLL postfix is deliberately not shown in this variant.
            progress_bar.update(batch_size)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                      ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            index_preds.update(idx2pred)
            uuid_preds.update(uuid2pred)

    print("my evaluation ....")
    for key, answer in index_preds.items():
        print(key, answer)

    for key, answer in uuid_preds.items():
        print(key, answer)
def main(args): # Set up logging args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False) log = util.get_logger(args.save_dir, args.name) log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}') device, gpu_ids = util.get_available_devices() args.batch_size *= max(1, len(gpu_ids)) # Get embeddings log.info('Loading embeddings...') word_vectors = util.torch_from_json(args.word_emb_file) # Get model log.info('Building model...') if args.model == 'bidaf': model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size) elif args.model == 'bidafextra': model = BiDAFExtra(word_vectors=word_vectors, args=args) elif args.model == 'fusionnet': model = FusionNet(word_vectors=word_vectors, args=args) model = nn.DataParallel(model, gpu_ids) log.info(f'Loading checkpoint from {args.load_path}...') model = util.load_model(model, args.load_path, gpu_ids, return_step=False) model = model.to(device) model.eval() # Get data loader log.info('Building dataset...') record_file = vars(args)[f'{args.split}_record_file'] dataset = SQuAD(record_file, args) data_loader = data.DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers, collate_fn=collate_fn) # print("*"*80) # print(len(dataset.question_idxs)) # for question_idx in dataset.question_idxs: # print(question_idx) # print("*" * 80) # print(self.question_idxs[question_idx]) # self.question_idxs[idx] # print("data_loader: ",data_loader) # Evaluate log.info(f'Evaluating on {args.split} split...') nll_meter = util.AverageMeter() pred_dict = {} # Predictions for TensorBoard sub_dict = {} # Predictions for submission eval_file = vars(args)[f'{args.split}_eval_file'] with open(eval_file, 'r') as fh: gold_dict = json_load(fh) # create statistics # print("*"*80) # print(len(gold_dict)) # print(gold_dict['1']['question']) count_questions_type = defaultdict(lambda: 0) audit_trail_from_question_type = defaultdict(lambda: []) list_of_interrogative_pronouns = [ "what", "whose", 
"why", "which", "where", "when", "how", "who", "whom" ] for index in range(1, len(gold_dict)): # transform the question in lower case to simplify the analysis, thus losing the benefit of the capital letters # possibly indicating the position of the interrogative pronoun in the sentence. question_lower_case = gold_dict[str(index)]['question'].lower() list_question_lower_case_with_punctuation = question_lower_case.translate( {ord(i): " " for i in "'"}).split() # question_lower_case = [] for item in list_question_lower_case_with_punctuation: question_lower_case.append( item.translate({ord(i): "" for i in ",.<>!@£$%^&*()_-+=?"})) # defining a variable for the first word first_word_question_lower_case = question_lower_case[0] # defining variable for the second word second_word_question_lower_case = question_lower_case[1] # defining variable for the first and second word combined_first_and_second_words = first_word_question_lower_case + " " + second_word_question_lower_case #printing on the screen test for debugging purpose # Analyzing the sentence if first_word_question_lower_case in list_of_interrogative_pronouns: count_questions_type[first_word_question_lower_case] += 1 audit_trail_from_question_type[ first_word_question_lower_case].append(str(index)) # composed question starting by in elif first_word_question_lower_case == "in": if second_word_question_lower_case in list_of_interrogative_pronouns and second_word_question_lower_case != "whose": count_questions_type[combined_first_and_second_words] += 1 audit_trail_from_question_type[ combined_first_and_second_words].append(str(index)) else: pronoun = find_first_interrogative_pronoun( list_of_interrogative_pronouns, question_lower_case) count_questions_type[pronoun] += 1 audit_trail_from_question_type[pronoun].append(str(index)) # composed question starting by by elif first_word_question_lower_case == "by": if second_word_question_lower_case in list_of_interrogative_pronouns \ and second_word_question_lower_case 
!="whom"\ and second_word_question_lower_case !="which"\ and second_word_question_lower_case !="when"\ and second_word_question_lower_case !="how": count_questions_type[combined_first_and_second_words] += 1 audit_trail_from_question_type[ combined_first_and_second_words].append(str(index)) else: pronoun = find_first_interrogative_pronoun( list_of_interrogative_pronouns, question_lower_case) count_questions_type[pronoun] += 1 audit_trail_from_question_type[pronoun].append(str(index)) else: pronoun = find_first_interrogative_pronoun( list_of_interrogative_pronouns, question_lower_case) #if pronoun =="": # print(">>", question_lower_case) # print("@@@", gold_dict[str(index)]['question']) count_questions_type[pronoun] += 1 audit_trail_from_question_type[pronoun].append(str(index)) # if pronoun =="": # print(">>", question_lower_case.split()) # print() #if first_word_question_lower_case == "if": # print(">>", question_lower_case.split()) # print(count_questions_type) # if gold_dict[str(index)]['question'].lower().split()[0] == "in": # print(gold_dict[str(index)]['question']) reverse_dict_by_value = OrderedDict( sorted(count_questions_type.items(), key=lambda x: x[1])) # print(count_questions_type) total_questions = sum(count_questions_type.values()) # print(reverse_dict) #for k, v in reverse_dict_by_value.items(): # print( "%s: %s and in percentage: %s" % (k, v, 100*v/total_questions)) #print(audit_trail_from_question_type) # exit() with torch.no_grad(), \ tqdm(total=len(dataset)) as progress_bar: for cw_idxs, cc_idxs, qw_idxs, qc_idxs, cw_pos, cw_ner, cw_freq, cqw_extra, y1, y2, ids in data_loader: # Setup for forward cw_idxs = cw_idxs.to(device) qw_idxs = qw_idxs.to(device) batch_size = cw_idxs.size(0) # Forward if args.model == 'bidaf': log_p1, log_p2 = model(cw_idxs, qw_idxs) else: log_p1, log_p2 = model(cw_idxs, qw_idxs, cw_pos, cw_ner, cw_freq, cqw_extra) y1, y2 = y1.to(device), y2.to(device) loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2) 
nll_meter.update(loss.item(), batch_size) # Get F1 and EM scores p1, p2 = log_p1.exp(), log_p2.exp() starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2) # Log info progress_bar.update(batch_size) if args.split != 'test': # No labels for the test set, so NLL would be invalid progress_bar.set_postfix(NLL=nll_meter.avg) idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(), starts.tolist(), ends.tolist(), args.use_squad_v2) pred_dict.update(idx2pred) sub_dict.update(uuid2pred) # Log results (except for test set, since it does not come with labels) if args.split != 'test': results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2) # Printing information for questions without interrogative pronouns """" print("len(gold_dict): ", len(gold_dict)) print("len(pred_dict): ", len(pred_dict)) print("Is gold_dict.keys() identical to pred_dict.keys(): ", gold_dict.keys()==pred_dict.keys()) if gold_dict.keys()!=pred_dict.keys(): for key in gold_dict.keys(): if key not in pred_dict.keys(): print("key ", key, " missing in pred_dict.keys(") """ results_list = [('NLL', nll_meter.avg), ('F1', results['F1']), ('EM', results['EM'])] if args.use_squad_v2: results_list.append(('AvNA', results['AvNA'])) results = OrderedDict(results_list) # Computing the F1 score for each type of question # # audit_trail_from_question_type[pronoun].append(str(index)) # create a list of the types of questions by extracting the keys from the dict audit_trail_from_question_type types_of_questions = list(audit_trail_from_question_type.keys()) gold_dict_per_type_of_questions = defaultdict(lambda: []) pred_dict_per_type_of_questions = {} gold_dict_per_type_of_questions_start = {} pred_dict_per_type_of_questions_start = {} gold_dict_per_type_of_questions_middle = {} pred_dict_per_type_of_questions_middle = {} gold_dict_per_type_of_questions_end = {} pred_dict_per_type_of_questions_end = {} for type_of_questions in types_of_questions: #gold_pred = {key: value for key, 
value in gold_dict.items() if key in audit_trail_from_question_type[type_of_questions]} #lst_pred = {key: value for key, value in pred_dict.items() if key in audit_trail_from_question_type[type_of_questions]} # Create two dictionnaries for each type of sentence for gold_dict_per_type_of_questions and pred_dict_per_type_of_questions gold_dict_per_type_of_questions[type_of_questions] = { key: value for key, value in gold_dict.items() if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } pred_dict_per_type_of_questions[type_of_questions] = { key: value for key, value in pred_dict.items() if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } # print(type_of_questions," F1 score: ", util.eval_dicts(gold_dict_per_type_of_questions[type_of_questions], pred_dict_per_type_of_questions[type_of_questions], args.use_squad_v2)['F1']) gold_dict_per_type_of_questions_start[type_of_questions] = { key: value for key, value in gold_dict.items() if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } pred_dict_per_type_of_questions_start[type_of_questions] = { key: value for key, value in pred_dict.items() if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } gold_dict_per_type_of_questions_middle[type_of_questions] = { key: value for key, value in gold_dict.items() if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } pred_dict_per_type_of_questions_middle[type_of_questions] = { key: value for key, value in pred_dict.items() if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } gold_dict_per_type_of_questions_end[type_of_questions] = { key: value for key, value in gold_dict.items() if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } pred_dict_per_type_of_questions_end[type_of_questions] = { key: value for key, value in 
pred_dict.items() if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys() } for key, value in gold_dict.items(): #if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys(): if key in audit_trail_from_question_type[ type_of_questions] and type_of_questions != "" and key in pred_dict_per_type_of_questions[ type_of_questions]: """ print("type_of_questions: ",type_of_questions) print("key: ", key) print("question: ", value["question"]) sub_index = value["question"].lower().find(type_of_questions) print("sub_index: ",sub_index) test_fc = value["question"].lower().find(type_of_questions) print("present type of the var: ",type(test_fc)) #print("question: ", value["question"][str(key)]) print("length of the question: ", len(value["question"])) print('Position of the interrogative pronoun in the question:', ) """ # Create two dictionnaries for each type of sentence based at the start of the sentence if value["question"].lower().find( type_of_questions) == 1 or value["question"].lower( ).find(type_of_questions) == 0: #print("BEGINNING") if type_of_questions != "": try: del gold_dict_per_type_of_questions_middle[ type_of_questions][key] except KeyError: pass try: del pred_dict_per_type_of_questions_middle[ type_of_questions][key] except KeyError: pass try: del gold_dict_per_type_of_questions_end[ type_of_questions][key] except KeyError: pass try: del pred_dict_per_type_of_questions_end[ type_of_questions][key] except KeyError: pass #pred_dict_per_type_of_questions_start[type_of_questions] = {key: pred_dict[key] for key in # gold_dict_per_type_of_questions_start[ # type_of_questions].keys()} elif value["question"].lower( ).find(type_of_questions) >= len( value["question"]) - len(type_of_questions) - 5: #print("END") if type_of_questions != "": try: del gold_dict_per_type_of_questions_middle[ type_of_questions][key] except KeyError: pass try: del pred_dict_per_type_of_questions_middle[ type_of_questions][key] except 
KeyError: pass try: del gold_dict_per_type_of_questions_start[ type_of_questions][key] except KeyError: pass try: del pred_dict_per_type_of_questions_start[ type_of_questions][key] except KeyError: pass #print("type_of_questions: ",type_of_questions) #sub_index = value["question"].lower().find(type_of_questions) #print("sub_index: ", sub_index) #print("len(value['question']) - len(type_of_questions) - 2: ", len(value["question"])-len(type_of_questions)-2) #start_string = len(value["question"])-len(type_of_questions)-6 #end_string = len(value["question"])-1 #print("extract at the end: ", value["question"][start_string:end_string]) else: #print("MIDDLE") if type_of_questions != "": try: del gold_dict_per_type_of_questions_start[ type_of_questions][key] except KeyError: pass try: del pred_dict_per_type_of_questions_start[ type_of_questions][key] except KeyError: pass try: del gold_dict_per_type_of_questions_end[ type_of_questions][key] except KeyError: pass try: del pred_dict_per_type_of_questions_end[ type_of_questions][key] except KeyError: pass pass """ if type_of_questions != "": gold_dict_per_type_of_questions_start[type_of_questions] = {key: value for key, value in gold_dict.items() if (key in audit_trail_from_question_type[type_of_questions] \ and (value["question"].lower().find(type_of_questions) <= 1) \ and key in pred_dict_per_type_of_questions[type_of_questions]) } """ """ for key in gold_dict_per_type_of_questions_start[type_of_questions].keys(): print("key:: ", key ) print("type(key):: ", type(key) ) print("pred_dict[,key,] : ", pred_dict[key]) print("@@@@@@@@@@@@@@@@@@@@@@@@") pred_dict_per_type_of_questions_start[type_of_questions] = {key: pred_dict[key] for key in gold_dict_per_type_of_questions_start[type_of_questions].keys()} #pred_dict_per_type_of_questions_start[type_of_questions] = {key: value for key, value in pred_dict.items() if key in list(gold_dict_per_type_of_questions_start[type_of_questions].keys()) } # Create two dictionnaries for each 
type of sentence based at the end of the sentence gold_dict_per_type_of_questions_end[type_of_questions] = {key: value for key, value in gold_dict.items() if key in audit_trail_from_question_type[type_of_questions] \ and value["question"].lower().find(type_of_questions) >= len(value["question"])-len(type_of_questions)-2 \ and key in pred_dict_per_type_of_questions[type_of_questions]} pred_dict_per_type_of_questions_end[type_of_questions] = {key: pred_dict[key] for key in list(gold_dict_per_type_of_questions_end[type_of_questions].keys())} #print("*"*80) # Create two dictionnaries for each type of sentence based at the middle of the sentencecount_questions_type gold_dict_per_type_of_questions_middle[type_of_questions] = {key: value for key, value in gold_dict.items() if key not in list(gold_dict_per_type_of_questions_start[type_of_questions].keys()) \ and key not in list(gold_dict_per_type_of_questions_end[type_of_questions].keys())} pred_dict_per_type_of_questions_middle[type_of_questions] = {key: pred_dict[key] for key in list(gold_dict_per_type_of_questions_end[type_of_questions].keys())} else: gold_dict_per_type_of_questions_start[""] = gold_dict_per_type_of_questions[""] pred_dict_per_type_of_questions_start[""] = pred_dict_per_type_of_questions[""] gold_dict_per_type_of_questions_end[""] = gold_dict_per_type_of_questions[""] pred_dict_per_type_of_questions_end[""] = pred_dict_per_type_of_questions[""] gold_dict_per_type_of_questions_middle[""] = gold_dict_per_type_of_questions[""] pred_dict_per_type_of_questions_middle[""] = pred_dict_per_type_of_questions[""] """ positions_in_question = ["beginning", "middle", "end"] # print(type_of_questions," F1 score: ", util.eval_dicts(gold_dict_per_type_of_questions[type_of_questions], pred_dict_per_type_of_questions[type_of_questions], args.use_squad_v2)['F1']) list_beginning = [ util.eval_dicts( gold_dict_per_type_of_questions_start[type_of_questions], pred_dict_per_type_of_questions_start[type_of_questions], 
args.use_squad_v2)['F1'] for type_of_questions in types_of_questions ] list_middle = [ util.eval_dicts( gold_dict_per_type_of_questions_middle[type_of_questions], pred_dict_per_type_of_questions_middle[type_of_questions], args.use_squad_v2)['F1'] for type_of_questions in types_of_questions ] list_end = [ util.eval_dicts( gold_dict_per_type_of_questions_end[type_of_questions], pred_dict_per_type_of_questions_end[type_of_questions], args.use_squad_v2)['F1'] for type_of_questions in types_of_questions ] #for type_of_questions in types_of_questions: # print("gold_dict_per_type_of_questions_start[type_of_questions]: ",gold_dict_per_type_of_questions_start[type_of_questions]) # print("pred_dict_per_type_of_questions[type_of_questions]: ",pred_dict_per_type_of_questions[type_of_questions]) F1 = np.array([list_beginning, list_middle, list_end]) m, n = F1.shape value_to_ignore = [] for i in range(m): for j in range(n): if F1[i, j] == "NA" or F1[i, j] == 0: value_to_ignore.append((i, j)) print("value to ignore: ", value_to_ignore) #F1 = np.array([[0, 0, 0, 0, 0.1, 0, 0, 0, 0, 0, 0, 0, 0], # [0, 0, 0, 0, 0.1, 0, 0, 0, 0, 0, 0, 0, 0], # [0, 0, 0, 0, 0.1, 0, 0, 0, 0, 0, 0, 0, 0]]) data_label = copy.deepcopy(F1) for row in data_label: for column_idx in range(len(row)): if row[column_idx] == "NA": row[column_idx] = "" # print question without interrogative pronoun required for the second part of the analysis: for key, value in gold_dict.items(): if key in audit_trail_from_question_type[ ""] and key in pred_dict.keys(): print("question: ", gold_dict_per_type_of_questions['']) print("golden answers: ", ) print("prediction: ", pred_dict[key]) print() fig, ax = plt.subplots() types_of_questions[types_of_questions.index( "")] = "Implicit question without interrogative pronoun" im, cbar = heatmap(F1, positions_in_question, types_of_questions, ax=ax, \ cmap="YlGn", cbarlabel="F1 scores") texts = annotate_heatmap(im, data=data_label, valfmt="{x:.1f}", ignore=value_to_ignore) 
fig.tight_layout() plt.show() # Log to console results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items()) log.info(f'{args.split.title()} {results_str}') # Log to TensorBoard tbx = SummaryWriter(args.save_dir) util.visualize(tbx, pred_dict=pred_dict, eval_path=eval_file, step=0, split=args.split, num_visuals=args.num_visuals) # Write submission file sub_path = join(args.save_dir, args.split + '_' + args.sub_file) log.info(f'Writing submission file to {sub_path}...') with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh: csv_writer = csv.writer(csv_fh, delimiter=',') csv_writer.writerow(['Id', 'Predicted']) for uuid in sorted(sub_dict): csv_writer.writerow([uuid, sub_dict[uuid]])
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2):
    """Run `model` over `data_loader` and score its span predictions.

    The inputs handed to the model depend on the module-level flags
    `use_char` and `use_syll`: word indices are always passed, character
    indices are added when `use_char` is set, and syllable indices (derived
    from the word indices via `word2syll_idxs`) when `use_syll` is set.

    Args:
        model: Callable taking keyword index tensors and returning
            `(log_p1, log_p2)`.
        data_loader: Iterable of
            `(cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids)` batches.
        device: Device the input tensors are moved to.
        eval_file: Path of the JSON file holding the gold answers.
        max_len: Maximum answer length forwarded to `util.discretize`.
        use_squad_v2: Forwarded to the `util` helpers; also adds the `AvNA`
            entry to the results.

    Returns:
        `(results, pred_dict)`: an `OrderedDict` with `NLL`, `F1`, `EM`
        (plus `AvNA` when `use_squad_v2`), and the predictions returned by
        `util.convert_tokens`.
    """
    meter = util.AverageMeter()
    model.eval()
    predictions = {}

    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)

    if use_syll:
        with open('data/word_idx2syll_idx.json') as json_file:
            word_idx2syll_idx = json.load(json_file)

    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            n_examples = cw_idxs.size(0)

            # Keyword arguments are collected in the order
            # word -> char -> syllable.
            model_inputs = {
                'cw_idxs': cw_idxs.to(device),
                'qw_idxs': qw_idxs.to(device),
            }
            if use_char:
                model_inputs['cc_idxs'] = cc_idxs.to(device)
                model_inputs['qc_idxs'] = qc_idxs.to(device)
            if use_syll:
                # Syllable indices are looked up from the word indices.
                model_inputs['cs_idxs'] = word2syll_idxs(
                    model_inputs['cw_idxs'], word_idx2syll_idx).to(device)
                model_inputs['qs_idxs'] = word2syll_idxs(
                    model_inputs['qw_idxs'], word_idx2syll_idx).to(device)

            log_p1, log_p2 = model(**model_inputs)

            y1, y2 = y1.to(device), y2.to(device)
            batch_loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            meter.update(batch_loss.item(), n_examples)

            # Turn the log-probabilities into discrete start/end positions.
            starts, ends = util.discretize(log_p1.exp(), log_p2.exp(),
                                           max_len, use_squad_v2)

            progress_bar.update(n_examples)
            progress_bar.set_postfix(NLL=meter.avg)

            batch_preds, _ = util.convert_tokens(gold_dict,
                                                 ids.tolist(),
                                                 starts.tolist(),
                                                 ends.tolist(),
                                                 use_squad_v2)
            predictions.update(batch_preds)

    model.train()

    scores = util.eval_dicts(gold_dict, predictions, use_squad_v2)
    results = OrderedDict([('NLL', meter.avg),
                           ('F1', scores['F1']),
                           ('EM', scores['EM'])])
    if use_squad_v2:
        results['AvNA'] = scores['AvNA']

    return results, predictions
# Order in which the loaded ensemble members are run on each batch. Their
# probabilities are summed in this order.
_ENSEMBLE_FORWARD_ORDER = ('baseline', 'qanet', 'qanet_old', 'qanet_inde',
                           'qanet_s_e', 'bidaf', 'bidaf_fusion')

# The reported NLL is the NLL of ONE member (not of the averaged
# distribution): the first loaded member in this order.
_ENSEMBLE_NLL_PRIORITY = ('qanet', 'bidaf', 'bidaf_fusion', 'qanet_inde',
                          'qanet_s_e', 'qanet_old', 'baseline')

# All members share the files `probs_start` / `probs_end`, so only one
# member's probabilities can be kept: the first loaded member in this order.
_ENSEMBLE_PROBS_PRIORITY = ('qanet_s_e', 'qanet_inde', 'qanet_old', 'qanet',
                            'bidaf_fusion', 'bidaf', 'baseline')


def _ensemble_member_specs(args, word_vectors, char_vectors, device):
    """List `(key, checkpoint_path, factory)` for every supported member.

    The factories are lazy, so a model class and its hyper-parameters are
    only touched when the corresponding checkpoint path is set. The list is
    in loading order.
    """

    def bidaf_kwargs():
        return dict(word_vectors=word_vectors,
                    char_vectors=char_vectors,
                    char_emb_dim=args.char_emb_dim,
                    hidden_size=args.hidden_size)

    def qanet_kwargs(with_divisor=True):
        kwargs = dict(bidaf_kwargs(),
                      n_heads=args.n_heads,
                      n_conv_emb_enc=args.n_conv_emb,
                      n_conv_mod_enc=args.n_conv_mod,
                      n_emb_enc_blocks=args.n_emb_blocks,
                      n_mod_enc_blocks=args.n_mod_blocks)
        if with_divisor:
            kwargs['divisor_dim_kqv'] = args.divisor_dim_kqv
        return kwargs

    return [
        ('baseline', args.load_path_baseline,
         lambda: Baseline(word_vectors=word_vectors, hidden_size=100)),
        ('bidaf', args.load_path_bidaf,
         lambda: BiDAF(**bidaf_kwargs())),
        ('bidaf_fusion', args.load_path_bidaf_fusion,
         lambda: BiDAF_fus(**bidaf_kwargs())),
        ('qanet', args.load_path_qanet,
         lambda: QANet(**qanet_kwargs())),
        ('qanet_old', args.load_path_qanet_old,
         lambda: QANet_old(device=device,
                           **qanet_kwargs(with_divisor=False))),
        ('qanet_inde', args.load_path_qanet_inde,
         lambda: QANet_independant_encoder(**qanet_kwargs())),
        ('qanet_s_e', args.load_path_qanet_s_e,
         lambda: QANet_S_E(**qanet_kwargs())),
    ]


def _first_loaded(priority, loaded):
    """Return the first key of `priority` that is present in `loaded`."""
    return next(key for key in priority if key in loaded)


def main(args):
    """Evaluate an ensemble of checkpoints and write a submission file.

    Every model whose `args.load_path_*` is set is loaded. For each batch the
    start/end probabilities of all loaded models are averaged and the
    averaged distributions are discretized into answer spans.

    Side effects: writes `<split>_<sub_file>` (CSV) into `args.save_dir`;
    when `args.save_probabilities` is set also pickles `probs_start` and
    `probs_end` there; for non-test splits logs NLL/F1/EM (and AvNA for
    SQuAD v2) and writes TensorBoard visualizations.

    Args:
        args: Parsed command-line arguments (mutated: `save_dir` and
            `batch_size` are overwritten).

    Raises:
        ValueError: If none of the `load_path_*` arguments is set.
    """
    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get models
    log.info('Building model...')
    models, nll_meters = {}, {}
    specs = _ensemble_member_specs(args, word_vectors, char_vectors, device)
    for key, load_path, build in specs:
        if not load_path:
            continue
        model = nn.DataParallel(build(), gpu_ids)
        log.info(f'Loading checkpoint from {load_path}...')
        model = util.load_model(model, load_path, gpu_ids, return_step=False)
        model = model.to(device)
        model.eval()
        models[key] = model
        nll_meters[key] = util.AverageMeter()

    if not models:
        # Without this guard the first batch fails with an opaque IndexError.
        raise ValueError('No checkpoint to evaluate: at least one of the '
                         'load_path_* arguments must be set.')

    nbr_model = len(models)
    forward_keys = [key for key in _ENSEMBLE_FORWARD_ORDER if key in models]
    nll_key = _first_loaded(_ENSEMBLE_NLL_PRIORITY, models)
    probs_key = _first_loaded(_ENSEMBLE_PROBS_PRIORITY, models)
    save_probabilities = args.save_probabilities
    save_prob_start, save_prob_end = [], []

    # Get data loader
    log.info('Building dataset...')
    record_file = vars(args)[f'{args.split}_record_file']
    dataset = SQuAD(record_file, args.use_squad_v2)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Evaluate
    log.info(f'Evaluating on {args.split} split...')
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}   # Predictions for submission
    eval_file = vars(args)[f'{args.split}_eval_file']
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)

    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            cc_idxs = cc_idxs.to(device)
            qc_idxs = qc_idxs.to(device)
            batch_size = cw_idxs.size(0)
            y1, y2 = y1.to(device), y2.to(device)

            # Forward every loaded member
            l_p1, l_p2 = [], []
            for key in forward_keys:
                if key == 'baseline':
                    # NOTE(review): the baseline receives (cw_idxs, cc_idxs),
                    # i.e. character indices where question word indices
                    # would be expected. Kept as-is; confirm against the
                    # Baseline.forward signature.
                    log_p1, log_p2 = models[key](cw_idxs, cc_idxs)
                else:
                    log_p1, log_p2 = models[key](cw_idxs, cc_idxs,
                                                 qw_idxs, qc_idxs)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                nll_meters[key].update(loss.item(), batch_size)
                l_p1.append(log_p1.exp())
                l_p2.append(log_p2.exp())
                if save_probabilities and key == probs_key:
                    # A separate exp() on purpose: the tensors in l_p1/l_p2
                    # are modified in place below, and on CPU `.numpy()`
                    # shares memory with its tensor.
                    save_prob_start.append(
                        log_p1.exp().detach().cpu().numpy())
                    save_prob_end.append(
                        log_p2.exp().detach().cpu().numpy())

            # Average the members' probabilities
            p1, p2 = l_p1[0], l_p2[0]
            for other_p1, other_p2 in zip(l_p1[1:], l_p2[1:]):
                p1 += other_p1
                p2 += other_p2
            p1 /= nbr_model
            p2 /= nbr_model
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != 'test':
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meters[nll_key].avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                      ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    if save_probabilities:
        with open(join(args.save_dir, 'probs_start'), 'wb') as fp:
            pickle.dump(save_prob_start, fp)
        with open(join(args.save_dir, 'probs_end'), 'wb') as fp:
            pickle.dump(save_prob_end, fp)

    # Log results (except for test set, since it does not come with labels)
    if args.split != 'test':
        results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results_list = [('NLL', nll_meters[nll_key].avg),
                        ('F1', results['F1']),
                        ('EM', results['EM'])]
        if args.use_squad_v2:
            results_list.append(('AvNA', results['AvNA']))
        results = OrderedDict(results_list)

        # Log to console
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
        log.info(f'{args.split.title()} {results_str}')

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        try:
            util.visualize(tbx,
                           pred_dict=pred_dict,
                           eval_path=eval_file,
                           step=0,
                           split=args.split,
                           num_visuals=args.num_visuals)
        finally:
            # Flush pending events to disk.
            tbx.close()

    # Write submission file
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info(f'Writing submission file to {sub_path}...')
    with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=',')
        csv_writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])
def main(args):
    """Evaluate a BiDAF checkpoint, optionally pruning answers by TF-IDF.

    When `args.use_tfidf` is set, every non-empty prediction in `pred_dict`
    whose TF-IDF score is below a fixed threshold is replaced by the empty
    string before the metrics are computed.

    Side effects: writes `<split>_<sub_file>` (CSV) into `args.save_dir`; for
    non-test splits logs NLL/F1/EM (and AvNA for SQuAD v2) and writes
    TensorBoard visualizations.

    Args:
        args: Parsed command-line arguments (mutated: `save_dir` and
            `batch_size` are overwritten).
    """
    # Load TF-IDF from pickle
    scorer = TFIDF([])
    scorer.get_from_pickle()

    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get data loader
    log.info('Building dataset...')
    record_file = vars(args)['{}_record_file'.format(args.split)]
    dataset = SQuAD(record_file, args.use_squad_v2)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  char_vocab_size=1376,
                  hidden_size=args.hidden_size)
    model = nn.DataParallel(model, gpu_ids)
    log.info('Loading checkpoint from {}...'.format(args.load_path))
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Evaluate
    log.info('Evaluating on {} split...'.format(args.split))
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}   # Predictions for submission
    eval_file = vars(args)['{}_eval_file'.format(args.split)]
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)

    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward. The character indices are moved explicitly
            # as well, so all four inputs live on the same device.
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            cc_idxs = cc_idxs.to(device)
            qc_idxs = qc_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != 'test':
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                      ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    if args.use_tfidf:
        # Apply TF-IDF filtering to pred_dict: answers scoring below the
        # threshold are replaced by the empty (no-answer) string.
        # NOTE(review): only pred_dict is filtered. sub_dict, which is what
        # gets written to the submission file below, is left untouched --
        # confirm whether that is intended.
        tf_idf_threshold = 2
        tf_idf_common_threshold = 1
        # Only values of existing keys are reassigned, so mutating the dict
        # while iterating over it is safe.
        for key, value in pred_dict.items():
            if value == "":
                continue
            tf_idf_score = scorer.normalized_additive_idf_ignore_common_words(
                value, threshold_frequency=tf_idf_common_threshold)
            if tf_idf_score < tf_idf_threshold:
                pred_dict[key] = ''

    # Log results (except for test set, since it does not come with labels)
    if args.split != 'test':
        results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results_list = [('NLL', nll_meter.avg),
                        ('F1', results['F1']),
                        ('EM', results['EM'])]
        if args.use_squad_v2:
            results_list.append(('AvNA', results['AvNA']))
        results = OrderedDict(results_list)

        # Log to console
        results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                for k, v in results.items())
        log.info('{} {}'.format(args.split.title(), results_str))

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        try:
            util.visualize(tbx,
                           pred_dict=pred_dict,
                           eval_path=eval_file,
                           step=0,
                           split=args.split,
                           num_visuals=args.num_visuals)
        finally:
            # Flush pending events to disk.
            tbx.close()

    # Write submission file. newline='' is what the csv module expects of
    # the file object; without it extra blank rows appear on platforms that
    # translate line endings.
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info('Writing submission file to {}...'.format(sub_path))
    with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=',')
        csv_writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])
def test_model(questions,
               context,
               use_squad_v2=True,
               model_path="../save/training-02/best.pth.tar"):
    """Answer user-supplied `questions` about a single `context` passage.

    The questions are wrapped into a one-paragraph structure (each with a
    placeholder answer), run through `process_file` / `build_features`, and
    fed to a BiDAF checkpoint loaded from `model_path`.

    Args:
        questions: Iterable of question strings; each question's position in
            the iterable becomes its id.
        context: Passage the questions refer to.
        use_squad_v2: Forwarded to the dataset and the `util` helpers.
        model_path: Checkpoint to load.

    Returns:
        The predictions accumulated from the first return value of
        `util.convert_tokens` across all batches.
    """
    device, gpu_ids = util.get_available_devices()
    loader_batch_size = 64 * max(1, len(gpu_ids))

    # Embeddings and model
    embeddings = util.torch_from_json('../data/word_emb.json')
    network = nn.DataParallel(BiDAF(word_vectors=embeddings, hidden_size=100),
                              gpu_ids)
    network = util.load_model(network, model_path, gpu_ids,
                              return_step=False).to(device)
    network.eval()

    # Wrap the raw questions into the structure `process_file` is given.
    # The answer is a placeholder, since user questions carry no labels.
    qas = [{
        "question": text,
        "id": position,
        "answers": [{
            "answer_start": 0,
            "text": "never mind"
        }]
    } for position, text in enumerate(questions)]
    source = {"paragraphs": [{"qas": qas, "context": context}]}

    with open("../data/word2idx.json", "r") as word_fh:
        word2idx_dict = json.load(word_fh)
    with open("../data/char2idx.json", "r") as char_fh:
        char2idx_dict = json.load(char_fh)

    examples, gold_dict = process_file(source, "my_test", Counter(), Counter())
    features = build_features(examples,
                              "my_test",
                              word2idx_dict,
                              char2idx_dict,
                              is_test=True)

    dataset = SQuAD(features, use_squad_v2)
    data_loader = data.DataLoader(dataset,
                                  batch_size=loader_batch_size,
                                  shuffle=False,
                                  num_workers=4,
                                  collate_fn=collate_fn)

    # Evaluate
    meter = util.AverageMeter()
    predictions = {}      # keyed like the first dict of convert_tokens
    uuid_predictions = {} # keyed like the second dict of convert_tokens
    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            n_examples = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = network(cw_idxs, qw_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            batch_loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            meter.update(batch_loss.item(), n_examples)

            # Discretize into start/end positions (answers of at most 15)
            starts, ends = util.discretize(log_p1.exp(), log_p2.exp(), 15,
                                           use_squad_v2)
            print("starts ", starts, " ends ", ends)

            progress_bar.update(n_examples)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                      ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      use_squad_v2)
            predictions.update(idx2pred)
            uuid_predictions.update(uuid2pred)

    return predictions
def evaluate(model,
             data_loader,
             device,
             eval_file,
             max_len,
             use_squad_v2,
             model_type,
             quick_eval=False):
    """Run `model` over `data_loader` and score its span predictions.

    Args:
        model: Model to evaluate; put in eval mode for the run and back in
            train mode afterwards.
        data_loader: Iterable of
            `(cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids)` batches.
        device: Device the input tensors are moved to.
        eval_file: Path of the JSON file holding the gold answers.
        max_len: Maximum answer length forwarded to `util.discretize`.
        use_squad_v2: Forwarded to the `util` helpers; also adds the `AvNA`
            entry to the results.
        model_type: `'BiDAF'` or `'Transformer'` (called with character and
            word indices) or `'BiDAFbase'` (word indices only).
        quick_eval: When true, the progress bar counts batches
            (`len(data_loader)`) instead of examples.

    Returns:
        `(results, pred_dict)`: an `OrderedDict` with `NLL`, `F1`, `EM`
        (plus `AvNA` when `use_squad_v2`), and the predictions returned by
        `util.convert_tokens`.

    Raises:
        ValueError: If `model_type` is not one of the supported values.
    """
    char_models = ('BiDAF', 'Transformer')
    word_models = ('BiDAFbase',)
    if model_type not in char_models + word_models:
        # Previously an unknown type fell through both branches and failed
        # later on an unbound `log_p1`.
        raise ValueError('Unknown model_type: {!r}'.format(model_type))

    nll_meter = util.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)

    # In quick-eval mode the bar is measured in batches, otherwise in
    # examples; `update` below has to use the matching unit.
    if quick_eval:
        progress_len = len(data_loader)
    else:
        progress_len = len(data_loader.dataset)

    with torch.no_grad(), \
            tqdm(total=progress_len) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            if model_type in char_models:
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                log_p1, log_p2 = model(cc_idxs, qc_idxs, cw_idxs, qw_idxs)
            else:
                log_p1, log_p2 = model(cw_idxs, qw_idxs)

            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            # Log info
            progress_bar.update(1 if quick_eval else batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict,
                                           ids.tolist(),
                                           starts.tolist(),
                                           ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg),
                    ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict
def _lengths_before_first_zero(idxs):
    """Return, per row of `idxs`, the index of its first 0 entry.

    Rows without a 0 entry get the full row width. Presumably index 0 is the
    padding id, which would make this the unpadded sequence length -- confirm
    against the data pipeline.
    """
    width = idxs.size(1)
    lengths = []
    for row in idxs:
        zero_positions = (row == 0).nonzero()
        if len(zero_positions) != 0:
            lengths.append(int(zero_positions[0].item()))
        else:
            lengths.append(int(width))
    return torch.Tensor(lengths).int()


def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2,
             model_name, gpu_ids):
    """Run `model` over `data_loader` and score its span predictions.

    `model_name` selects which of the batch's feature tensors are moved to
    `device` and passed to the model:

    * any name containing `'baseline'`: word indices only;
    * `'BiDAF_char'`: word and character indices;
    * `'BiDAF_tag*'`: additionally POS and NER indices;
    * `'BiDAF_tag_ext*'`: additionally the `*_ems` and `*_tfs` tensors;
    * `'coattn'`: word indices plus per-example lengths;
    * anything else: treated like the baseline.

    Args:
        model: Model to evaluate; put in eval mode for the run and back in
            train mode afterwards.
        data_loader: Iterable of 15-tuples of batch tensors (see the loop
            header for the order).
        device: Device the input tensors are moved to.
        eval_file: Path of the JSON file holding the gold answers.
        max_len: Maximum answer length forwarded to `util.discretize`.
        use_squad_v2: Forwarded to the `util` helpers; also adds the `AvNA`
            entry to the results.
        model_name: Name selecting the forward signature (see above).
        gpu_ids: GPU ids in use; only read by the `'coattn'` branch to split
            the batch size per device. May be empty.

    Returns:
        `(results, pred_dict)`: an `OrderedDict` with `NLL`, `F1`, `EM`
        (plus `AvNA` when `use_squad_v2`), and the predictions returned by
        `util.convert_tokens`.
    """
    tag_models = ('BiDAF_tag', 'BiDAF_tag_unfrozen', 'BiDAF_tag_loss',
                  'BiDAF_tag_unfrozen_loss')
    tag_ext_models = ('BiDAF_tag_ext', 'BiDAF_tag_ext_unfrozen')

    nll_meter = util.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, cpos_idxs, cner_idxs, cw_ems, cw_tfs, \
                qw_idxs, qc_idxs, qpos_idxs, qner_idxs, qw_ems, qw_tfs, \
                y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            if 'baseline' in model_name:
                log_p1, log_p2 = model(cw_idxs, qw_idxs)
            elif model_name == 'BiDAF_char':
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
            elif model_name in tag_models:
                cc_idxs = cc_idxs.to(device)
                cpos_idxs = cpos_idxs.to(device)
                cner_idxs = cner_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                qpos_idxs = qpos_idxs.to(device)
                qner_idxs = qner_idxs.to(device)
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs,
                                       cpos_idxs, qpos_idxs,
                                       cner_idxs, qner_idxs)
            elif model_name in tag_ext_models:
                cc_idxs = cc_idxs.to(device)
                cpos_idxs = cpos_idxs.to(device)
                cner_idxs = cner_idxs.to(device)
                cw_ems = cw_ems.to(device)
                cw_tfs = cw_tfs.to(device)
                qc_idxs = qc_idxs.to(device)
                qpos_idxs = qpos_idxs.to(device)
                qner_idxs = qner_idxs.to(device)
                qw_ems = qw_ems.to(device)
                qw_tfs = qw_tfs.to(device)
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs,
                                       cpos_idxs, qpos_idxs,
                                       cner_idxs, qner_idxs,
                                       cw_ems, qw_ems, cw_tfs, qw_tfs)
            elif model_name == 'coattn':
                # This branch used to test `args.name`, but `args` is not a
                # parameter of this function; every other branch dispatches
                # on `model_name`.
                max_c_len = cw_idxs.size(1)
                max_q_len = qw_idxs.size(1)
                c_len = _lengths_before_first_zero(cw_idxs)
                q_len = _lengths_before_first_zero(qw_idxs)
                # max(1, ...) avoids a division by zero when gpu_ids is
                # empty.
                num_examples = cw_idxs.size(0) // max(1, len(gpu_ids))
                log_p1, log_p2 = model(max_c_len, max_q_len,
                                       cw_idxs, qw_idxs,
                                       c_len, q_len,
                                       num_examples, True, False)
            else:
                # Default: run baseline
                log_p1, log_p2 = model(cw_idxs, qw_idxs)

            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict,
                                           ids.tolist(),
                                           starts.tolist(),
                                           ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg),
                    ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict
def evaluate(model, data_loader, device, model_name, eval_file, max_len, use_squad_v2):
    """Score `model` on every batch of `data_loader` against the gold answers.

    Args:
        model: Network returning `(log_p1, log_p2)`.
        data_loader: Loader yielding 13-field batches (see the unpacking below).
        device: Device the tensors are moved to before the forward pass.
        model_name: One of 'bidaf', 'bidaf_char' or 'bidaf_chartag'; decides
            which inputs the model is called with.
        eval_file: Path to the JSON file with the gold answers.
        max_len: Maximum answer length handed to `util.discretize`.
        use_squad_v2: Forwarded to the `util` helpers; also adds `AvNA`.

    Returns:
        `(results, pred_dict)`: an `OrderedDict` of NLL/F1/EM (and AvNA) plus
        the predictions from `util.convert_tokens`.

    Raises:
        NameError: when a batch is processed and `model_name` is not one of
            the supported names.
    """
    loss_meter = util.AverageMeter()

    model.eval()
    predictions = {}
    with open(eval_file, 'r') as gold_fh:
        gold_dict = json_load(gold_fh)

    with torch.no_grad(), tqdm(total=len(data_loader.dataset)) as bar:
        for batch in data_loader:
            (cw_idxs, cc_idxs, c_pos_idxs, c_ner_idxs, c_iob_idxs,
             qw_idxs, qc_idxs, q_pos_idxs, q_ner_idxs, q_iob_idxs,
             y1, y2, ids) = batch

            # Word indices are needed by every model variant.
            cw_idxs, qw_idxs = cw_idxs.to(device), qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward pass; each variant takes a different argument list.
            if model_name == 'bidaf':
                log_p1, log_p2 = model(cw_idxs, qw_idxs)
            elif model_name == 'bidaf_char':
                cc_idxs, qc_idxs = cc_idxs.to(device), qc_idxs.to(device)
                log_p1, log_p2 = model(cc_idxs, qc_idxs, cw_idxs, qw_idxs)
            elif model_name == 'bidaf_chartag':
                context_feats = [t.to(device) for t in
                                 (cc_idxs, c_pos_idxs, c_ner_idxs, c_iob_idxs)]
                question_feats = [t.to(device) for t in
                                  (qc_idxs, q_pos_idxs, q_ner_idxs, q_iob_idxs)]
                log_p1, log_p2 = model(cw_idxs, *context_feats,
                                       qw_idxs, *question_feats)
            else:
                raise NameError('No model named ' + model_name)

            y1, y2 = y1.to(device), y2.to(device)
            batch_loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            loss_meter.update(batch_loss.item(), batch_size)

            # Turn log-probabilities into discrete start/end predictions.
            starts, ends = util.discretize(log_p1.exp(), log_p2.exp(),
                                           max_len, use_squad_v2)

            bar.update(batch_size)
            bar.set_postfix(NLL=loss_meter.avg)

            batch_preds, _ = util.convert_tokens(gold_dict,
                                                 ids.tolist(),
                                                 starts.tolist(),
                                                 ends.tolist(),
                                                 use_squad_v2)
            predictions.update(batch_preds)

    # Hand the model back in training mode.
    model.train()

    scores = util.eval_dicts(gold_dict, predictions, use_squad_v2)
    summary = OrderedDict()
    summary['NLL'] = loss_meter.avg
    summary['F1'] = scores['F1']
    summary['EM'] = scores['EM']
    if use_squad_v2:
        summary['AvNA'] = scores['AvNA']

    return summary, predictions
def main(args):
    """Evaluate a character-aware BiDAF checkpoint and write a submission CSV.

    Loads word and character embeddings, restores the checkpoint at
    `args.load_path`, runs the `args.split` data through the model, logs
    NLL/F1/EM (and AvNA) for labeled splits, and writes `Id,Predicted` rows to
    `<save_dir>/<split>_<sub_file>`.

    Args:
        args: Parsed command-line namespace; `save_dir` and `batch_size` are
            updated in place.
    """
    # --- Logging and devices -------------------------------------------------
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # --- Embeddings ----------------------------------------------------------
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    ch_vectors = util.torch_from_json(args.char_emb_file)

    # --- Model ---------------------------------------------------------------
    log.info('Building model...')
    net = BiDAF(word_vectors=word_vectors,
                ch_vectors=ch_vectors,
                hidden_size=args.hidden_size)
    net = nn.DataParallel(net, gpu_ids)
    log.info(f'Loading checkpoint from {args.load_path}...')
    net = util.load_model(net, args.load_path, gpu_ids, return_step=False)
    net = net.to(device)
    net.eval()

    # --- Data ----------------------------------------------------------------
    log.info('Building dataset...')
    arg_table = vars(args)
    dataset = SQuAD(arg_table[f'{args.split}_record_file'], args.use_squad_v2)
    loader = data.DataLoader(dataset,
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                             collate_fn=collate_fn)

    # --- Evaluation ----------------------------------------------------------
    log.info(f'Evaluating on {args.split} split...')
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}   # Predictions for submission
    eval_file = arg_table[f'{args.split}_eval_file']
    with open(eval_file, 'r') as gold_fh:
        gold_dict = json_load(gold_fh)
    labeled = args.split != 'test'

    with torch.no_grad(), tqdm(total=len(dataset)) as bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in loader:
            cw_idxs, qw_idxs = cw_idxs.to(device), qw_idxs.to(device)
            cc_idxs, qc_idxs = cc_idxs.to(device), qc_idxs.to(device)
            batch_size = cw_idxs.size(0)

            log_p1, log_p2 = net(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            batch_loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(batch_loss.item(), batch_size)

            starts, ends = util.discretize(log_p1.exp(), log_p2.exp(),
                                           args.max_ans_len, args.use_squad_v2)

            bar.update(batch_size)
            if labeled:
                # No labels for the test set, so NLL would be invalid
                bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                      ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # --- Reporting (labeled splits only) ------------------------------------
    if labeled:
        scores = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results = OrderedDict([('NLL', nll_meter.avg),
                               ('F1', scores['F1']),
                               ('EM', scores['EM'])])
        if args.use_squad_v2:
            results['AvNA'] = scores['AvNA']

        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
        log.info(f'{args.split.title()} {results_str}')

        tbx = SummaryWriter(args.save_dir)
        util.visualize(tbx,
                       pred_dict=pred_dict,
                       eval_path=eval_file,
                       step=0,
                       split=args.split,
                       num_visuals=args.num_visuals)

    # --- Submission file -----------------------------------------------------
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info(f'Writing submission file to {sub_path}...')
    with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
        writer = csv.writer(csv_fh, delimiter=',')
        writer.writerow(['Id', 'Predicted'])
        writer.writerows([uuid, sub_dict[uuid]] for uuid in sorted(sub_dict))
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2, model_name="", a1=0.5, a2=0.5):
    """Evaluate a 'sketchy', 'intensive' or 'retro' model on `data_loader`.

    Args:
        model: Network to evaluate. Its outputs depend on `model_name`:
            'sketchy' returns `yi`; 'intensive' returns `(yi, log_p1, log_p2)`;
            'retro' returns `(log_p1, log_p2)`.
        data_loader: Loader yielding `(cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1,
            y2, ids)` batches.
        device: Device the tensors are moved to.
        eval_file: Path to the JSON file with the gold answers.
        max_len: Maximum answer length handed to `util.discretize`.
        use_squad_v2: Forwarded to the `util` helpers; also adds `AvNA`.
        model_name: 'sketchy', 'intensive' or 'retro'.
        a1: Weight of the BCE term in the 'intensive' loss.
        a2: Weight of the span NLL term in the 'intensive' loss.

    Returns:
        `(results, pred_dict)`: an `OrderedDict` with `Loss`, `F1`, `EM` (and
        `AvNA` when `use_squad_v2`) plus the predictions from
        `util.convert_tokens`.

    Raises:
        ValueError: when a batch is processed and `model_name` is not one of
            the three supported modes.
    """
    meter = util.AverageMeter()
    bceLoss = nn.BCELoss()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            cc_idxs = cc_idxs.to(device)
            qc_idxs = qc_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            y1, y2 = y1.to(device), y2.to(device)
            # Binary target: 0 where the gold start index is 0, else 1.
            # `.float()` keeps the target on y1's device; the previous
            # `.type(torch.FloatTensor)` always produced a CPU tensor, which
            # cannot be combined with a model output living on a GPU.
            has_answer = (y1 != 0).float()

            if model_name == 'sketchy':
                yi = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                loss = bceLoss(yi, has_answer)
                meter.update(loss.item(), batch_size)
                # Placeholder spans: (0, 0) when yi is exactly 0, else (1, 2).
                starts = [0 if yi[i] == 0 else 1 for i, _ in enumerate(y1)]
                ends = [0 if yi[i] == 0 else 2 for i, _ in enumerate(y2)]
            elif model_name == 'intensive':
                yi, log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                span_nll = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss = a1 * bceLoss(yi, has_answer) + a2 * span_nll
                meter.update(loss.item(), batch_size)
                # Get F1 and EM scores
                p1, p2 = log_p1.exp(), log_p2.exp()
                starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)
                starts, ends = starts.tolist(), ends.tolist()
            elif model_name == 'retro':
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                meter.update(loss.item(), batch_size)
                p1, p2 = log_p1.exp(), log_p2.exp()
                starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)
                starts, ends = starts.tolist(), ends.tolist()
            else:
                raise ValueError(
                    'invalid --model_name, sketchy, intensive or retro required')

            print("starts: ", starts, "Truth", y1)
            print("ends: ", ends, "Truth: ", y2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(loss_calc=meter.avg)

            preds, _ = util.convert_tokens(gold_dict,
                                           ids.tolist(),
                                           starts,
                                           ends,
                                           use_squad_v2)
            pred_dict.update(preds)

    # Restore training mode for the caller.
    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('Loss', meter.avg),
                    ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict
def main(args, actions=None):
    """Evaluate a checkpoint on `args.split`, optionally perturbing context tokens.

    Args:
        args: Parsed command-line namespace; `save_dir` and `batch_size` are
            updated in place.
        actions: Optional pair `(action, count)`. `action` is one of
            "substitute" (swap two random context tokens), "delete" (drop one
            random token and append a 0), "add" (overwrite one random token
            with a random index in 1..50000) or "add2" (overwrite two). Only
            positions before the first 0 entry of a row are touched, and rows
            whose first 0 is at index 2 or lower are left alone. `count` is how
            often the action is applied to every row of a batch. `None` (the
            default) evaluates the unmodified data. Any other action name
            prints a message and exits.

    Returns:
        The F1 score from `util.eval_dicts`, or `None` for the 'test' split,
        which has no labels to score against.
    """
    interrogative_pronouns = [
        "what", "whose", "why", "which", "where", "when", "how", "who", "whom"
    ]

    def _tally_question_types(gold_dict):
        """Group gold question ids by their leading interrogative form."""
        counts = defaultdict(int)
        audit_trail = defaultdict(list)
        apostrophe_to_space = {ord("'"): " "}
        strip_punctuation = {ord(ch): "" for ch in ",.<>!@£$%^&*()_-+=?"}
        # Second words that do NOT form a combined "in ..."/"by ..." type.
        excluded_second = {"in": ("whose",),
                           "by": ("whom", "which", "when", "how")}
        # NOTE(review): range(1, len(gold_dict)) never visits the key
        # str(len(gold_dict)); confirm whether the last question is meant to
        # be skipped.
        for index in range(1, len(gold_dict)):
            # Lower-casing simplifies matching at the cost of losing capitals.
            question = gold_dict[str(index)]['question'].lower()
            words = [item.translate(strip_punctuation)
                     for item in question.translate(apostrophe_to_space).split()]
            if not words:
                # An empty question has no first word to classify.
                continue
            first = words[0]
            # One-word questions have no second word.
            second = words[1] if len(words) > 1 else ""

            if first in interrogative_pronouns:
                key = first
            elif (first in excluded_second
                  and second in interrogative_pronouns
                  and second not in excluded_second[first]):
                key = first + " " + second
            else:
                key = find_first_interrogative_pronoun(interrogative_pronouns,
                                                       words)
            counts[key] += 1
            audit_trail[key].append(str(index))
        return counts, audit_trail

    def _first_zero_index(row, width):
        """Index of the first 0 entry in `row`, or `width` when there is none."""
        zero_positions = (row == 0).nonzero().squeeze()
        if zero_positions.numel() == 0:
            return width
        return int(torch.min(zero_positions))

    def _perturb_context(cw_idxs):
        """Apply `actions` to `cw_idxs` in place."""
        action = actions[0]
        if action not in ("substitute", "delete", "add", "add2"):
            print("Incorrect command: exiting")
            exit()
        number_of_actions = actions[1]
        num_rows = cw_idxs.size()[0]
        width = cw_idxs.size()[1]
        for _ in range(number_of_actions):
            for i in range(num_rows):
                limit = _first_zero_index(cw_idxs[i], width)
                if limit <= 2:
                    continue
                if action == "substitute":
                    idx_1 = random.randint(0, limit - 1)
                    idx_2 = random.randint(0, limit - 1)
                    # Clone so the saved value is not a view that changes
                    # when position idx_1 is overwritten.
                    saved = cw_idxs[i, idx_1].clone()
                    cw_idxs[i, idx_1] = cw_idxs[i, idx_2]
                    cw_idxs[i, idx_2] = saved
                elif action == "delete":
                    print("debug:", i)
                    print("> size before amendment of cw_idxs", cw_idxs.size())
                    print(cw_idxs[i])
                    idx_1 = random.randint(0, limit - 1)
                    # Drop one token and append a 0 so the row width is kept.
                    cw_idxs[i, :] = torch.cat(
                        (cw_idxs[i, 0:idx_1], cw_idxs[i, idx_1 + 1:],
                         torch.tensor([0])), -1)
                    print("> size before amendment of cw_idxs", cw_idxs.size())
                    print(cw_idxs[i])
                elif action == "add":
                    idx_1 = random.randint(0, limit - 1)
                    cw_idxs[i, idx_1] = random.randint(1, 50000)
                else:  # "add2"
                    idx_1 = random.randint(0, limit - 1)
                    idx_2 = random.randint(0, limit - 1)
                    cw_idxs[i, idx_1] = random.randint(1, 50000)
                    cw_idxs[i, idx_2] = random.randint(1, 50000)

    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    if args.model == 'bidaf':
        model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size)
    elif args.model == 'bidafextra':
        model = BiDAFExtra(word_vectors=word_vectors, args=args)
    elif args.model == 'fusionnet':
        model = FusionNet(word_vectors=word_vectors, args=args)
    else:
        # Fail with a clear message instead of an unbound `model` below.
        raise ValueError(f'Unknown model: {args.model}')
    model = nn.DataParallel(model, gpu_ids)
    log.info(f'Loading checkpoint from {args.load_path}...')
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    log.info('Building dataset...')
    record_file = vars(args)[f'{args.split}_record_file']
    dataset = SQuAD(record_file, args)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Evaluate
    log.info(f'Evaluating on {args.split} split...')
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}   # Predictions for submission
    eval_file = vars(args)[f'{args.split}_eval_file']
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)

    # Question-type tallies; nothing further down in this function reads
    # them yet.
    count_questions_type, audit_trail_from_question_type = \
        _tally_question_types(gold_dict)

    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, cw_pos, cw_ner, cw_freq, \
                cqw_extra, y1, y2, ids in data_loader:
            # Perturb the context before it is moved to the device.
            if actions is not None:
                _perturb_context(cw_idxs)

            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            if args.model == 'bidaf':
                log_p1, log_p2 = model(cw_idxs, qw_idxs)
            else:
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cw_pos, cw_ner,
                                       cw_freq, cqw_extra)

            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != 'test':
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                      ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    f1_score = None
    if args.split != 'test':
        results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results_list = [('NLL', nll_meter.avg),
                        ('F1', results['F1']),
                        ('EM', results['EM'])]
        if args.use_squad_v2:
            results_list.append(('AvNA', results['AvNA']))
        results = OrderedDict(results_list)
        f1_score = results['F1']

        print("for ", actions, ": ", results['F1'])

        # Log to console
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
        log.info(f'{args.split.title()} {results_str}')

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(tbx,
                       pred_dict=pred_dict,
                       eval_path=eval_file,
                       step=0,
                       split=args.split,
                       num_visuals=args.num_visuals)

    # Write submission file
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info(f'Writing submission file to {sub_path}...')
    with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=',')
        csv_writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])

    return f1_score
def main(args):
    """Evaluate the baseline BiDAF checkpoint on the labeled test records.

    Builds the model, restores the checkpoint, runs `args.test_record_file`
    through it, then logs NLL/F1/EM/AvNA and writes TensorBoard visuals.

    NOTE(review): `c` (used for `load_path`, `max_ans_len` and `save_dir`) is
    not defined in this function; presumably a module-level config object.
    Verify, since the log line reports `args.load_path` while `c.load_path`
    is what gets loaded.

    Args:
        args: Parsed command-line namespace; `save_dir` and `batch_size` are
            updated in place.
    """
    run_name = "exp1_training"
    args.save_dir = util.get_save_dir(args.save_dir, run_name, training=False)
    log = get_logger(args.logging_dir, run_name)
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Model
    log.info('Building model...')
    net = nn.DataParallel(
        BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size),
        gpu_ids)
    log.info(f'Loading checkpoint from {args.load_path}...')
    net = util.load_model(net, c.load_path, gpu_ids, return_step=False)
    net = net.to(device)
    net.eval()

    # Data (SQuAD v2 handling is always on in this script)
    log.info('Building dataset...')
    dataset = SQuAD(args.test_record_file, True)
    loader = data.DataLoader(dataset,
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                             collate_fn=collate_fn)

    # Evaluation
    log.info(f'Evaluating on {args.datasplit} split...')
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}   # Predictions for submission
    with open(args.test_eval_file, 'r') as gold_fh:
        gold_dict = json_load(gold_fh)

    with torch.no_grad(), tqdm(total=len(dataset)) as bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in loader:
            cw_idxs, qw_idxs = cw_idxs.to(device), qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            log_p1, log_p2 = net(cw_idxs, qw_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            batch_loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(batch_loss.item(), batch_size)

            starts, ends = util.discretize(log_p1.exp(), log_p2.exp(),
                                           c.max_ans_len, True)

            # The running NLL is intentionally not shown on the progress bar.
            bar.update(batch_size)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                      ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      True)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Scores
    scores = util.eval_dicts(gold_dict, pred_dict, True)
    results = OrderedDict()
    for metric, value in (('NLL', nll_meter.avg),
                          ('F1', scores['F1']),
                          ('EM', scores['EM']),
                          ('AvNA', scores['AvNA'])):
        results[metric] = value

    # Console
    results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
    log.info(f'{args.datasplit} {results_str}')

    # TensorBoard
    tbx = SummaryWriter(c.save_dir)
    util.visualize(tbx,
                   pred_dict=pred_dict,
                   eval_path=args.test_eval_file,
                   step=0,
                   split=args.datasplit,
                   num_visuals=args.num_visuals)
# Script fragment: builds a soma plus a discretized dendritic branch in MOOSE,
# injects a current pulse into the soma, and attaches data tables to every
# compartment. `RM`, `soma_l`, `soma_rad`, `numDends`, `dend_l` and `dend_rad`
# are expected to be defined before this point.

# Electrical parameters handed to the compartment helpers.
# NOTE(review): units are not stated here; presumably the conventions of the
# `u` helper module (confirm).
CM = 0.01
RA = 10
Em = -65e-3

# Stimulus parameters handed to u.createPulse (duration, amplitude, delay).
pulse_dur = 100e-3
pulse_amp = 0.1e-9
pulse_delay = 50e-3

# Create a neutral directory for the neuron
neuron = moose.Neutral('/neuron')

# Create the soma under the neuron directory
soma = u.createCompartment(neuron, 'swagginSoma', soma_l, soma_rad, RM, CM, RA, Em)

# Create the dendritic compartments under the neuron directory
dend_branch = u.discretize(neuron, numDends, dend_l, dend_rad, RM, CM, RA, Em)

# Connect the soma to the first element of the dendritic branch
moose.connect(soma, 'axialOut', dend_branch[0], 'handleAxial')

# Create a pulse and connect it to the soma
soma_pulse = u.createPulse(soma, 'rollingWave', pulse_dur, pulse_amp, pulse_delay)

# Create data tables to store the voltage for the soma and each compartment
# making up the dendritic branch
data = moose.Neutral('/data')
soma_Vm = u.createDataTables(soma, data)
# One table per dendritic compartment, in branch order.
dend_tables = []
for dend in dend_branch:
    dend_tables.append(u.createDataTables(dend, data))
def main(args):
    """Evaluate one model or an ensemble of checkpoints and write a submission CSV.

    With `args.use_ensemble`, every checkpoint matching
    `<load_path>/<model_name>-*/<ensemble_models>` for the three supported
    model names is loaded and kept on the CPU; during evaluation each model is
    moved to the device for its forward pass and back afterwards. The start
    and end probabilities are combined by taking the element-wise maximum over
    all models. Without the flag a single checkpoint at `args.load_path` is
    used and its NLL is tracked.

    Args:
        args: Parsed command-line namespace; `save_dir` and `batch_size` are
            updated in place.
    """
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')

    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    def build_network(kind, device_ids):
        """Instantiate the architecture called `kind`, wrapped for data parallelism."""
        if kind == 'bidaf':
            net = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size)
        elif kind == 'bidafextra':
            net = BiDAFExtra(word_vectors=word_vectors, args=args)
        elif kind == 'fusionnet':
            net = FusionNet(word_vectors=word_vectors, args=args)
        return nn.DataParallel(net, device_ids)

    models = {}
    if args.use_ensemble:
        total_models = 0
        for kind in ['bidaf', 'bidafextra', 'fusionnet']:
            members = []
            pattern = f'{args.load_path}/{kind}-*/{args.ensemble_models}'
            for checkpoint in glob.glob(pattern):
                log.info('Building model...')
                net = build_network(kind, gpu_ids)
                log.info(f'Loading checkpoint from {checkpoint}...')
                net = util.load_model(net, checkpoint, gpu_ids, return_step=False)
                # Ensemble members wait on the CPU until their forward pass.
                net = net.cpu()
                net.eval()
                members.append(net)
            models[kind] = members
            total_models += len(members)
        log.info(f'Using an ensemble of {total_models} models')
    else:
        device, gpu_ids = util.get_available_devices()
        log.info('Building model...')
        net = build_network(args.model, gpu_ids)
        log.info(f'Loading checkpoint from {args.load_path}...')
        net = util.load_model(net, args.load_path, gpu_ids, return_step=False)
        net = net.to(device)
        net.eval()
        models[args.model] = [net]

    # Data loader
    log.info('Building dataset...')
    arg_table = vars(args)
    dataset = SQuAD(arg_table[f'{args.split}_record_file'], args)
    loader = data.DataLoader(dataset,
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                             collate_fn=collate_fn)

    # Evaluate
    log.info(f'Evaluating on {args.split} split...')
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}   # Predictions for submission
    eval_file = arg_table[f'{args.split}_eval_file']
    with open(eval_file, 'r') as gold_fh:
        gold_dict = json_load(gold_fh)
    labeled = args.split != 'test'

    with torch.no_grad(), tqdm(total=len(dataset)) as bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, cw_pos, cw_ner, cw_freq, \
                cqw_extra, y1, y2, ids in loader:
            cw_idxs, qw_idxs = cw_idxs.to(device), qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            start_probs, end_probs = [], []
            for kind, members in models.items():
                for net in members:
                    # Bring the model onto the device for this forward pass.
                    net = net.to(device)
                    if kind == 'bidaf':
                        log_p1, log_p2 = net.to(device)(cw_idxs, qw_idxs)
                    else:
                        log_p1, log_p2 = net.to(device)(cw_idxs, qw_idxs,
                                                        cw_pos, cw_ner,
                                                        cw_freq, cqw_extra)
                    log_p1, log_p2 = log_p1.cpu(), log_p2.cpu()

                    if not args.use_ensemble:
                        # NLL is only tracked for the single-model run.
                        y1, y2 = y1.to(device), y2.to(device)
                        log_p1, log_p2 = log_p1.to(device), log_p2.to(device)
                        batch_loss = (F.nll_loss(log_p1, y1)
                                      + F.nll_loss(log_p2, y2))
                        nll_meter.update(batch_loss.item(), batch_size)

                    # Move model back to CPU to release GPU memory
                    net = net.cpu()

                    start_probs.append(log_p1.exp().unsqueeze(-1).cpu())
                    end_probs.append(log_p2.exp().unsqueeze(-1).cpu())

            # Per position, keep the highest probability any model assigned.
            stacked = torch.stack((torch.cat(start_probs, -1),
                                   torch.cat(end_probs, -1)), -1)
            best_ps = stacked.max(-2)[0]
            p1, p2 = best_ps[:, :, 0], best_ps[:, :, 1]
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            bar.update(batch_size)
            if labeled:
                # No labels for the test set, so NLL would be invalid
                bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                      ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if labeled:
        scores = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results = OrderedDict([('NLL', nll_meter.avg),
                               ('F1', scores['F1']),
                               ('EM', scores['EM'])])
        if args.use_squad_v2:
            results['AvNA'] = scores['AvNA']

        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
        log.info(f'{args.split.title()} {results_str}')

        tbx = SummaryWriter(args.save_dir)
        util.visualize(tbx,
                       pred_dict=pred_dict,
                       eval_path=eval_file,
                       step=0,
                       split=args.split,
                       num_visuals=args.num_visuals)

    # Write submission file
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info(f'Writing submission file to {sub_path}...')
    with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
        writer = csv.writer(csv_fh, delimiter=',')
        writer.writerow(['Id', 'Predicted'])
        writer.writerows([uuid, sub_dict[uuid]] for uuid in sorted(sub_dict))
def main(args):
    """Run a DRQA checkpoint over the test data and write a submission CSV.

    Reads embeddings and vocabulary sizes from `data/meta.msgpack`, restores
    the checkpoint at `args.load_path`, predicts spans for `data['test']` of
    the msgpack file named by `args.data_file`, and writes `Id,Predicted` rows
    to `<save_dir>/<split>_<sub_file>`.

    NOTE(review): `nll_meter` is never updated in this function, so the NLL
    reported for non-test splits is whatever a fresh `AverageMeter` holds.

    Args:
        args: Parsed command-line namespace. `vars(args)` is extended in place
            with the embedding and vocabulary settings the model needs.
    """
    # Set up
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    device, gpu_ids = util.get_available_devices()

    # Embeddings and vocabulary metadata
    log.info('Loading embeddings...')
    with open('data/meta.msgpack', 'rb') as meta_fh:
        meta = msgpack.load(meta_fh, encoding='utf8')
    embedding = torch.Tensor(meta['embedding'])
    opt = vars(args)
    opt['pretrained_words'] = True
    opt['vocab_size'] = embedding.size(0)
    opt['embedding_dim'] = embedding.size(1)
    opt['pos_size'] = len(meta['vocab_tag'])
    opt['ner_size'] = len(meta['vocab_ent'])

    # Model
    log.info('Building model...')
    net = nn.DataParallel(DRQA(opt, embedding=embedding), gpu_ids)
    log.info(f'Loading checkpoint from {args.load_path}...')
    net = util.load_model(net, args.load_path, gpu_ids, return_step=False)
    net = net.to(device)
    net.eval()

    # The batch generator reads the tag-set sizes from class attributes.
    BatchGen.pos_size = opt['pos_size']
    BatchGen.ner_size = opt['ner_size']

    # Data
    log.info('Building dataset...')
    with open(opt['data_file'], 'rb') as data_fh:
        payload = msgpack.load(data_fh, encoding='utf8')
    test = payload['test']

    # Evaluate
    log.info(f'Evaluating on test split...')
    batches = BatchGen(test,
                       batch_size=args.batch_size,
                       evaluation=True,
                       is_test=True,
                       gpu=args.cuda)
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}   # Predictions for submission
    eval_file = './data/test_eval.json'
    with open(eval_file, 'r') as gold_fh:
        gold_dict = json_load(gold_fh)

    with torch.no_grad(), tqdm(total=len(test)) as bar:
        net.eval()
        for batch in batches:
            # The first seven fields are the model inputs; the last is the ids.
            inputs = [field.to(device) for field in batch[:7]]
            ids = batch[-1]

            # The model outputs are passed to discretize as they are.
            p1, p2 = net(*inputs)
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            bar.update(args.batch_size)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                      ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if args.split != 'test':
        scores = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results = OrderedDict([('NLL', nll_meter.avg),
                               ('F1', scores['F1']),
                               ('EM', scores['EM'])])
        if args.use_squad_v2:
            results['AvNA'] = scores['AvNA']

        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
        log.info(f'{args.split.title()} {results_str}')

        tbx = SummaryWriter(args.save_dir)
        util.visualize(tbx,
                       pred_dict=pred_dict,
                       eval_path=eval_file,
                       step=0,
                       split=args.split,
                       num_visuals=args.num_visuals)

    # Write submission file
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info(f'Writing submission file to {sub_path}...')
    with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
        writer = csv.writer(csv_fh, delimiter=',')
        writer.writerow(['Id', 'Predicted'])
        writer.writerows([uuid, sub_dict[uuid]] for uuid in sorted(sub_dict))