예제 #1
0
 def forward(self,
             sketchy_prediction,
             intensive_prediction,
             log_p1,
             log_p2,
             max_len=15,
             use_squad_v2=True):
     s_in = log_p1.exp()
     e_in = log_p2.exp()
     starts, ends = discretize(s_in, e_in, max_len, use_squad_v2)
     # Combines answerability estimate from both the sketchy and intensive models
     pred_answerable = self.beta * intensive_prediction + \
         (1-self.beta) * sketchy_prediction
     # Calcultes how certain we are of intesives prediction
     has = torch.tensor([
         s_in[x, starts[x]] * e_in[x, ends[x]] for x in range(s_in.shape[0])
     ]).to(device='cuda:0')
     null = (s_in[:, 0] * e_in[:, 0]).to(device='cuda:0')
     span_answerable = has - null
     # Combines our answerability with our certainty
     answerable = self.lam * pred_answerable + \
         (1 - self.lam) * span_answerable
     l_p1 = log_p1.clone()
     l_p2 = log_p2.clone()
     l_p1[answerable <= self.ans] = 0
     l_p2[answerable <= self.ans] = 0
     return l_p1, l_p2
예제 #2
0
def evaluate(args, model, data_loader, device, eval_file, max_len,
             use_squad_v2):
    nll_meter = util.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
         tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, cw_pos, cw_ner, cw_freq, cqw_extra, y1, y2, ids in data_loader:

            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            cw_pos = cw_pos.to(device)
            cw_ner = cw_ner.to(device)
            cw_freq = cw_freq.to(device)
            cqw_extra = cqw_extra.to(device)

            batch_size = cw_idxs.size(0)

            # Forward
            if args.model == 'bidaf':
                log_p1, log_p2 = model(cw_idxs, qw_idxs)
            else:
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cw_pos, cw_ner,
                                       cw_freq, cqw_extra)

            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict, ids.tolist(),
                                           starts.tolist(), ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg), ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict
예제 #3
0
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2,
             args):
    nll_meter = util.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:

            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            ## Additions for BERT ##
            max_context_len, max_question_len = args.para_limit, args.ques_limit
            if "bert" in args.model_type:
                bert_dev_embeddings = get_embeddings("dev", ids,
                                                     args.para_limit,
                                                     args.ques_limit)
            else:
                bert_dev_embeddings = None

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs, bert_dev_embeddings, \
            max_context_len, max_question_len, device)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict, ids.tolist(),
                                           starts.tolist(), ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg), ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict
예제 #4
0
파일: mytest.py 프로젝트: JNXSTJ/squad
def test1():
    batch = 1
    c_len = 6

    start = torch.randn(batch, c_len)
    end = torch.randn(batch, c_len)
    start = torch.softmax(start, -1)
    end = torch.softmax(end, -1)
    start_ids, end_ids = util.discretize(start, end, max_len = 3,  no_answer = True)
    return start_ids, end_ids
예제 #5
0
def get_reward_for_arm(row, arm):

    true_arm = util.discretize(float(row['Therapeutic Dose of Warfarin']))

    reward = 0 if arm - 1 == true_arm else -1

    log.debug(
        'ID {} - Arm_{} Therapeutic Dose of Warfarin = {}|{}  => {}'.format(
            row['PharmGKB Subject ID'], arm,
            row['Therapeutic Dose of Warfarin'], true_arm, reward))

    return reward
예제 #6
0
파일: train.py 프로젝트: devansh287/squad
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2):
    nll_meter = util.AverageMeter()
    # print('Memory 3: ', torch.cuda.memory_allocated())
    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # print('Memory at start of loop section: ', torch.cuda.memory_allocated())
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            cc_idxs = cc_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            qc_idxs = qc_idxs.to(device)
            batch_size = cw_idxs.size(0)
            # print('Memory Before Forward Pass: '******'Memory After Forward Pass: '******'Memory After Loss Calc: ', torch.cuda.memory_allocated())
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)
            print('Max start idx score: ', torch.max(starts))
            print('Max End idx score: ', torch.max(ends))

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict, ids.tolist(),
                                           starts.tolist(), ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg), ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict
예제 #7
0
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2):
    nll_meter = util.AverageMeter()
    loss_f = torch.nn.CrossEntropyLoss()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    mems = (tuple(), tuple(), tuple())
    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            cc_idxs = cc_idxs.to(device)
            qc_idxs = qc_idxs.to(device)

            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2, mems = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs,
                                         *mems)
            y1, y2 = y1.to(device), y2.to(device)
            loss = torch.mean(loss_f(log_p1, y1) + loss_f(log_p2, y2))
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict, ids.tolist(),
                                           starts.tolist(), ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg), ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict
예제 #8
0
def AverageProbs(args, p_start, p_end, id_quest, gold_dict):
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    nbr_model = len(p_start)
    nbr_batch = len(p_start[0])
    for j in range(nbr_batch):
        p1 = np.zeros(p_start[0][j].shape)
        p2 = np.zeros(p_start[0][j].shape)
        for model in range(nbr_model):
            p1 += p_start[model][j]
            p2 += p_end[model][j]

        p1 /= nbr_model
        p2 /= nbr_model
        ids = id_quest[j]

        p1 = torch.from_numpy(p1).float().cuda()
        p2 = torch.from_numpy(p2).float().cuda()
        ids = ids.cuda()

        starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                       args.use_squad_v2)

        idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(),
                                                  starts.tolist(),
                                                  ends.tolist(),
                                                  args.use_squad_v2)
        pred_dict.update(idx2pred)
        sub_dict.update(uuid2pred)

    results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
    results_list = [('F1', results['F1']), ('EM', results['EM'])]
    if args.use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    # Log to console
    results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
    print(f'{args.split.title()} {results_str}')

    # Write submission file
    sub_path = join("../save/test/" + split + "/submission.csv")
    with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=',')
        csv_writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])
예제 #9
0
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2): # use_squad_v2 = True
    nll_meter = util.AverageMeter() #  Keep track of average values over time.

    model.eval() # Sets the module in evaluation mode.
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:  # ids
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)   ##

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()  # e^log_p1  e^log_p2
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)  ## return :  start_idxs &  end_idxs

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict,    ## Convert predictions to tokens from the context.
                                           ids.tolist(),
                                           starts.tolist(),
                                           ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)  ##
    results_list = [('NLL', nll_meter.avg),
                    ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict
예제 #10
0
    def compute_arm_index(self, data):
        dose_inputs = torch.cat([
            torch.ones(
                (data.size(0), 1), dtype=torch.float, device=data.device),
            data[:, list(column_mapping.values())]
        ], 1)
        dose_inputs.to(data.device)

        dose_params_tensor = torch.FloatTensor(self.dose_params).unsqueeze(-1)
        dose_params_tensor = dose_params_tensor.to(data.device)

        dose = torch.matmul(dose_inputs, dose_params_tensor)

        dose = dose**2

        arm_indexes = torch.LongTensor(util.discretize(dose.cpu()))
        arm_indexes = arm_indexes.to(data.device)

        return arm_indexes
예제 #11
0
    def process_sample(sample, model, gold_dict=None):
        cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids = sample
        batch_size = cw_idxs.size(0)
        log_p1, log_p2 = model(cw_idxs.to(device), cc_idxs.to(device),
                               qw_idxs.to(device), qc_idxs.to(device))
        y1, y2 = y1.to(device), y2.to(device)
        nll_loss_1 = F.nll_loss(log_p1, y1)
        nll_loss_2 = F.nll_loss(log_p2, y2)
        loss = nll_loss_1 + nll_loss_2
        preds = None
        if gold_dict:
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)
            preds, _ = util.convert_tokens(gold_dict, ids.tolist(),
                                           starts.tolist(), ends.tolist(),
                                           args.use_squad_v2)

        return loss, batch_size, preds
예제 #12
0
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2):
    nll_meter = util.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
        # Setup for forward
        cw_idxs = cw_idxs.to(device)
        cc_idxs = cc_idxs.to(device)
        qw_idxs = qw_idxs.to(device)
        qc_idxs = qc_idxs.to(device)
        batch_size = cc_idxs.size(0)

        # Forward
        log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
        y1, y2 = y1.to(device), y2.to(device)
        loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
        nll_meter.update(loss.item(), batch_size)

        # Get F1 and EM scores
        p1, p2 = log_p1.exp(), log_p2.exp()
        starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

        preds, _ = util.convert_tokens(gold_dict,
                                       ids.tolist(), starts.tolist(),
                                       ends.tolist(), use_squad_v2)
        pred_dict.update(preds)

    # Log info
    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg), ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict
예제 #13
0
def WeightedAverage(args, p_start, p_end, weigths, id_quest, gold_dict):
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    nbr_model = len(p_start)
    nbr_batch = len(p_start[0])
    for j in range(nbr_batch):
        p1 = np.zeros(p_start[0][j].shape)
        p2 = np.zeros(p_start[0][j].shape)
        for model in range(nbr_model):
            p1 += weigths[model] * p_start[model][j]
            p2 += weigths[model] * p_end[model][j]

        ids = id_quest[j]

        p1 = torch.from_numpy(p1).float().cuda()
        p2 = torch.from_numpy(p2).float().cuda()
        ids = ids.cuda()

        starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                       args.use_squad_v2)

        idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(),
                                                  starts.tolist(),
                                                  ends.tolist(),
                                                  args.use_squad_v2)
        pred_dict.update(idx2pred)
        sub_dict.update(uuid2pred)

    results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
    results_list = [('F1', results['F1']), ('EM', results['EM'])]
    if args.use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    # Log to console
    results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
    print(f'{args.split.title()} {results_str}')
예제 #14
0
파일: train.py 프로젝트: qwang70/squad
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2,
             enable_EM, enable_posner):
    nll_meter = util.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids, cwf, lemma_indicators, c_posner, q_posner in data_loader:
            # Setup for forward
            cc_idxs = cc_idxs.to(device)
            qc_idxs = qc_idxs.to(device)
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            if enable_EM:
                cwf = cwf.to(device)
                #qwf = qwf.to(device)
                lemma_indicators = lemma_indicators.to(device)
            else:
                cwf = None
                lemma_indicators = None
            if enable_posner:
                c_posner = c_posner.to(device)
                q_posner = q_posner.to(device)
            else:
                q_posner = None
                c_posner = None
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cc_idxs, qc_idxs, cw_idxs, qw_idxs, cwf,
                                   lemma_indicators, c_posner, q_posner)
            y1, y2 = y1.to(device), y2.to(device)
            weight = torch.ones(log_p1.size(1), device=device)
            weight[0] = 5
            loss = F.nll_loss(log_p1, y1, weight=weight) + F.nll_loss(
                log_p2, y2, weight=weight)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict, ids.tolist(),
                                           starts.tolist(), ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg), ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict
예제 #15
0
def main(args):
    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    #log = util.get_logger(args.save_dir, args.name)
    #log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    print('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    print('Building model...')
    model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size)
    model = nn.DataParallel(model, gpu_ids)
    print(f'Loading checkpoint from {args.load_path}...')
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    print('Building dataset...')
    #record_file = vars(args)[f'{args.split}_record_file']
    dataset = SQuAD("./data/my_test.npz", args.use_squad_v2)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Evaluate
    print(f'Evaluating on {args.split} split...')
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    #eval_file = vars(args)[f'{args.split}_eval_file']
    with open("./data/my_test_eval.json", 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            print("viewing the dataset")
            print(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)
            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            #if args.split != 'test':
            # No labels for the test set, so NLL would be invalid
            #progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    print("my evaluation ....")

    for el in pred_dict:
        print(el, pred_dict[el])

    for el in sub_dict:
        print(el, sub_dict[el])
예제 #16
0
def main(args):
    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    if args.model == 'bidaf':
        model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size)
    elif args.model == 'bidafextra':
        model = BiDAFExtra(word_vectors=word_vectors, args=args)
    elif args.model == 'fusionnet':
        model = FusionNet(word_vectors=word_vectors, args=args)

    model = nn.DataParallel(model, gpu_ids)
    log.info(f'Loading checkpoint from {args.load_path}...')
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    log.info('Building dataset...')
    record_file = vars(args)[f'{args.split}_record_file']
    dataset = SQuAD(record_file, args)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)
    # print("*"*80)
    # print(len(dataset.question_idxs))

    # for question_idx in dataset.question_idxs:
    #    print(question_idx)
    #    print("*" * 80)

    # print(self.question_idxs[question_idx])
    # self.question_idxs[idx]
    # print("data_loader: ",data_loader)
    # Evaluate
    log.info(f'Evaluating on {args.split} split...')
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    eval_file = vars(args)[f'{args.split}_eval_file']
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)

    # create statistics
    # print("*"*80)
    # print(len(gold_dict))
    # print(gold_dict['1']['question'])

    count_questions_type = defaultdict(lambda: 0)

    audit_trail_from_question_type = defaultdict(lambda: [])
    list_of_interrogative_pronouns = [
        "what", "whose", "why", "which", "where", "when", "how", "who", "whom"
    ]

    for index in range(1, len(gold_dict)):
        # transform the question in lower case to simplify the analysis, thus losing the benefit of the capital letters
        # possibly indicating the position of the interrogative pronoun in the sentence.
        question_lower_case = gold_dict[str(index)]['question'].lower()

        list_question_lower_case_with_punctuation = question_lower_case.translate(
            {ord(i): " "
             for i in "'"}).split()

        #
        question_lower_case = []
        for item in list_question_lower_case_with_punctuation:
            question_lower_case.append(
                item.translate({ord(i): ""
                                for i in ",.<>!@£$%^&*()_-+=?"}))

        # defining a variable for the first word
        first_word_question_lower_case = question_lower_case[0]

        # defining variable for the second word
        second_word_question_lower_case = question_lower_case[1]

        # defining variable for the first and second word
        combined_first_and_second_words = first_word_question_lower_case + " " + second_word_question_lower_case

        #printing on the screen test for debugging purpose

        # Analyzing the sentence
        if first_word_question_lower_case in list_of_interrogative_pronouns:
            count_questions_type[first_word_question_lower_case] += 1
            audit_trail_from_question_type[
                first_word_question_lower_case].append(str(index))
        # composed question starting by in
        elif first_word_question_lower_case == "in":
            if second_word_question_lower_case in list_of_interrogative_pronouns and second_word_question_lower_case != "whose":
                count_questions_type[combined_first_and_second_words] += 1
                audit_trail_from_question_type[
                    combined_first_and_second_words].append(str(index))
            else:
                pronoun = find_first_interrogative_pronoun(
                    list_of_interrogative_pronouns, question_lower_case)
                count_questions_type[pronoun] += 1
                audit_trail_from_question_type[pronoun].append(str(index))

        # composed question starting by by
        elif first_word_question_lower_case == "by":
            if second_word_question_lower_case in list_of_interrogative_pronouns \
                    and second_word_question_lower_case !="whom"\
                    and second_word_question_lower_case !="which"\
                    and second_word_question_lower_case !="when"\
                    and second_word_question_lower_case !="how":
                count_questions_type[combined_first_and_second_words] += 1
                audit_trail_from_question_type[
                    combined_first_and_second_words].append(str(index))
            else:
                pronoun = find_first_interrogative_pronoun(
                    list_of_interrogative_pronouns, question_lower_case)
                count_questions_type[pronoun] += 1
                audit_trail_from_question_type[pronoun].append(str(index))

        else:
            pronoun = find_first_interrogative_pronoun(
                list_of_interrogative_pronouns, question_lower_case)
            #if pronoun =="":
            #    print(">>", question_lower_case)
            #    print("@@@", gold_dict[str(index)]['question'])
            count_questions_type[pronoun] += 1
            audit_trail_from_question_type[pronoun].append(str(index))
            # if pronoun =="":
            #    print(">>", question_lower_case.split())
            # print()
            #if first_word_question_lower_case == "if":
            #    print(">>", question_lower_case.split())

    # print(count_questions_type)
    # if gold_dict[str(index)]['question'].lower().split()[0] == "in":
    #    print(gold_dict[str(index)]['question'])

    reverse_dict_by_value = OrderedDict(
        sorted(count_questions_type.items(), key=lambda x: x[1]))
    # print(count_questions_type)
    total_questions = sum(count_questions_type.values())
    # print(reverse_dict)
    #for k, v in reverse_dict_by_value.items():
    #   print( "%s: %s and in percentage: %s" % (k, v, 100*v/total_questions))
    #print(audit_trail_from_question_type)
    # exit()
    with torch.no_grad(), \
         tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, cw_pos, cw_ner, cw_freq, cqw_extra, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            if args.model == 'bidaf':
                log_p1, log_p2 = model(cw_idxs, qw_idxs)
            else:
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cw_pos, cw_ner,
                                       cw_freq, cqw_extra)

            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != 'test':
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if args.split != 'test':
        results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)

        # Printing information for questions without interrogative pronouns
        """"
        print("len(gold_dict): ", len(gold_dict))
        print("len(pred_dict): ", len(pred_dict))
        print("Is gold_dict.keys() identical to pred_dict.keys(): ", gold_dict.keys()==pred_dict.keys())
        if gold_dict.keys()!=pred_dict.keys():
            for key in gold_dict.keys():
                if key not in pred_dict.keys():
                    print("key ", key, " missing in pred_dict.keys(")
        """
        results_list = [('NLL', nll_meter.avg), ('F1', results['F1']),
                        ('EM', results['EM'])]
        if args.use_squad_v2:
            results_list.append(('AvNA', results['AvNA']))
        results = OrderedDict(results_list)

        # Computing the F1 score for each type of question
        #
        #    audit_trail_from_question_type[pronoun].append(str(index))

        # create a list of the types of questions by extracting the keys from the dict audit_trail_from_question_type
        types_of_questions = list(audit_trail_from_question_type.keys())

        gold_dict_per_type_of_questions = defaultdict(lambda: [])
        pred_dict_per_type_of_questions = {}

        gold_dict_per_type_of_questions_start = {}
        pred_dict_per_type_of_questions_start = {}

        gold_dict_per_type_of_questions_middle = {}
        pred_dict_per_type_of_questions_middle = {}

        gold_dict_per_type_of_questions_end = {}
        pred_dict_per_type_of_questions_end = {}

        for type_of_questions in types_of_questions:
            #gold_pred = {key: value for key, value in gold_dict.items() if key in audit_trail_from_question_type[type_of_questions]}
            #lst_pred = {key: value for key, value in pred_dict.items() if key in audit_trail_from_question_type[type_of_questions]}

            # Create two dictionnaries for each type of sentence for gold_dict_per_type_of_questions and pred_dict_per_type_of_questions
            gold_dict_per_type_of_questions[type_of_questions] = {
                key: value
                for key, value in gold_dict.items()
                if key in audit_trail_from_question_type[type_of_questions]
                and key in pred_dict.keys()
            }
            pred_dict_per_type_of_questions[type_of_questions] = {
                key: value
                for key, value in pred_dict.items()
                if key in audit_trail_from_question_type[type_of_questions]
                and key in pred_dict.keys()
            }
            # print(type_of_questions," F1 score: ", util.eval_dicts(gold_dict_per_type_of_questions[type_of_questions], pred_dict_per_type_of_questions[type_of_questions], args.use_squad_v2)['F1'])

            gold_dict_per_type_of_questions_start[type_of_questions] = {
                key: value
                for key, value in gold_dict.items()
                if key in audit_trail_from_question_type[type_of_questions]
                and key in pred_dict.keys()
            }
            pred_dict_per_type_of_questions_start[type_of_questions] = {
                key: value
                for key, value in pred_dict.items()
                if key in audit_trail_from_question_type[type_of_questions]
                and key in pred_dict.keys()
            }

            gold_dict_per_type_of_questions_middle[type_of_questions] = {
                key: value
                for key, value in gold_dict.items()
                if key in audit_trail_from_question_type[type_of_questions]
                and key in pred_dict.keys()
            }
            pred_dict_per_type_of_questions_middle[type_of_questions] = {
                key: value
                for key, value in pred_dict.items()
                if key in audit_trail_from_question_type[type_of_questions]
                and key in pred_dict.keys()
            }

            gold_dict_per_type_of_questions_end[type_of_questions] = {
                key: value
                for key, value in gold_dict.items()
                if key in audit_trail_from_question_type[type_of_questions]
                and key in pred_dict.keys()
            }
            pred_dict_per_type_of_questions_end[type_of_questions] = {
                key: value
                for key, value in pred_dict.items()
                if key in audit_trail_from_question_type[type_of_questions]
                and key in pred_dict.keys()
            }

            for key, value in gold_dict.items():
                #if key in audit_trail_from_question_type[type_of_questions] and key in pred_dict.keys():
                if key in audit_trail_from_question_type[
                        type_of_questions] and type_of_questions != "" and key in pred_dict_per_type_of_questions[
                            type_of_questions]:
                    """
                    print("type_of_questions: ",type_of_questions)
                    print("key: ", key)
                    print("question: ", value["question"])
                    sub_index = value["question"].lower().find(type_of_questions)
                    print("sub_index: ",sub_index)
                    test_fc = value["question"].lower().find(type_of_questions)
                    print("present type of the var: ",type(test_fc))
                    #print("question: ", value["question"][str(key)])
                    print("length of the question: ", len(value["question"]))
                    print('Position of the interrogative pronoun in the question:', )
                    """
                    # Create two dictionnaries for each type of sentence based at the start of the sentence

                    if value["question"].lower().find(
                            type_of_questions) == 1 or value["question"].lower(
                            ).find(type_of_questions) == 0:
                        #print("BEGINNING")
                        if type_of_questions != "":
                            try:
                                del gold_dict_per_type_of_questions_middle[
                                    type_of_questions][key]
                            except KeyError:
                                pass

                            try:
                                del pred_dict_per_type_of_questions_middle[
                                    type_of_questions][key]
                            except KeyError:
                                pass

                            try:
                                del gold_dict_per_type_of_questions_end[
                                    type_of_questions][key]
                            except KeyError:
                                pass

                            try:
                                del pred_dict_per_type_of_questions_end[
                                    type_of_questions][key]
                            except KeyError:
                                pass

                        #pred_dict_per_type_of_questions_start[type_of_questions] = {key: pred_dict[key] for key in
                        #                                                            gold_dict_per_type_of_questions_start[
                        #                                                                type_of_questions].keys()}
                    elif value["question"].lower(
                    ).find(type_of_questions) >= len(
                            value["question"]) - len(type_of_questions) - 5:
                        #print("END")

                        if type_of_questions != "":
                            try:
                                del gold_dict_per_type_of_questions_middle[
                                    type_of_questions][key]
                            except KeyError:
                                pass

                            try:
                                del pred_dict_per_type_of_questions_middle[
                                    type_of_questions][key]
                            except KeyError:
                                pass

                            try:
                                del gold_dict_per_type_of_questions_start[
                                    type_of_questions][key]
                            except KeyError:
                                pass

                            try:
                                del pred_dict_per_type_of_questions_start[
                                    type_of_questions][key]
                            except KeyError:
                                pass

                        #print("type_of_questions: ",type_of_questions)
                        #sub_index = value["question"].lower().find(type_of_questions)
                        #print("sub_index: ", sub_index)
                        #print("len(value['question']) - len(type_of_questions) - 2: ", len(value["question"])-len(type_of_questions)-2)
                        #start_string = len(value["question"])-len(type_of_questions)-6
                        #end_string = len(value["question"])-1
                        #print("extract at the end: ", value["question"][start_string:end_string])
                    else:
                        #print("MIDDLE")
                        if type_of_questions != "":
                            try:
                                del gold_dict_per_type_of_questions_start[
                                    type_of_questions][key]
                            except KeyError:
                                pass

                            try:
                                del pred_dict_per_type_of_questions_start[
                                    type_of_questions][key]
                            except KeyError:
                                pass

                            try:
                                del gold_dict_per_type_of_questions_end[
                                    type_of_questions][key]
                            except KeyError:
                                pass

                            try:
                                del pred_dict_per_type_of_questions_end[
                                    type_of_questions][key]
                            except KeyError:
                                pass
                            pass
            """
            if  type_of_questions != "":
                gold_dict_per_type_of_questions_start[type_of_questions] = {key: value for key, value in gold_dict.items() if (key in audit_trail_from_question_type[type_of_questions] \
                                                                        and (value["question"].lower().find(type_of_questions) <= 1) \
                                                                        and key in pred_dict_per_type_of_questions[type_of_questions]) }
            """
            """
                for key in gold_dict_per_type_of_questions_start[type_of_questions].keys():
                    print("key:: ", key )
                    print("type(key):: ", type(key) )
                            print("pred_dict[,key,] : ", pred_dict[key])
                print("@@@@@@@@@@@@@@@@@@@@@@@@")
                
                pred_dict_per_type_of_questions_start[type_of_questions] = {key: pred_dict[key] for key in gold_dict_per_type_of_questions_start[type_of_questions].keys()}

                #pred_dict_per_type_of_questions_start[type_of_questions] = {key: value for key, value in pred_dict.items() if key in list(gold_dict_per_type_of_questions_start[type_of_questions].keys()) }

                # Create two dictionnaries for each type of sentence based at the end of the sentence
                gold_dict_per_type_of_questions_end[type_of_questions] = {key: value for key, value in gold_dict.items() if key in audit_trail_from_question_type[type_of_questions] \
                                                                        and value["question"].lower().find(type_of_questions) >= len(value["question"])-len(type_of_questions)-2 \
                                                                        and key in pred_dict_per_type_of_questions[type_of_questions]}


                pred_dict_per_type_of_questions_end[type_of_questions] = {key: pred_dict[key] for key in list(gold_dict_per_type_of_questions_end[type_of_questions].keys())}

                #print("*"*80)
                # Create two dictionnaries for each type of sentence based at the middle of the sentencecount_questions_type
                gold_dict_per_type_of_questions_middle[type_of_questions] = {key: value for key, value in gold_dict.items() if key not in list(gold_dict_per_type_of_questions_start[type_of_questions].keys()) \
                                                                            and key not in list(gold_dict_per_type_of_questions_end[type_of_questions].keys())}

                pred_dict_per_type_of_questions_middle[type_of_questions] = {key: pred_dict[key] for key in list(gold_dict_per_type_of_questions_end[type_of_questions].keys())}
            else:
                gold_dict_per_type_of_questions_start[""] = gold_dict_per_type_of_questions[""]
                pred_dict_per_type_of_questions_start[""] = pred_dict_per_type_of_questions[""]
                gold_dict_per_type_of_questions_end[""] = gold_dict_per_type_of_questions[""]
                pred_dict_per_type_of_questions_end[""] = pred_dict_per_type_of_questions[""]
                gold_dict_per_type_of_questions_middle[""] = gold_dict_per_type_of_questions[""]
                pred_dict_per_type_of_questions_middle[""] = pred_dict_per_type_of_questions[""]
        """

        positions_in_question = ["beginning", "middle", "end"]

        # print(type_of_questions," F1 score: ", util.eval_dicts(gold_dict_per_type_of_questions[type_of_questions], pred_dict_per_type_of_questions[type_of_questions], args.use_squad_v2)['F1'])

        list_beginning = [
            util.eval_dicts(
                gold_dict_per_type_of_questions_start[type_of_questions],
                pred_dict_per_type_of_questions_start[type_of_questions],
                args.use_squad_v2)['F1']
            for type_of_questions in types_of_questions
        ]
        list_middle = [
            util.eval_dicts(
                gold_dict_per_type_of_questions_middle[type_of_questions],
                pred_dict_per_type_of_questions_middle[type_of_questions],
                args.use_squad_v2)['F1']
            for type_of_questions in types_of_questions
        ]
        list_end = [
            util.eval_dicts(
                gold_dict_per_type_of_questions_end[type_of_questions],
                pred_dict_per_type_of_questions_end[type_of_questions],
                args.use_squad_v2)['F1']
            for type_of_questions in types_of_questions
        ]

        #for type_of_questions in types_of_questions:
        #    print("gold_dict_per_type_of_questions_start[type_of_questions]: ",gold_dict_per_type_of_questions_start[type_of_questions])
        #    print("pred_dict_per_type_of_questions[type_of_questions]: ",pred_dict_per_type_of_questions[type_of_questions])

        F1 = np.array([list_beginning, list_middle, list_end])

        m, n = F1.shape

        value_to_ignore = []
        for i in range(m):
            for j in range(n):
                if F1[i, j] == "NA" or F1[i, j] == 0:
                    value_to_ignore.append((i, j))
        print("value to ignore: ", value_to_ignore)
        #F1 = np.array([[0, 0, 0, 0, 0.1, 0, 0, 0, 0, 0, 0, 0, 0],
        #                    [0, 0, 0, 0, 0.1, 0, 0, 0, 0, 0, 0, 0, 0],
        #                    [0, 0, 0, 0, 0.1, 0, 0, 0, 0, 0, 0, 0, 0]])

        data_label = copy.deepcopy(F1)

        for row in data_label:
            for column_idx in range(len(row)):
                if row[column_idx] == "NA":
                    row[column_idx] = ""

        # print question without interrogative pronoun required for the second part of the analysis:
        for key, value in gold_dict.items():
            if key in audit_trail_from_question_type[
                    ""] and key in pred_dict.keys():
                print("question: ", gold_dict_per_type_of_questions[''])
                print("golden answers: ", )
                print("prediction: ", pred_dict[key])
                print()

        fig, ax = plt.subplots()

        types_of_questions[types_of_questions.index(
            "")] = "Implicit question without interrogative pronoun"

        im, cbar = heatmap(F1, positions_in_question, types_of_questions, ax=ax, \
                            cmap="YlGn", cbarlabel="F1 scores")

        texts = annotate_heatmap(im,
                                 data=data_label,
                                 valfmt="{x:.1f}",
                                 ignore=value_to_ignore)

        fig.tight_layout()
        plt.show()

        # Log to console
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
        log.info(f'{args.split.title()} {results_str}')

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(tbx,
                       pred_dict=pred_dict,
                       eval_path=eval_file,
                       step=0,
                       split=args.split,
                       num_visuals=args.num_visuals)

    # Write submission file
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info(f'Writing submission file to {sub_path}...')
    with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=',')
        csv_writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2):
    nll_meter = util.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)

    if use_syll:
        with open('data/word_idx2syll_idx.json') as json_file:  
            word_idx2syll_idx = json.load(json_file)

    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            batch_size = cw_idxs.size(0)

            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)

            if use_char:
                # prepare character indices
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)

            if use_syll:
                # convert word index to syllable index
                cs_idxs = word2syll_idxs(cw_idxs, word_idx2syll_idx).to(device)
                qs_idxs = word2syll_idxs(qw_idxs, word_idx2syll_idx).to(device)

            if use_char and use_syll:
                log_p1, log_p2 = model(cw_idxs=cw_idxs, qw_idxs=qw_idxs, 
                                       cc_idxs=cc_idxs, qc_idxs=qc_idxs, 
                                       cs_idxs=cs_idxs, qs_idxs=qs_idxs)
            elif use_char:
                log_p1, log_p2 = model(cw_idxs=cw_idxs, qw_idxs=qw_idxs, 
                                       cc_idxs=cc_idxs, qc_idxs=qc_idxs)
            elif use_syll:
                log_p1, log_p2 = model(cw_idxs=cw_idxs, qw_idxs=qw_idxs, 
                                       cs_idxs=cs_idxs, qs_idxs=qs_idxs)
            else:
                log_p1, log_p2 = model(cw_idxs=cw_idxs, qw_idxs=qw_idxs)

            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict,
                                           ids.tolist(),
                                           starts.tolist(),
                                           ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg),
                    ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict
예제 #18
0
def main(args):
    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)
    # Get model
    log.info('Building model...')
    nbr_model = 0
    if (args.load_path_baseline):
        model_baseline = Baseline(word_vectors=word_vectors, hidden_size=100)
        model_baseline = nn.DataParallel(model_baseline, gpu_ids)
        log.info(f'Loading checkpoint from {args.load_path_baseline}...')
        model_baseline = util.load_model(model_baseline,
                                         args.load_path_baseline,
                                         gpu_ids,
                                         return_step=False)
        model_baseline = model_baseline.to(device)
        model_baseline.eval()
        nll_meter_baseline = util.AverageMeter()
        nbr_model += 1
        save_prob_baseline_start = []
        save_prob_baseline_end = []

    if (args.load_path_bidaf):
        model_bidaf = BiDAF(word_vectors=word_vectors,
                            char_vectors=char_vectors,
                            char_emb_dim=args.char_emb_dim,
                            hidden_size=args.hidden_size)
        model_bidaf = nn.DataParallel(model_bidaf, gpu_ids)
        log.info(f'Loading checkpoint from {args.load_path_bidaf}...')
        model_bidaf = util.load_model(model_bidaf,
                                      args.load_path_bidaf,
                                      gpu_ids,
                                      return_step=False)
        model_bidaf = model_bidaf.to(device)
        model_bidaf.eval()
        nll_meter_bidaf = util.AverageMeter()
        nbr_model += 1
        save_prob_bidaf_start = []
        save_prob_bidaf_end = []

    if (args.load_path_bidaf_fusion):
        model_bidaf_fu = BiDAF_fus(word_vectors=word_vectors,
                                   char_vectors=char_vectors,
                                   char_emb_dim=args.char_emb_dim,
                                   hidden_size=args.hidden_size)
        model_bidaf_fu = nn.DataParallel(model_bidaf_fu, gpu_ids)
        log.info(f'Loading checkpoint from {args.load_path_bidaf_fusion}...')
        model_bidaf_fu = util.load_model(model_bidaf_fu,
                                         args.load_path_bidaf_fusion,
                                         gpu_ids,
                                         return_step=False)
        model_bidaf_fu = model_bidaf_fu.to(device)
        model_bidaf_fu.eval()
        nll_meter_bidaf_fu = util.AverageMeter()
        nbr_model += 1
        save_prob_bidaf_fu_start = []
        save_prob_bidaf_fu_end = []

    if (args.load_path_qanet):
        model_qanet = QANet(word_vectors=word_vectors,
                            char_vectors=char_vectors,
                            char_emb_dim=args.char_emb_dim,
                            hidden_size=args.hidden_size,
                            n_heads=args.n_heads,
                            n_conv_emb_enc=args.n_conv_emb,
                            n_conv_mod_enc=args.n_conv_mod,
                            n_emb_enc_blocks=args.n_emb_blocks,
                            n_mod_enc_blocks=args.n_mod_blocks,
                            divisor_dim_kqv=args.divisor_dim_kqv)

        model_qanet = nn.DataParallel(model_qanet, gpu_ids)
        log.info(f'Loading checkpoint from {args.load_path_qanet}...')
        model_qanet = util.load_model(model_qanet,
                                      args.load_path_qanet,
                                      gpu_ids,
                                      return_step=False)
        model_qanet = model_qanet.to(device)
        model_qanet.eval()
        nll_meter_qanet = util.AverageMeter()
        nbr_model += 1
        save_prob_qanet_start = []
        save_prob_qanet_end = []

    if (args.load_path_qanet_old):
        model_qanet_old = QANet_old(word_vectors=word_vectors,
                                    char_vectors=char_vectors,
                                    device=device,
                                    char_emb_dim=args.char_emb_dim,
                                    hidden_size=args.hidden_size,
                                    n_heads=args.n_heads,
                                    n_conv_emb_enc=args.n_conv_emb,
                                    n_conv_mod_enc=args.n_conv_mod,
                                    n_emb_enc_blocks=args.n_emb_blocks,
                                    n_mod_enc_blocks=args.n_mod_blocks)

        model_qanet_old = nn.DataParallel(model_qanet_old, gpu_ids)
        log.info(f'Loading checkpoint from {args.load_path_qanet_old}...')
        model_qanet_old = util.load_model(model_qanet_old,
                                          args.load_path_qanet_old,
                                          gpu_ids,
                                          return_step=False)
        model_qanet_old = model_qanet_old.to(device)
        model_qanet_old.eval()
        nll_meter_qanet_old = util.AverageMeter()
        nbr_model += 1
        save_prob_qanet_old_start = []
        save_prob_qanet_old_end = []

    if (args.load_path_qanet_inde):
        model_qanet_inde = QANet_independant_encoder(
            word_vectors=word_vectors,
            char_vectors=char_vectors,
            char_emb_dim=args.char_emb_dim,
            hidden_size=args.hidden_size,
            n_heads=args.n_heads,
            n_conv_emb_enc=args.n_conv_emb,
            n_conv_mod_enc=args.n_conv_mod,
            n_emb_enc_blocks=args.n_emb_blocks,
            n_mod_enc_blocks=args.n_mod_blocks,
            divisor_dim_kqv=args.divisor_dim_kqv)

        model_qanet_inde = nn.DataParallel(model_qanet_inde, gpu_ids)
        log.info(f'Loading checkpoint from {args.load_path_qanet_inde}...')
        model_qanet_inde = util.load_model(model_qanet_inde,
                                           args.load_path_qanet_inde,
                                           gpu_ids,
                                           return_step=False)
        model_qanet_inde = model_qanet_inde.to(device)
        model_qanet_inde.eval()
        nll_meter_qanet_inde = util.AverageMeter()
        nbr_model += 1
        save_prob_qanet_inde_start = []
        save_prob_qanet_inde_end = []

    if (args.load_path_qanet_s_e):
        model_qanet_s_e = QANet_S_E(word_vectors=word_vectors,
                                    char_vectors=char_vectors,
                                    char_emb_dim=args.char_emb_dim,
                                    hidden_size=args.hidden_size,
                                    n_heads=args.n_heads,
                                    n_conv_emb_enc=args.n_conv_emb,
                                    n_conv_mod_enc=args.n_conv_mod,
                                    n_emb_enc_blocks=args.n_emb_blocks,
                                    n_mod_enc_blocks=args.n_mod_blocks,
                                    divisor_dim_kqv=args.divisor_dim_kqv)

        model_qanet_s_e = nn.DataParallel(model_qanet_s_e, gpu_ids)
        log.info(f'Loading checkpoint from {args.load_path_qanet_s_e}...')
        model_qanet_s_e = util.load_model(model_qanet_s_e,
                                          args.load_path_qanet_s_e,
                                          gpu_ids,
                                          return_step=False)
        model_qanet_s_e = model_qanet_s_e.to(device)
        model_qanet_s_e.eval()
        nll_meter_qanet_s_e = util.AverageMeter()
        nbr_model += 1
        save_prob_qanet_s_e_start = []
        save_prob_qanet_s_e_end = []

    # Get data loader
    log.info('Building dataset...')
    record_file = vars(args)[f'{args.split}_record_file']
    dataset = SQuAD(record_file, args.use_squad_v2)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Evaluate
    log.info(f'Evaluating on {args.split} split...')
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    eval_file = vars(args)[f'{args.split}_eval_file']
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            cc_idxs = cc_idxs.to(device)
            qc_idxs = qc_idxs.to(device)
            batch_size = cw_idxs.size(0)

            y1, y2 = y1.to(device), y2.to(device)
            l_p1, l_p2 = [], []
            # Forward
            if (args.load_path_baseline):
                log_p1_baseline, log_p2_baseline = model_baseline(
                    cw_idxs, cc_idxs)
                loss_baseline = F.nll_loss(log_p1_baseline, y1) + F.nll_loss(
                    log_p2_baseline, y2)
                nll_meter_baseline.update(loss_baseline.item(), batch_size)
                l_p1 += [log_p1_baseline.exp()]
                l_p2 += [log_p2_baseline.exp()]
                if (args.save_probabilities):
                    save_prob_baseline_start += [
                        log_p1_baseline.exp().detach().cpu().numpy()
                    ]
                    save_prob_baseline_end += [
                        log_p2_baseline.exp().detach().cpu().numpy()
                    ]

            if (args.load_path_qanet):
                log_p1_qanet, log_p2_qanet = model_qanet(
                    cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                loss_qanet = F.nll_loss(log_p1_qanet, y1) + F.nll_loss(
                    log_p2_qanet, y2)
                nll_meter_qanet.update(loss_qanet.item(), batch_size)
                # Get F1 and EM scores
                l_p1 += [log_p1_qanet.exp()]
                l_p2 += [log_p2_qanet.exp()]
                if (args.save_probabilities):
                    save_prob_qanet_start += [
                        log_p1_qanet.exp().detach().cpu().numpy()
                    ]
                    save_prob_qanet_end += [
                        log_p2_qanet.exp().detach().cpu().numpy()
                    ]

            if (args.load_path_qanet_old):
                log_p1_qanet_old, log_p2_qanet_old = model_qanet_old(
                    cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                loss_qanet_old = F.nll_loss(log_p1_qanet_old, y1) + F.nll_loss(
                    log_p2_qanet_old, y2)
                nll_meter_qanet_old.update(loss_qanet_old.item(), batch_size)
                # Get F1 and EM scores
                l_p1 += [log_p1_qanet_old.exp()]
                l_p2 += [log_p2_qanet_old.exp()]
                if (args.save_probabilities):
                    save_prob_qanet_old_start += [
                        log_p1_qanet_old.exp().detach().cpu().numpy()
                    ]
                    save_prob_qanet_old_end += [
                        log_p2_qanet_old.exp().detach().cpu().numpy()
                    ]

            if (args.load_path_qanet_inde):
                log_p1_qanet_inde, log_p2_qanet_inde = model_qanet_inde(
                    cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                loss_qanet_inde = F.nll_loss(
                    log_p1_qanet_inde, y1) + F.nll_loss(log_p2_qanet_inde, y2)
                nll_meter_qanet_inde.update(loss_qanet_inde.item(), batch_size)
                # Get F1 and EM scores
                l_p1 += [log_p1_qanet_inde.exp()]
                l_p2 += [log_p2_qanet_inde.exp()]
                if (args.save_probabilities):
                    save_prob_qanet_inde_start += [
                        log_p1_qanet_inde.exp().detach().cpu().numpy()
                    ]
                    save_prob_qanet_inde_end += [
                        log_p2_qanet_inde.exp().detach().cpu().numpy()
                    ]

            if (args.load_path_qanet_s_e):
                log_p1_qanet_s_e, log_p2_qanet_s_e = model_qanet_s_e(
                    cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                loss_qanet_s_e = F.nll_loss(log_p1_qanet_s_e, y1) + F.nll_loss(
                    log_p2_qanet_s_e, y2)
                nll_meter_qanet_s_e.update(loss_qanet_s_e.item(), batch_size)
                # Get F1 and EM scores
                l_p1 += [log_p1_qanet_s_e.exp()]
                l_p2 += [log_p2_qanet_s_e.exp()]
                if (args.save_probabilities):
                    save_prob_qanet_s_e_start += [
                        log_p1_qanet_s_e.exp().detach().cpu().numpy()
                    ]
                    save_prob_qanet_s_e_end += [
                        log_p2_qanet_s_e.exp().detach().cpu().numpy()
                    ]

            if (args.load_path_bidaf):
                log_p1_bidaf, log_p2_bidaf = model_bidaf(
                    cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                loss_bidaf = F.nll_loss(log_p1_bidaf, y1) + F.nll_loss(
                    log_p2_bidaf, y2)
                nll_meter_bidaf.update(loss_bidaf.item(), batch_size)
                l_p1 += [log_p1_bidaf.exp()]
                l_p2 += [log_p2_bidaf.exp()]
                if (args.save_probabilities):
                    save_prob_bidaf_start += [
                        log_p1_bidaf.exp().detach().cpu().numpy()
                    ]
                    save_prob_bidaf_end += [
                        log_p2_bidaf.exp().detach().cpu().numpy()
                    ]

            if (args.load_path_bidaf_fusion):
                log_p1_bidaf_fu, log_p2_bidaf_fu = model_bidaf_fu(
                    cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                loss_bidaf_fu = F.nll_loss(log_p1_bidaf_fu, y1) + F.nll_loss(
                    log_p2_bidaf_fu, y2)
                nll_meter_bidaf_fu.update(loss_bidaf_fu.item(), batch_size)
                l_p1 += [log_p1_bidaf_fu.exp()]
                l_p2 += [log_p2_bidaf_fu.exp()]
                if (args.save_probabilities):
                    save_prob_bidaf_fu_start += [
                        log_p1_bidaf_fu.exp().detach().cpu().numpy()
                    ]
                    save_prob_bidaf_fu_end += [
                        log_p2_bidaf_fu.exp().detach().cpu().numpy()
                    ]

            p1, p2 = l_p1[0], l_p2[0]
            for i in range(1, nbr_model):
                p1 += l_p1[i]
                p2 += l_p2[i]
            p1 /= nbr_model
            p2 /= nbr_model

            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != 'test':
                # No labels for the test set, so NLL would be invalid
                if (args.load_path_qanet):
                    progress_bar.set_postfix(NLL=nll_meter_qanet.avg)
                elif (args.load_path_bidaf):
                    progress_bar.set_postfix(NLL=nll_meter_bidaf.avg)
                elif (args.load_path_bidaf_fusion):
                    progress_bar.set_postfix(NLL=nll_meter_bidaf_fu.avg)
                elif (args.load_path_qanet_old):
                    progress_bar.set_postfix(NLL=nll_meter_qanet_old.avg)
                elif (args.load_path_qanet_inde):
                    progress_bar.set_postfix(NLL=nll_meter_qanet_inde.avg)
                elif (args.load_path_qanet_s_e):
                    progress_bar.set_postfix(NLL=nll_meter_qanet_s_e.avg)
                else:
                    progress_bar.set_postfix(NLL=nll_meter_baseline.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    if (args.save_probabilities):
        if (args.load_path_baseline):
            with open(args.save_dir + "/probs_start", "wb") as fp:  #Pickling
                pickle.dump(save_prob_baseline_start, fp)
            with open(args.save_dir + "/probs_end", "wb") as fp:  #Pickling
                pickle.dump(save_prob_baseline_end, fp)

        if (args.load_path_bidaf):
            with open(args.save_dir + "/probs_start", "wb") as fp:  #Pickling
                pickle.dump(save_prob_bidaf_start, fp)
            with open(args.save_dir + "/probs_end", "wb") as fp:  #Pickling
                pickle.dump(save_prob_bidaf_end, fp)

        if (args.load_path_bidaf_fusion):
            with open(args.save_dir + "/probs_start", "wb") as fp:  #Pickling
                pickle.dump(save_prob_bidaf_fu_start, fp)
            with open(args.save_dir + "/probs_end", "wb") as fp:  #Pickling
                pickle.dump(save_prob_bidaf_fu_end, fp)

        if (args.load_path_qanet):
            with open(args.save_dir + "/probs_start", "wb") as fp:  #Pickling
                pickle.dump(save_prob_qanet_start, fp)
            with open(args.save_dir + "/probs_end", "wb") as fp:  #Pickling
                pickle.dump(save_prob_qanet_end, fp)

        if (args.load_path_qanet_old):
            with open(args.save_dir + "/probs_start", "wb") as fp:  #Pickling
                pickle.dump(save_prob_qanet_old_start, fp)
            with open(args.save_dir + "/probs_end", "wb") as fp:  #Pickling
                pickle.dump(save_prob_qanet_old_end, fp)

        if (args.load_path_qanet_inde):
            with open(args.save_dir + "/probs_start", "wb") as fp:  #Pickling
                pickle.dump(save_prob_qanet_inde_start, fp)
            with open(args.save_dir + "/probs_end", "wb") as fp:  #Pickling
                pickle.dump(save_prob_qanet_inde_end, fp)

        if (args.load_path_qanet_s_e):
            with open(args.save_dir + "/probs_start", "wb") as fp:  #Pickling
                pickle.dump(save_prob_qanet_s_e_start, fp)
            with open(args.save_dir + "/probs_end", "wb") as fp:  #Pickling
                pickle.dump(save_prob_qanet_s_e_end, fp)

    # Log results (except for test set, since it does not come with labels)
    if args.split != 'test':
        results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        if (args.load_path_qanet):
            meter_avg = nll_meter_qanet.avg
        elif (args.load_path_bidaf):
            meter_avg = nll_meter_bidaf.avg
        elif (args.load_path_bidaf_fusion):
            meter_avg = nll_meter_bidaf_fu.avg
        elif (args.load_path_qanet_inde):
            meter_avg = nll_meter_qanet_inde.avg
        elif (args.load_path_qanet_s_e):
            meter_avg = nll_meter_qanet_s_e.avg
        elif (args.load_path_qanet_old):
            meter_avg = nll_meter_qanet_old.avg
        else:
            meter_avg = nll_meter_baseline.avg
        results_list = [('NLL', meter_avg), ('F1', results['F1']),
                        ('EM', results['EM'])]
        if args.use_squad_v2:
            results_list.append(('AvNA', results['AvNA']))
        results = OrderedDict(results_list)

        # Log to console
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
        log.info(f'{args.split.title()} {results_str}')

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(tbx,
                       pred_dict=pred_dict,
                       eval_path=eval_file,
                       step=0,
                       split=args.split,
                       num_visuals=args.num_visuals)

    # Write submission file
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info(f'Writing submission file to {sub_path}...')
    with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=',')
        csv_writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])
예제 #19
0
def main(args):
    # Load TF-IDF from pickle
    scorer = TFIDF([])
    scorer.get_from_pickle()

    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get data loader
    log.info('Building dataset...')
    record_file = vars(args)['{}_record_file'.format(args.split)]
    dataset = SQuAD(record_file, args.use_squad_v2)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  char_vocab_size= 1376,
                  hidden_size=args.hidden_size)
    model = nn.DataParallel(model, gpu_ids)
    log.info('Loading checkpoint from {}...'.format(args.load_path))
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Evaluate
    log.info('Evaluating on {} split...'.format(args.split))
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}   # Predictions for submission
    eval_file = vars(args)['{}_eval_file'.format(args.split)]
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs,qc_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len, args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != 'test':
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict,
                                                      ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    if (args.use_tfidf):
        # Apply TF-IDF filtering to pred_dict
        tf_idf_threshold = 2
        tf_idf_common_threshold = 1
        for key, value in pred_dict.items():
            if value != "":
                tf_idf_score = scorer.normalized_additive_idf_ignore_common_words(
                    value, threshold_frequency=tf_idf_common_threshold)
                if tf_idf_score < tf_idf_threshold:
                    pred_dict[key] = ''
                    pass
                    # print ("pred_dict: {}, pruned".format(tf_idf_score))
                else:
                    pass
                    # print ("pred_dict: {}, kept".format(tf_idf_score))

    # Log results (except for test set, since it does not come with labels)
    if args.split != 'test':
        results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results_list = [('NLL', nll_meter.avg),
                        ('F1', results['F1']),
                        ('EM', results['EM'])]
        if args.use_squad_v2:
            results_list.append(('AvNA', results['AvNA']))
        results = OrderedDict(results_list)

        # Log to console
        results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                for k, v in results.items())
        log.info('{} {}'.format(args.split.title(), results_str))

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(tbx,
                       pred_dict=pred_dict,
                       eval_path=eval_file,
                       step=0,
                       split=args.split,
                       num_visuals=args.num_visuals)

    # Write submission file
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info('Writing submission file to {}...'.format(sub_path))
    with open(sub_path, 'w') as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=',')
        csv_writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])
def test_model(questions,
               context,
               use_squad_v2=True,
               model_path="../save/training-02/best.pth.tar"):
    # Set up logging
    #args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    #log = util.get_logger(args.save_dir, args.name)
    #log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    #args = get_test_args()
    device, gpu_ids = util.get_available_devices()
    batch_size = 64 * max(1, len(gpu_ids))

    # Get embeddings
    #print('Loading embeddings...')
    word_vectors = util.torch_from_json('../data/word_emb.json')

    # Get model
    #print('Building model...')
    model = BiDAF(word_vectors=word_vectors, hidden_size=100)
    model = nn.DataParallel(model, gpu_ids)
    #model_path = "../save/training-02/best.pth.tar"
    #print(f'Loading checkpoint from {args.load_path}...')
    model = util.load_model(model, model_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    #print('Building dataset...')
    #record_file = vars(args)[f'{args.split}_record_file']
    # my code start here
    # this is a simple approch when dealing with the user date
    # according to your approch of creating the interface you can change this code
    # and also you have to check the function "process_file" in the setup.py file
    processed_questions = []
    for index, question in enumerate(questions):
        processed_question = {
            "question": question,
            "id": index,
            "answers": [{
                "answer_start": 0,
                "text": "never mind"
            }]
        }
        processed_questions.append(processed_question)
    source = {"paragraphs": [{"qas": processed_questions, "context": context}]}
    word_counter, char_counter = Counter(), Counter()
    with open("../data/word2idx.json", "r") as f1:
        word2idx_dict = json.load(f1)
    with open("../data/char2idx.json", "r") as f2:
        char2idx_dict = json.load(f2)
    my_test_examples, my_test_eval = process_file(source, "my_test",
                                                  word_counter, char_counter)
    npz = build_features(my_test_examples,
                         "my_test",
                         word2idx_dict,
                         char2idx_dict,
                         is_test=True)
    #my code end here
    dataset = SQuAD(npz, use_squad_v2)
    data_loader = data.DataLoader(dataset,
                                  batch_size=batch_size,
                                  shuffle=False,
                                  num_workers=4,
                                  collate_fn=collate_fn)

    # Evaluate
    #print(f'Evaluating on {args.split} split...')
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    #eval_file = vars(args)[f'{args.split}_eval_file']
    gold_dict = my_test_eval
    #print("gold_dict", gold_dict)
    #print("data_loader", data_loader)
    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)
            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, 15, use_squad_v2)
            print("starts ", starts, " ends ", ends)

            # Log info
            progress_bar.update(batch_size)
            #if args.split != 'test':
            # No labels for the test set, so NLL would be invalid
            #progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    #print("my evaluation ....")

    #for el in pred_dict:
    #print(el, pred_dict[el])

    #for el in sub_dict:
    #print(el, sub_dict[el])
    return pred_dict
예제 #21
0
def evaluate(model,
             data_loader,
             device,
             eval_file,
             max_len,
             use_squad_v2,
             model_type,
             quick_eval=False):
    nll_meter = util.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)

    if not quick_eval:
        progress_len = len(data_loader.dataset)
    else:
        progress_len = len(data_loader)


    with torch.no_grad(), \
            tqdm(total=progress_len) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            if model_type == 'BiDAF' or model_type == 'Transformer':
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)

                log_p1, log_p2 = model(cc_idxs, qc_idxs, cw_idxs, qw_idxs)

            # Forward
            elif model_type == 'BiDAFbase':
                log_p1, log_p2 = model(cw_idxs, qw_idxs)

            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict, ids.tolist(),
                                           starts.tolist(), ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg), ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict
예제 #22
0
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2, model_name, gpu_ids):
    nll_meter = util.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, cpos_idxs, cner_idxs, cw_ems, cw_tfs, qw_idxs, qc_idxs, qpos_idxs, qner_idxs, qw_ems, qw_tfs, y1, y2, ids in data_loader: # NEW
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            if 'baseline' in model_name:
                log_p1, log_p2 = model(cw_idxs, qw_idxs)

            elif model_name == 'BiDAF_char':
                # Additional setup for forward
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)

            elif (model_name == 'BiDAF_tag') or (model_name == 'BiDAF_tag_unfrozen') or (model_name == 'BiDAF_tag_loss') or (model_name == 'BiDAF_tag_unfrozen_loss'):
                # Additional setup for forward
                cc_idxs = cc_idxs.to(device)
                cpos_idxs = cpos_idxs.to(device)
                cner_idxs = cner_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                qpos_idxs = qpos_idxs.to(device)
                qner_idxs = qner_idxs.to(device)

                log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs, cpos_idxs, qpos_idxs, cner_idxs, qner_idxs)

            elif (model_name == 'BiDAF_tag_ext') or (model_name == 'BiDAF_tag_ext_unfrozen'):
                # Additional setup for forward
                cc_idxs = cc_idxs.to(device)
                cpos_idxs = cpos_idxs.to(device)
                cner_idxs = cner_idxs.to(device)
                cw_ems = cw_ems.to(device)
                cw_tfs = cw_tfs.to(device)
                qc_idxs = qc_idxs.to(device)
                qpos_idxs = qpos_idxs.to(device)
                qner_idxs = qner_idxs.to(device)
                qw_ems = qw_ems.to(device)
                qw_tfs = qw_tfs.to(device)

                log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs, cpos_idxs, qpos_idxs, cner_idxs, qner_idxs, cw_ems, qw_ems, cw_tfs, qw_tfs)

            elif args.name == 'coattn':
                max_c_len = cw_idxs.size(1)
                max_q_len = qw_idxs.size(1)

                c_len = []
                q_len = []

                for i in range(cw_idxs.size(0)):
                    if len((cw_idxs[i] == 0).nonzero()) != 0:
                        c_len_i = (cw_idxs[i] == 0).nonzero()[0].item()
                    else:
                        c_len_i = cw_idxs.size(1)

                    if len((qw_idxs[i] == 0).nonzero()) != 0:
                        q_len_i = (qw_idxs[i] == 0).nonzero()[0].item()
                    else:
                        q_len_i = qw_idxs.size(1)

                    c_len.append(int(c_len_i))
                    q_len.append(int(q_len_i))

                c_len = torch.Tensor(c_len).int()
                q_len = torch.Tensor(q_len).int()

                num_examples = int(cw_idxs.size(0) / len(gpu_ids))

                log_p1, log_p2 = model(max_c_len, max_q_len, cw_idxs, qw_idxs, c_len, q_len, num_examples, True, False)

            else: # default: run baseline
                log_p1, log_p2 = model(cw_idxs, qw_idxs)

            y1, y2 = y1.to(device), y2.to(device)
            #if model_name == 'coattn':
            #    loss = nn.CrossEntropyLoss()(log_p1, y1) + nn.CrossEntropyLoss()(log_p2, y2)
            #else:
            #    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            #if model_name != 'coattn':
            #    p1, p2 = log_p1.exp(), log_p2.exp()
            #else:
            #    p1, p2 = log_p1, log_p2
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict,
                                           ids.tolist(),
                                           starts.tolist(),
                                           ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg),
                    ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict
예제 #23
0
파일: train.py 프로젝트: tingkanp/squad
def evaluate(model, data_loader, device, model_name, eval_file, max_len,
             use_squad_v2):
    nll_meter = util.AverageMeter()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, c_pos_idxs, c_ner_idxs,c_iob_idxs, \
             qw_idxs, qc_idxs, q_pos_idxs, q_ner_idxs, q_iob_idxs,\
             y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            if model_name == 'BiDAF'.lower():
                log_p1, log_p2 = model(cw_idxs, qw_idxs)

            elif model_name == "BiDAF_Char".lower():
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)

                log_p1, log_p2 = model(cc_idxs, qc_idxs, cw_idxs, qw_idxs)

            elif model_name == "BiDAF_CharTag".lower():
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                c_pos_idxs = c_pos_idxs.to(device)
                c_ner_idxs = c_ner_idxs.to(device)
                c_iob_idxs = c_iob_idxs.to(device)
                q_pos_idxs = q_pos_idxs.to(device)
                q_ner_idxs = q_ner_idxs.to(device)
                q_iob_idxs = q_iob_idxs.to(device)

                log_p1, log_p2 = model(cw_idxs, cc_idxs, c_pos_idxs,
                                       c_ner_idxs, c_iob_idxs, qw_idxs,
                                       qc_idxs, q_pos_idxs, q_ner_idxs,
                                       q_iob_idxs)
            else:
                raise NameError('No model named ' + model_name)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(NLL=nll_meter.avg)

            preds, _ = util.convert_tokens(gold_dict, ids.tolist(),
                                           starts.tolist(), ends.tolist(),
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('NLL', nll_meter.avg), ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict
예제 #24
0
def main(args):
    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    ch_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  ch_vectors=ch_vectors,
                  hidden_size=args.hidden_size)
    model = nn.DataParallel(model, gpu_ids)
    log.info(f'Loading checkpoint from {args.load_path}...')
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    log.info('Building dataset...')
    record_file = vars(args)[f'{args.split}_record_file']
    dataset = SQuAD(record_file, args.use_squad_v2)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Evaluate
    log.info(f'Evaluating on {args.split} split...')
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    eval_file = vars(args)[f'{args.split}_eval_file']
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            cc_idxs = cc_idxs.to(device)
            qc_idxs = qc_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != 'test':
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if args.split != 'test':
        results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results_list = [('NLL', nll_meter.avg), ('F1', results['F1']),
                        ('EM', results['EM'])]
        if args.use_squad_v2:
            results_list.append(('AvNA', results['AvNA']))
        results = OrderedDict(results_list)

        # Log to console
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
        log.info(f'{args.split.title()} {results_str}')

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(tbx,
                       pred_dict=pred_dict,
                       eval_path=eval_file,
                       step=0,
                       split=args.split,
                       num_visuals=args.num_visuals)

    # Write submission file
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info(f'Writing submission file to {sub_path}...')
    with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=',')
        csv_writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])
예제 #25
0
def evaluate(model, data_loader, device, eval_file, max_len, use_squad_v2, model_name="", a1=0.5, a2=0.5):
    meter = util.AverageMeter()

    # setup losses
    bceLoss = nn.BCELoss()
    ceLoss = nn.CrossEntropyLoss()

    model.eval()
    pred_dict = {}
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(data_loader.dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            cc_idxs = cc_idxs.to(device)
            qc_idxs = qc_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            y1, y2 = y1.to(device), y2.to(device)
            if model_name == 'sketchy':
                yi = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                loss = bceLoss(yi, torch.where(
                    y1 == 0, 0, 1).type(torch.FloatTensor))
                meter.update(loss.item(), batch_size)
                starts, ends = [[0 if yi[i] == 0 else 1 for i, y in enumerate(
                    y1)], [0 if yi[i] == 0 else 2 for i, y in enumerate(y2)]]
            elif model_name == 'intensive':
                yi, log_p1, log_p2 = model(
                    cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                loss = a1 * bceLoss(yi, torch.where(y1 == 0, 0, 1).type(
                    torch.FloatTensor)) + a2 * (F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2))
                #loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                meter.update(loss.item(), batch_size)
                # Get F1 and EM scores
                p1 = log_p1.exp()
                p2 = log_p2.exp()
                # print(p1[0,:])
                # print(p1)
                # print(p2[0,:])
                # print(p2)
                starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)
                starts, ends = starts.tolist(), ends.tolist()
            elif model_name == 'retro':
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                meter.update(loss.item(), batch_size)
                p1, p2 = log_p1.exp(), log_p2.exp()
                starts, ends = util.discretize(p1, p2, max_len, use_squad_v2)
                starts, ends = starts.tolist(), ends.tolist()
            else:
                raise ValueError(
                    'invalid --model_name, sketchy or intensive required')

            print("starts: ", starts, "Truth", y1)
            print("ends: ", ends, "Truth: ", y2)

            # Log info
            progress_bar.update(batch_size)
            progress_bar.set_postfix(loss_calc=meter.avg)

            preds, _ = util.convert_tokens(gold_dict,
                                           ids.tolist(),
                                           starts,
                                           ends,
                                           use_squad_v2)
            pred_dict.update(preds)

    model.train()

    results = util.eval_dicts(gold_dict, pred_dict, use_squad_v2)
    results_list = [('Loss', meter.avg),
                    ('F1', results['F1']),
                    ('EM', results['EM'])]
    if use_squad_v2:
        results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)

    return results, pred_dict
예제 #26
0
def main(args, actions=None):
    """"
    actions is a tuple (action, number of actions to be taken)

    action can be either: "substitute", "delete" or "add".
    number of actions to be taken: the number of words to apply the "substitute", "delete" or "add" action.

    """

    # check that actions parameters received
    #print("actions: ",actions)

    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    if args.model == 'bidaf':
        model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size)
    elif args.model == 'bidafextra':
        model = BiDAFExtra(word_vectors=word_vectors, args=args)
    elif args.model == 'fusionnet':
        model = FusionNet(word_vectors=word_vectors, args=args)

    model = nn.DataParallel(model, gpu_ids)
    log.info(f'Loading checkpoint from {args.load_path}...')
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    log.info('Building dataset...')
    record_file = vars(args)[f'{args.split}_record_file']
    dataset = SQuAD(record_file, args)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)
    # print("*"*80)
    #print(len(dataset.question_idxs))

    #for question_idx in dataset.question_idxs:
    #    print(question_idx)
    #    print("*" * 80)

    #print(self.question_idxs[question_idx])
    #self.question_idxs[idx]
    # print("data_loader: ",data_loader)
    # Evaluate
    log.info(f'Evaluating on {args.split} split...')
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    eval_file = vars(args)[f'{args.split}_eval_file']
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)

    count_questions_type = defaultdict(lambda: 0)

    audit_trail_from_question_type = defaultdict(lambda: [])
    list_of_interrogative_pronouns = [
        "what", "whose", "why", "which", "where", "when", "how", "who", "whom"
    ]

    for index in range(1, len(gold_dict)):
        # transform the question in lower case to simplify the analysis, thus losing the benefit of the capital letters
        # possibly indicating the position of the interrogative pronoun in the sentence.
        question_lower_case = gold_dict[str(index)]['question'].lower()

        list_question_lower_case_with_punctuation = question_lower_case.translate(
            {ord(i): " "
             for i in "'"}).split()

        #
        question_lower_case = []
        for item in list_question_lower_case_with_punctuation:
            question_lower_case.append(
                item.translate({ord(i): ""
                                for i in ",.<>!@£$%^&*()_-+=?"}))

        # defining a variable for the first word
        first_word_question_lower_case = question_lower_case[0]

        # defining variable for the second word
        second_word_question_lower_case = question_lower_case[1]

        # defining variable for the first and second word
        combined_first_and_second_words = first_word_question_lower_case + " " + second_word_question_lower_case

        # Analyzing the sentence
        if first_word_question_lower_case in list_of_interrogative_pronouns:
            count_questions_type[first_word_question_lower_case] += 1
            audit_trail_from_question_type[
                first_word_question_lower_case].append(str(index))
        # composed question starting by in
        elif first_word_question_lower_case == "in":
            if second_word_question_lower_case in list_of_interrogative_pronouns and second_word_question_lower_case != "whose":
                count_questions_type[combined_first_and_second_words] += 1
                audit_trail_from_question_type[
                    combined_first_and_second_words].append(str(index))
            else:
                pronoun = find_first_interrogative_pronoun(
                    list_of_interrogative_pronouns, question_lower_case)
                count_questions_type[pronoun] += 1
                audit_trail_from_question_type[pronoun].append(str(index))

        # composed question starting by by
        elif first_word_question_lower_case == "by":
            if second_word_question_lower_case in list_of_interrogative_pronouns \
                    and second_word_question_lower_case != "whom" \
                    and second_word_question_lower_case != "which" \
                    and second_word_question_lower_case != "when" \
                    and second_word_question_lower_case != "how":
                count_questions_type[combined_first_and_second_words] += 1
                audit_trail_from_question_type[
                    combined_first_and_second_words].append(str(index))
            else:
                pronoun = find_first_interrogative_pronoun(
                    list_of_interrogative_pronouns, question_lower_case)
                count_questions_type[pronoun] += 1
                audit_trail_from_question_type[pronoun].append(str(index))

        else:
            pronoun = find_first_interrogative_pronoun(
                list_of_interrogative_pronouns, question_lower_case)
            count_questions_type[pronoun] += 1
            audit_trail_from_question_type[pronoun].append(str(index))

    reverse_dict_by_value = OrderedDict(
        sorted(count_questions_type.items(), key=lambda x: x[1]))
    total_questions = sum(count_questions_type.values())

    with torch.no_grad(), \
         tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, cw_pos, cw_ner, cw_freq, cqw_extra, y1, y2, ids in data_loader:
            # Setup for forward

            # **********************************************************
            #
            # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
            #
            # Where we make the modif:

            if actions[0] == "substitute":
                # substitute to random token in each question of the batch (substitution is made within the same sentence:
                batch_size = cw_idxs.size()[0]
                number_of_actions = actions[1]
                for _ in range(number_of_actions):
                    length_index_batch = cw_idxs.size()[1]
                    for i in range(batch_size):
                        tensor_with_zero_value = ((
                            cw_idxs[i] == 0).nonzero()).squeeze()

                        try:
                            first_zero_value = torch.min(
                                tensor_with_zero_value)
                        except:
                            first_zero_value = length_index_batch

                        if first_zero_value > 2:
                            select_item_idx_1 = random.randint(
                                0, first_zero_value - 1)
                            select_item_idx_2 = random.randint(
                                0, first_zero_value - 1)
                            save_value_1 = copy.deepcopy(
                                cw_idxs[i, select_item_idx_1])
                            cw_idxs[i, select_item_idx_1] = cw_idxs[
                                i, select_item_idx_2]
                            cw_idxs[i, select_item_idx_2] = save_value_1

            elif actions[0] == "delete":
                # substitute to random token in each question of the batch (substitution is made within the same sentence:
                batch_size = cw_idxs.size()[0]
                number_of_actions = actions[1]
                for _ in range(number_of_actions):
                    length_index_batch = cw_idxs.size()[1]
                    for i in range(batch_size):
                        tensor_with_zero_value = ((
                            cw_idxs[i] == 0).nonzero()).squeeze()

                        try:
                            first_zero_value = torch.min(
                                tensor_with_zero_value)
                        except:
                            first_zero_value = length_index_batch

                        if first_zero_value > 2:
                            print("debug:", i)
                            print("> size before amendment of cw_idxs",
                                  cw_idxs.size())
                            print(cw_idxs[i])
                            select_item_idx_1 = random.randint(
                                0, first_zero_value - 1)
                            #print("section 1 :", cw_idxs[i, 0:select_item_idx_1)
                            #print("remove:", cw_idxs[i,0:select_item_idx_1)
                            #print("section 2 :", cw_idxs[i, 0:select_item_idx_1)
                            cw_idxs[i, :] = torch.cat(
                                (cw_idxs[i, 0:select_item_idx_1],
                                 cw_idxs[i, select_item_idx_1 + 1:],
                                 torch.tensor([0])), -1)
                            print("> size before amendment of cw_idxs",
                                  cw_idxs.size())
                            print(cw_idxs[i])

            elif actions[0] == "add":
                batch_size = cw_idxs.size()[0]
                number_of_actions = actions[1]
                for _ in range(number_of_actions):
                    length_index_batch = cw_idxs.size()[1]
                    for i in range(batch_size):
                        tensor_with_zero_value = ((
                            cw_idxs[i] == 0).nonzero()).squeeze()

                        try:
                            first_zero_value = torch.min(
                                tensor_with_zero_value)
                        except:
                            first_zero_value = length_index_batch

                        if first_zero_value > 2:
                            select_item_idx_1 = random.randint(
                                0, first_zero_value - 1)
                            cw_idxs[i, select_item_idx_1] = random.randint(
                                1, 50000)

            elif actions[0] == "add2":
                # substitute to random token in each question of the batch (substitution is made within the same sentence:
                batch_size = cw_idxs.size()[0]
                number_of_actions = actions[1]
                for _ in range(number_of_actions):
                    length_index_batch = cw_idxs.size()[1]
                    for i in range(batch_size):
                        tensor_with_zero_value = ((
                            cw_idxs[i] == 0).nonzero()).squeeze()

                        try:
                            first_zero_value = torch.min(
                                tensor_with_zero_value)
                        except:
                            first_zero_value = length_index_batch

                        if first_zero_value > 2:
                            select_item_idx_1 = random.randint(
                                0, first_zero_value - 1)
                            select_item_idx_2 = random.randint(
                                0, first_zero_value - 1)
                            cw_idxs[i, select_item_idx_1] = random.randint(
                                1, 50000)
                            cw_idxs[i, select_item_idx_2] = random.randint(
                                1, 50000)

            else:
                print("Incorrect command: exiting")
                exit()
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            if args.model == 'bidaf':
                log_p1, log_p2 = model(cw_idxs, qw_idxs)
            else:
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cw_pos, cw_ner,
                                       cw_freq, cqw_extra)

            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != 'test':
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if args.split != 'test':
        results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)

        # Printing information for questions without interrogative pronouns
        """"
        print("len(gold_dict): ", len(gold_dict))
        print("len(pred_dict): ", len(pred_dict))
        print("Is gold_dict.keys() identical to pred_dict.keys(): ", gold_dict.keys()==pred_dict.keys())
        if gold_dict.keys()!=pred_dict.keys():
            for key in gold_dict.keys():
                if key not in pred_dict.keys():
                    print("key ", key, " missing in pred_dict.keys(")
        """
        results_list = [('NLL', nll_meter.avg), ('F1', results['F1']),
                        ('EM', results['EM'])]
        if args.use_squad_v2:
            results_list.append(('AvNA', results['AvNA']))
        results = OrderedDict(results_list)

        # Computing the F1 score for each type of question
        print("for ", actions, ": ", results['F1'])

        # create a list of the types of questions by extracting the keys from the dict audit_trail_from_question_type

        # Log to console
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
        log.info(f'{args.split.title()} {results_str}')

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(tbx,
                       pred_dict=pred_dict,
                       eval_path=eval_file,
                       step=0,
                       split=args.split,
                       num_visuals=args.num_visuals)

    # Write submission file
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info(f'Writing submission file to {sub_path}...')
    with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=',')
        csv_writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])

    return results['F1']
예제 #27
0
def main(args):
    args.save_dir = util.get_save_dir(args.save_dir,
                                      "exp1_training",
                                      training=False)
    log = get_logger(args.logging_dir, "exp1_training")
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors, hidden_size=args.hidden_size)
    model = nn.DataParallel(model, gpu_ids)

    log.info(f'Loading checkpoint from {args.load_path}...')
    model = util.load_model(model, c.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    log.info('Building dataset...')
    dataset = SQuAD(args.test_record_file, True)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Evaluate
    log.info(f'Evaluating on {args.datasplit} split...')
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    with open(args.test_eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, c.max_ans_len, True)

            # Log info
            progress_bar.update(batch_size)

            # Not using the unlabeled test set
            #            if args.split != 'test':
            #                # No labels for the test set, so NLL would be invalid
            #                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(), True)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    results = util.eval_dicts(gold_dict, pred_dict, True)
    results_list = [('NLL', nll_meter.avg), ('F1', results['F1']),
                    ('EM', results['EM'])]
    results_list.append(('AvNA', results['AvNA']))
    results = OrderedDict(results_list)
    # Log to console
    results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
    log.info(f'{args.datasplit} {results_str}')
    # Log to TensorBoard
    tbx = SummaryWriter(c.save_dir)
    util.visualize(tbx,
                   pred_dict=pred_dict,
                   eval_path=args.test_eval_file,
                   step=0,
                   split=args.datasplit,
                   num_visuals=args.num_visuals)
예제 #28
0
CM = 0.01
RA = 10
Em = -65e-3
pulse_dur = 100e-3
pulse_amp = 0.1e-9
pulse_delay = 50e-3

# Create a neutral directory for the neuron
neuron = moose.Neutral('/neuron')

# Create the soma under the neuron directory
soma = u.createCompartment(neuron, 'swagginSoma', soma_l, soma_rad, RM, CM, RA,
                           Em)

# Create the dendritic compartments under the neuron directory
dend_branch = u.discretize(neuron, numDends, dend_l, dend_rad, RM, CM, RA, Em)

# Connect the soma to the first element of the dendritic branch
moose.connect(soma, 'axialOut', dend_branch[0], 'handleAxial')

# Create a pulse and connect it to the soma
soma_pulse = u.createPulse(soma, 'rollingWave', pulse_dur, pulse_amp,
                           pulse_delay)

# Create data tables to store the voltage for the soma and each compartment
# making up the dendritic branch
data = moose.Neutral('/data')
soma_Vm = u.createDataTables(soma, data)
dend_tables = []
for dend in dend_branch:
    dend_tables.append(u.createDataTables(dend, data))
예제 #29
0
def main(args):
    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    models = {}

    if args.use_ensemble:

        total_models = 0
        for model_name in ['bidaf', 'bidafextra', 'fusionnet']:

            models_list = []

            for model_file in glob.glob(
                    f'{args.load_path}/{model_name}-*/{args.ensemble_models}'):

                # Get model
                log.info('Building model...')
                if model_name == 'bidaf':
                    model = BiDAF(word_vectors=word_vectors,
                                  hidden_size=args.hidden_size)
                elif model_name == 'bidafextra':
                    model = BiDAFExtra(word_vectors=word_vectors, args=args)
                elif model_name == 'fusionnet':
                    model = FusionNet(word_vectors=word_vectors, args=args)

                model = nn.DataParallel(model, gpu_ids)
                log.info(f'Loading checkpoint from {model_file}...')
                model = util.load_model(model,
                                        model_file,
                                        gpu_ids,
                                        return_step=False)

                # Load each model on CPU (have plenty of RAM ...)
                model = model.cpu()
                model.eval()

                models_list.append(model)

            models[model_name] = models_list

            total_models += len(models_list)

        log.info(f'Using an ensemble of {total_models} models')

    else:

        device, gpu_ids = util.get_available_devices()

        # Get model
        log.info('Building model...')
        if args.model == 'bidaf':
            model = BiDAF(word_vectors=word_vectors,
                          hidden_size=args.hidden_size)
        elif args.model == 'bidafextra':
            model = BiDAFExtra(word_vectors=word_vectors, args=args)
        elif args.model == 'fusionnet':
            model = FusionNet(word_vectors=word_vectors, args=args)

        model = nn.DataParallel(model, gpu_ids)
        log.info(f'Loading checkpoint from {args.load_path}...')
        model = util.load_model(model,
                                args.load_path,
                                gpu_ids,
                                return_step=False)
        model = model.to(device)
        model.eval()

        models[args.model] = [model]

    # Get data loader
    log.info('Building dataset...')
    record_file = vars(args)[f'{args.split}_record_file']
    dataset = SQuAD(record_file, args)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Evaluate
    log.info(f'Evaluating on {args.split} split...')
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    eval_file = vars(args)[f'{args.split}_eval_file']
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, cw_pos, cw_ner, cw_freq, cqw_extra, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            p1s = []
            p2s = []

            for model_name in models:
                for model in models[model_name]:
                    # Move model to GPU to evaluate
                    model = model.to(device)

                    # Forward
                    if model_name == 'bidaf':
                        log_p1, log_p2 = model.to(device)(cw_idxs, qw_idxs)
                    else:
                        log_p1, log_p2 = model.to(device)(cw_idxs, qw_idxs,
                                                          cw_pos, cw_ner,
                                                          cw_freq, cqw_extra)

                    log_p1, log_p2 = log_p1.cpu(), log_p2.cpu()

                    if not args.use_ensemble:
                        y1, y2 = y1.to(device), y2.to(device)
                        log_p1, log_p2 = log_p1.to(device), log_p2.to(device)

                        loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                        nll_meter.update(loss.item(), batch_size)

                    # Move model back to CPU to release GPU memory
                    model = model.cpu()

                    # Get F1 and EM scores
                    p1, p2 = log_p1.exp().unsqueeze(
                        -1).cpu(), log_p2.exp().unsqueeze(-1).cpu()
                    p1s.append(p1), p2s.append(p2)

            best_ps = torch.max(
                torch.cat([
                    torch.cat(p1s, -1).unsqueeze(-1),
                    torch.cat(p2s, -1).unsqueeze(-1)
                ], -1), -2)[0]

            p1, p2 = best_ps[:, :, 0], best_ps[:, :, 1]
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != 'test':
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if args.split != 'test':
        results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results_list = [('NLL', nll_meter.avg), ('F1', results['F1']),
                        ('EM', results['EM'])]
        if args.use_squad_v2:
            results_list.append(('AvNA', results['AvNA']))
        results = OrderedDict(results_list)

        # Log to console
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
        log.info(f'{args.split.title()} {results_str}')

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(tbx,
                       pred_dict=pred_dict,
                       eval_path=eval_file,
                       step=0,
                       split=args.split,
                       num_visuals=args.num_visuals)

    # Write submission file
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info(f'Writing submission file to {sub_path}...')
    with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=',')
        csv_writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])
예제 #30
0
def main(args):
    # Set up
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    device, gpu_ids = util.get_available_devices()

    # Get embeddings
    log.info('Loading embeddings...')
    with open('data/meta.msgpack', 'rb') as f:
        meta = msgpack.load(f, encoding='utf8')
    embedding = torch.Tensor(meta['embedding'])
    opt = vars(args)
    opt['pretrained_words'] = True
    opt['vocab_size'] = embedding.size(0)
    opt['embedding_dim'] = embedding.size(1)
    opt['pos_size'] = len(meta['vocab_tag'])
    opt['ner_size'] = len(meta['vocab_ent'])

    # Get model
    log.info('Building model...')
    model = DRQA(opt, embedding=embedding)
    model = nn.DataParallel(model, gpu_ids)
    log.info(f'Loading checkpoint from {args.load_path}...')
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()
    BatchGen.pos_size = opt['pos_size']
    BatchGen.ner_size = opt['ner_size']

    # Get data loader
    log.info('Building dataset...')
    with open(opt['data_file'], 'rb') as f:
        data = msgpack.load(f, encoding='utf8')
    test = data['test']

    # Evaluate
    log.info(f'Evaluating on test split...')
    batches = BatchGen(test,
                       batch_size=args.batch_size,
                       evaluation=True,
                       is_test=True,
                       gpu=args.cuda)

    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    eval_file = './data/test_eval.json'
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(test)) as progress_bar:
        model.eval()
        for i, batch in enumerate(batches):
            # Setup for forward
            inputs = [e.to(device) for e in batch[:7]]
            ids = batch[-1]

            # Forward
            with torch.no_grad():
                score_s, score_e = model(*inputs)

            p1, p2 = score_s, score_e
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            # Log info
            progress_bar.update(args.batch_size)
            idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if args.split != 'test':
        results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results_list = [('NLL', nll_meter.avg), ('F1', results['F1']),
                        ('EM', results['EM'])]
        if args.use_squad_v2:
            results_list.append(('AvNA', results['AvNA']))
        results = OrderedDict(results_list)

        # Log to console
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
        log.info(f'{args.split.title()} {results_str}')

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(tbx,
                       pred_dict=pred_dict,
                       eval_path=eval_file,
                       step=0,
                       split=args.split,
                       num_visuals=args.num_visuals)

    # Write submission file
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info(f'Writing submission file to {sub_path}...')
    with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=',')
        csv_writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])