Example #1
def set_sim(answer, prediction):
    """Return the best token-level F1 between a prediction and any gold answer."""
    ground_truths = []
    for a in answer:
        # Each gold answer is assumed to be a sequence of tokens; join them
        # into a single whitespace-delimited string for scoring.
        ground_truths.append(" ".join([w for w in a]))

    res = utils.metric_max_over_ground_truths(utils.f1_score, prediction,
                                              ground_truths)
    return res
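
set_sim relies on two helpers from the project's utils module. As a rough, self-contained illustration of what they compute (names with the _sketch suffix are hypothetical; the real utils.f1_score presumably also applies the standard SQuAD normalization that strips punctuation and articles), the sketch below reimplements a token-level F1 and the max-over-ground-truths reduction:

from collections import Counter

def f1_score_sketch(prediction, ground_truth):
    # Simplified token-level F1 in the spirit of the SQuAD evaluation
    # (no punctuation or article stripping).
    pred_tokens = prediction.lower().split()
    gt_tokens = ground_truth.lower().split()
    common = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)

def metric_max_over_ground_truths_sketch(metric_fn, prediction, ground_truths):
    # Score the prediction against every gold answer and keep the best match.
    return max(metric_fn(prediction, gt) for gt in ground_truths)

# The prediction matches the first gold answer exactly, so the result is 1.0.
print(metric_max_over_ground_truths_sketch(
    f1_score_sketch, "barack obama", ["Barack Obama", "Obama"]))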
Example #2
def validate_with_doc(args, data_loader, model, global_stats,
                      exs_with_doc, docs_by_question, mode):
    '''Run one full unofficial validation with docs.
    Unofficial = doesn't use SQuAD script.
    '''
    eval_time = utils.Timer()
    f1 = utils.AverageMeter()
    exact_match = utils.AverageMeter()

    logger.info('validate_with_doc')
    
    # Initialize counters
    examples = 0

    display_num = 10  # top-n predictions/documents to inspect (as in example #3)
    # num_docs is assumed to come from module scope (vector.num_docs in example #3)
    aa = [0.0 for i in range(num_docs)]  # the rank-j document contains the gold answer
    bb = [0.0 for i in range(num_docs)]  # (question, rank-j) pairs seen
    
    for idx, ex_with_doc in enumerate(data_loader):
        ex = ex_with_doc[0]
        batch_size, ex_id = ex[0].size(0), ex[-1]
        
        # ---------------------------------------------------------------------
        # Document Selector
        # ---------------------------------------------------------------------
        
        '''
        ex_with_doc has one entry per retrieved document; each entry
        ex = ex_with_doc[idx_doc] is a tuple of:
        [tensor]  x1 = document word indices            [batch * len_d]
        [tensor]  x1_f = document word feature indices  [batch * len_d * nfeat]
        [tensor]  x1_mask = document padding mask       [batch * len_d]
        [tensor]  x2 = question word indices            [batch * len_q]
        [tensor]  x2_mask = question padding mask       [batch * len_q]
        [list]    indices                               [batch]
        '''
        
        scores_doc_num = model.predict_with_doc(ex_with_doc)
        scores = [{} for i in range(batch_size)]
        
        # ---------------------------------------------------------------------
        # Document Reader
        # ---------------------------------------------------------------------
        for idx_doc in range(0, num_docs):
            ex = ex_with_doc[idx_doc]
            pred_s, pred_e, pred_score = model.predict(
                    ex, top_n=display_num)
            
            for i in range(batch_size):
                idx_doc_i = idx_doc % len(docs_by_question[ex_id[i]])
                doc_text = docs_by_question[ex_id[i]][idx_doc_i]['document']
                
                # Read the top `display_num` predicted spans; spans that run
                # past the end of a short document raise an IndexError below
                for k in range(display_num):
                    
                    try:
                        prediction = [doc_text[j] for j in range(pred_s[i][k], 
                                      pred_e[i][k]+1)]
                        prediction = ' '.join(prediction).lower()
                        
                        # update prediction scores
                        if (prediction not in scores[i]): 
                            scores[i][prediction] = 0
                        scores[i][prediction] += (pred_score[i][k] * 
                              scores_doc_num[i][idx_doc])
                    
                    except IndexError:
                        pass
        
        # Rank the documents by selector score and check whether the gold
        # answer actually appears in each of the top `display_num` documents
        for i in range(batch_size):
            _, indices = scores_doc_num[i].sort(0, descending=True)
            
            for j in range(0, display_num):
                idx_doc = indices[j]
                idx_doc_i = idx_doc % len(docs_by_question[ex_id[i]])
                
                doc_text = docs_by_question[ex_id[i]][idx_doc_i]['document']
                ex_answer = exs_with_doc[ex_id[i]]['answer']
                
                # Does the gold answer appear anywhere in this document?
                if has_answer(args, ex_answer, doc_text)[0]:
                    aa[j] += 1

                bb[j] += 1

        # Update performance metrics
        for i in range(batch_size):
            
            best_score = 0
            prediction = ''
            for key in scores[i]:
                if (scores[i][key] > best_score):
                    best_score = scores[i][key]
                    prediction = key
            
            ground_truths = []
            ex_answer = exs_with_doc[ex_id[i]]['answer']
            
            # Ground-truth answers: CuratedTrec supplies regex patterns, which
            # are used as-is; other datasets provide lists of answer tokens
            if args.dataset == 'CuratedTrec':
                ground_truths = ex_answer
            else:
                for a in ex_answer:
                    ground_truths.append(' '.join([w for w in a]))
                    
            exact_match.update(
                    utils.metric_max_over_ground_truths(
                            utils.exact_match_score, prediction, ground_truths))
            
            f1.update(
                    utils.metric_max_over_ground_truths(
                            utils.f1_score, prediction, ground_truths))
            
        examples += batch_size
        
        # During training, only validate on roughly the first 1000 examples
        if mode == 'train' and examples >= 1000:
            break
    
    # Cumulative hit counts: after this pass, aa[j] / bb[j] is the fraction of
    # the top-(j+1) retrieved documents (pooled over questions) containing the answer
    try:
        for j in range(display_num):
            if j > 0:
                aa[j] += aa[j - 1]
                bb[j] += bb[j - 1]
    except IndexError:
        pass
    
    txt = '{} valid unofficial with doc: Epoch = {} | EM = {:.2f} | '
    txt += 'F1 = {:.2f} | examples = {} | valid time = {:.2f} (s)'
    logger.info(txt.format(
            mode, global_stats['epoch'], exact_match.avg * 100, 
            f1.avg * 100, examples, eval_time.time()))

    return {'exact_match': exact_match.avg * 100, 'f1': f1.avg * 100}
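
The core of this routine is how each candidate answer string accumulates evidence across documents: every predicted span's reader score is weighted by the selector's score for the document it came from, and the string with the highest total is kept. The following self-contained sketch replays that update rule on toy scores (the numbers and document lists are made up, not model outputs):

# Toy per-document predictions: (answer string, reader span score)
doc_predictions = [
    [("barack obama", 0.9), ("obama", 0.5)],        # spans read from document 0
    [("barack obama", 0.4), ("george bush", 0.3)],  # spans read from document 1
]
doc_scores = [0.7, 0.2]  # document-selector scores for documents 0 and 1

scores = {}
for idx_doc, spans in enumerate(doc_predictions):
    for prediction, span_score in spans:
        # Same update as above: answer score += reader score * selector score
        scores[prediction] = scores.get(prediction, 0.0) + span_score * doc_scores[idx_doc]

best = max(scores, key=scores.get)
print(best, scores[best])  # barack obama 0.71  (0.9*0.7 + 0.4*0.2)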
Example #3
def validate_unofficial_with_doc(args, data_loader, model, global_stats,
                                 exs_with_doc, docs_by_question, mode):
    """Run one full unofficial validation with docs.
    Unofficial = doesn't use SQuAD script.
    """
    eval_time = utils.Timer()
    f1 = utils.AverageMeter()
    exact_match = utils.AverageMeter()

    out_set = {33, 42, 45, 70, 39}  # unused in this snippet
    logger.info("validate_unofficial_with_doc")
    # Run through examples

    examples = 0
    aa = [0.0 for i in range(vector.num_docs)]
    bb = [0.0 for i in range(vector.num_docs)]
    aa_sum = 0.0
    display_num = 10
    for idx, ex_with_doc in enumerate(data_loader):
        ex = ex_with_doc[0]
        batch_size, question, ex_id = ex[0].size(0), ex[3], ex[-1]
        scores_doc_num = model.predict_with_doc(ex_with_doc)
        scores = [{} for i in range(batch_size)]

        # The accumulators below are initialized but never used in this function
        tot_sum = [0.0 for i in range(batch_size)]
        tot_sum1 = [0.0 for i in range(batch_size)]
        neg_sum = [0.0 for i in range(batch_size)]
        min_sum = [[] for i in range(batch_size)]
        min_sum1 = [[] for i in range(batch_size)]

        for idx_doc in range(0, vector.num_docs):
            ex = ex_with_doc[idx_doc]
            pred_s, pred_e, pred_score = model.predict(ex, top_n=10)
            for i in range(batch_size):
                doc_text = docs_by_question[ex_id[i]][idx_doc % len(
                    docs_by_question[ex_id[i]])]["document"]
                has_answer_t = has_answer(args,
                                          exs_with_doc[ex_id[i]]['answer'],
                                          doc_text)  # computed but unused below

                for k in range(10):
                    try:
                        prediction = []
                        for j in range(pred_s[i][k], pred_e[i][k] + 1):
                            prediction.append(doc_text[j])
                        prediction = " ".join(prediction).lower()
                        if (prediction not in scores[i]):
                            scores[i][prediction] = 0
                        scores[i][prediction] += pred_score[i][
                            k] * scores_doc_num[i][idx_doc]
                    except IndexError:
                        pass
        for i in range(batch_size):
            _, indices = scores_doc_num[i].sort(0, descending=True)
            for j in range(0, display_num):
                idx_doc = indices[j]
                doc_text = docs_by_question[ex_id[i]][idx_doc % len(
                    docs_by_question[ex_id[i]])]["document"]
                if (has_answer(args, exs_with_doc[ex_id[i]]['answer'],
                               doc_text)[0]):

                    aa[j] = aa[j] + 1
                bb[j] = bb[j] + 1

        for i in range(batch_size):

            best_score = 0
            prediction = ""
            for key in scores[i]:
                if (scores[i][key] > best_score):
                    best_score = scores[i][key]
                    prediction = key

            # Compute metrics
            ground_truths = []
            answer = exs_with_doc[ex_id[i]]['answer']
            if (args.dataset == "CuratedTrec"):
                ground_truths = answer
            else:
                for a in answer:
                    ground_truths.append(" ".join([w for w in a]))
            #logger.info(prediction)
            #logger.info(ground_truths)
            exact_match.update(
                utils.metric_max_over_ground_truths(utils.exact_match_score,
                                                    prediction, ground_truths))
            f1.update(
                utils.metric_max_over_ground_truths(utils.f1_score, prediction,
                                                    ground_truths))
            a = sorted(scores[i].items(), key=lambda d: d[1], reverse=True)  # unused

        examples += batch_size
        if (mode == "train" and examples >= 1000):
            break
    # Cumulative hit counts: aa[j] / bb[j] is the fraction of the top-(j+1)
    # retrieved documents (pooled over all questions) that contain the answer
    try:
        for j in range(display_num):
            if j > 0:
                aa[j] = aa[j] + aa[j - 1]
                bb[j] = bb[j] + bb[j - 1]
            logger.info(aa[j] / bb[j])
    except ZeroDivisionError:
        pass
    logger.info(aa_sum)  # aa_sum is never updated above, so this logs 0.0
    if mode == 'dev' or mode == 'train':
        # `g` is assumed to be a module-level file handle opened elsewhere
        g.write("*" * 50 + "\n")
        g.close()
    logger.info('%s valid unofficial with doc: Epoch = %d | EM = %.2f | ' %
                (mode, global_stats['epoch'], exact_match.avg * 100) +
                'F1 = %.2f | examples = %d | valid time = %.2f (s)' %
                (f1.avg * 100, examples, eval_time.time()))

    return {'exact_match': exact_match.avg * 100, 'f1': f1.avg * 100}
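
The aa/bb bookkeeping above is a retrieval diagnostic: aa[j] counts how often the document ranked j-th by the selector contains the gold answer, and the cumulative pass turns aa[j] / bb[j] into the fraction of the top-(j+1) retrieved documents that contain it. A minimal sketch of the same computation on made-up hit flags (standing in for has_answer(...)[0]):

# hits[q][j] is True if the j-th ranked document for question q contains
# the gold answer (toy data in place of has_answer(args, answer, doc)[0]).
hits = [
    [True,  False, False],
    [False, True,  False],
    [False, False, False],
]
display_num = 3

aa = [0.0] * display_num  # rank-j document contains the answer
bb = [0.0] * display_num  # (question, rank-j) pairs seen

for per_question in hits:
    for j in range(display_num):
        if per_question[j]:
            aa[j] += 1
        bb[j] += 1

# Cumulate over ranks: aa[j] / bb[j] is now the fraction of the top-(j+1)
# documents, pooled over all questions, that contain the gold answer.
for j in range(1, display_num):
    aa[j] += aa[j - 1]
    bb[j] += bb[j - 1]

for j in range(display_num):
    print("top-%d document hit rate: %.3f" % (j + 1, aa[j] / bb[j]))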