Example #1
def validate_official(args, data_loader, model, global_stats,
                      offsets, texts, answers):
    """Run one full official validation. Uses exact spans and same
    exact match/F1 score computation as in the SQuAD script.

    Extra arguments:
        offsets: The character start/end indices for the tokens in each context.
        texts: Map of qid --> raw text of each example's context (matches offsets).
        answers: Map of qid --> list of accepted answers.
    """
    eval_time = utils.Timer()
    f1 = utils.AverageMeter()
    exact_match = utils.AverageMeter()

    # Run through examples
    examples = 0
    for ex in data_loader:
        ex_id, batch_size = ex[-1], ex[0].size(0)
        chosen_offset = ex[-2]
        pred_s, pred_e, _ = model.predict(ex)

        for i in range(batch_size):
            if args.use_sentence_selector:
                s_offset = chosen_offset[i][pred_s[i][0]][0]
                e_offset = chosen_offset[i][pred_e[i][0]][1]
            else:
                s_offset = offsets[ex_id[i]][pred_s[i][0]][0]
                e_offset = offsets[ex_id[i]][pred_e[i][0]][1]

            prediction = texts[ex_id[i]][s_offset:e_offset]

            # Compute metrics
            ground_truths = answers[ex_id[i]]
            exact_match.update(utils.metric_max_over_ground_truths(
                utils.exact_match_score, prediction, ground_truths))
            f1.update(utils.metric_max_over_ground_truths(
                utils.f1_score, prediction, ground_truths))

        examples += batch_size

    logger.info('dev valid official: Epoch = %d | EM = %.2f | ' %
                (global_stats['epoch'], exact_match.avg * 100) +
                'F1 = %.2f | examples = %d | valid time = %.2f (s)' %
                (f1.avg * 100, examples, eval_time.time()))

    return {'exact_match': exact_match.avg * 100, 'f1': f1.avg * 100}
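
# For reference, the utils.* metric helpers used above typically follow the official
# SQuAD v1.1 evaluation script; a minimal sketch (assuming that implementation) is:
import re
import string
from collections import Counter


def normalize_answer(s):
    """Lowercase, strip punctuation/articles and collapse whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        return ''.join(ch for ch in text if ch not in set(string.punctuation))

    return white_space_fix(remove_articles(remove_punc(s.lower())))


def exact_match_score(prediction, ground_truth):
    return normalize_answer(prediction) == normalize_answer(ground_truth)


def f1_score(prediction, ground_truth):
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    return (2 * precision * recall) / (precision + recall)


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    # Score the prediction against every accepted answer and keep the best score.
    return max(metric_fn(prediction, gt) for gt in ground_truths)
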
Example #2
def compute_paragraph_score(sample):
    '''
    For each paragraph, compute its F1 score against the question.
    :param sample:
    :return:
    '''
    scores = []
    question = sample['segmented_question']  # the tokenized question (word tokens, not ids)

    for doc in sample['documents']:
        doc['segmented_paragraphs_scores'] = []  # add a per-paragraph match-score field to each document
        for p_idx, para_tokens in enumerate(
                doc['segmented_paragraphs']):  # iterate over the document's paragraphs (tokenized)
            if len(question) > 0:
                related_score = metric_max_over_ground_truths(
                    f1_score, para_tokens, [question])
            else:
                related_score = 0.0

            doc['segmented_paragraphs_scores'].append(
                related_score)  # score of this paragraph against the question
            scores.append(related_score)  # collect each paragraph's similarity score with the question
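
# A minimal usage sketch for compute_paragraph_score, assuming a DuReader-style
# preprocessed sample and an f1_score/metric_max_over_ground_truths pair that works
# on token lists; the toy fields below are hypothetical:
sample = {
    'segmented_question': ['北京', '今天', '天气', '怎么样'],
    'documents': [
        {'segmented_paragraphs': [
            ['北京', '今天', '天气', '晴朗'],
            ['上海', '明天', '有', '小雨'],
        ]},
    ],
}
compute_paragraph_score(sample)
# Each document now carries 'segmented_paragraphs_scores': one F1 score per paragraph.
print(sample['documents'][0]['segmented_paragraphs_scores'])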
Example #3
def validate_adversarial(args, model, global_stats, mode="dev"):
    # Create a dataloader for each adversarial dev set, load its JSON, and run the official-style evaluation

    for idx, dataset_file in enumerate(args.adv_dev_json):

        predictions = {}

        logger.info("Validating Adversarial Dataset %s" % dataset_file)
        exs = utils.load_data(args, args.adv_dev_file[idx])
        logger.info('Num dev examples = %d' % len(exs))
        ## Create dataloader
        dev_dataset = data.ReaderDataset(exs, model, single_answer=False)
        if args.sort_by_len:
            dev_sampler = data.SortedBatchSampler(dev_dataset.lengths(),
                                                  args.test_batch_size,
                                                  shuffle=False)
        else:
            dev_sampler = torch.utils.data.sampler.SequentialSampler(dev_dataset)
        # if args.use_sentence_selector:
        #     batching_function = vector.batchify_sentences
        # else:
        batching_function = vector.batchify
        dev_loader = torch.utils.data.DataLoader(
            dev_dataset,
            batch_size=args.test_batch_size,
            sampler=dev_sampler,
            num_workers=args.data_workers,
            collate_fn=batching_function,
            pin_memory=args.cuda,
        )

        texts = utils.load_text(dataset_file)
        offsets = {ex['id']: ex['offsets'] for ex in exs}
        answers = utils.load_answers(dataset_file)

        eval_time = utils.Timer()
        f1 = utils.AverageMeter()
        exact_match = utils.AverageMeter()

        examples = 0
        bad_examples = 0
        for ex in dev_loader:
            ex_id, batch_size = ex[-1], ex[0].size(0)
            chosen_offset = ex[-2]
            pred_s, pred_e, _ = model.predict(ex)

            for i in range(batch_size):
                if pred_s[i][0] >= len(offsets[ex_id[i]]) or pred_e[i][0] >= len(offsets[ex_id[i]]):
                    bad_examples += 1
                    continue
                if args.use_sentence_selector:
                    s_offset = chosen_offset[i][pred_s[i][0]][0]
                    e_offset = chosen_offset[i][pred_e[i][0]][1]
                else:
                    s_offset = offsets[ex_id[i]][pred_s[i][0]][0]
                    e_offset = offsets[ex_id[i]][pred_e[i][0]][1]
                prediction = texts[ex_id[i]][s_offset:e_offset]

                predictions[ex_id[i]] = prediction

                ground_truths = answers[ex_id[i]]
                exact_match.update(utils.metric_max_over_ground_truths(
                    utils.exact_match_score, prediction, ground_truths))
                f1.update(utils.metric_max_over_ground_truths(
                    utils.f1_score, prediction, ground_truths))

            examples += batch_size

        logger.info('dev valid official for dev file %s : Epoch = %d | EM = %.2f | ' %
                    (dataset_file, global_stats['epoch'], exact_match.avg * 100) +
                    'F1 = %.2f | examples = %d | valid time = %.2f (s)' %
                    (f1.avg * 100, examples, eval_time.time()))

        orig_f1_score = 0.0
        orig_exact_match_score = 0.0
        adv_f1_scores = {}  # Map from original ID to F1 score
        adv_exact_match_scores = {}  # Map from original ID to exact match score
        adv_ids = {}
        all_ids = set()  # Set of all original IDs
        f1 = exact_match = 0
        dataset = json.load(open(dataset_file))['data']
        for article in dataset:
            for paragraph in article['paragraphs']:
                for qa in paragraph['qas']:
                    orig_id = qa['id'].split('-')[0]
                    all_ids.add(orig_id)
                    if qa['id'] not in predictions:
                        message = 'Unanswered question ' + qa['id'] + ' will receive score 0.'
                        # logger.info(message)
                        continue
                    ground_truths = list(map(lambda x: x['text'], qa['answers']))
                    prediction = predictions[qa['id']]
                    cur_exact_match = utils.metric_max_over_ground_truths(utils.exact_match_score,
                                                                    prediction, ground_truths)
                    cur_f1 = utils.metric_max_over_ground_truths(utils.f1_score, prediction, ground_truths)
                    if orig_id == qa['id']:
                        # This is an original example
                        orig_f1_score += cur_f1
                        orig_exact_match_score += cur_exact_match
                        if orig_id not in adv_f1_scores:
                            # Haven't seen adversarial example yet, so use original for adversary
                            adv_ids[orig_id] = orig_id
                            adv_f1_scores[orig_id] = cur_f1
                            adv_exact_match_scores[orig_id] = cur_exact_match
                    else:
                        # This is an adversarial example
                        if (orig_id not in adv_f1_scores or adv_ids[orig_id] == orig_id
                            or adv_f1_scores[orig_id] > cur_f1):
                            # Override if this is the first adversary, the stored entry still points at the original, or this adversary scores lower
                            adv_ids[orig_id] = qa['id']
                            adv_f1_scores[orig_id] = cur_f1
                            adv_exact_match_scores[orig_id] = cur_exact_match
        orig_f1 = 100.0 * orig_f1_score / len(all_ids)
        orig_exact_match = 100.0 * orig_exact_match_score / len(all_ids)
        adv_exact_match = 100.0 * sum(adv_exact_match_scores.values()) / len(all_ids)
        adv_f1 = 100.0 * sum(adv_f1_scores.values()) / len(all_ids)
        logger.info("For the file %s Original Exact Match : %.4f ; Original F1 : : %.4f | "
                    % (dataset_file, orig_exact_match, orig_f1)
                    + "Adversarial Exact Match : %.4f ; Adversarial F1 : : %.4f " % (adv_exact_match, adv_f1))
Example #4
def eval_end2end(args):
    out_file = args.out_file
    os.makedirs(os.path.dirname(out_file), exist_ok=True)
    prediction_file = args.prediction_file
    answer_file = args.answer_file
    match_fn = exact_match_score if args.no_regex else regex_match_score

    data_dir = os.path.dirname(prediction_file)

    model_file = args.model_file or os.path.join(
        data_dir, '{}.xgb'.format(args.classifier))
    bst = xgboost.Booster()
    bst.load_model(model_file)

    stop_count = 0
    stop_correct = 0
    processed = 0
    with open(out_file, 'w', encoding=ENCODING) as of:
        for answer_line, prediction_line in zip(
                open(answer_file, encoding=ENCODING),
                open(prediction_file, encoding=ENCODING)):
            answer_data = json.loads(answer_line)
            answer = [normalize(a) for a in answer_data['answer']]

            out_predictions = []

            all_spans = []
            all_a_scores = []
            all_a_zscores = []
            repeats = 0

            prediction = json.loads(prediction_line)

            for i, entry in enumerate(
                    sorted(prediction,
                           key=lambda k: k['doc_score'],
                           reverse=True)):
                out_predictions.append(entry)
                # doc_id = entry['doc_id']
                # start = int(entry['start'])
                # end = int(entry['end'])
                doc_score = entry['doc_score']
                ans_score = entry['span_score']
                span = entry['span']

                if span in all_spans:
                    repeats += 1

                all_spans.append(span)

                # Calculate the sample z-score (t statistic) for the answer score
                if len(all_a_scores) <= 1:
                    # skip the z-score feature until at least two previous scores exist
                    a_zscore = 0
                else:
                    # z-score of the current answer score w.r.t. the previous scores' sample mean/std
                    sample_mean = np.mean(all_a_scores)
                    sample_std = np.std(all_a_scores)
                    if sample_std <= 0.0:
                        a_zscore = 0
                    else:
                        a_zscore = (ans_score - sample_mean) / sample_std

                all_a_zscores.append(a_zscore)
                max_zscore = max(all_a_zscores)
                # Track the score so later z-scores use a growing sample
                # (mirrors process_record below; without this, all_a_scores would stay empty)
                all_a_scores.append(ans_score)

                # repeats_2 = 1 if repeats == 2 else 0
                # repeats_3 = 1 if repeats == 3 else 0
                # repeats_4 = 1 if repeats == 4 else 0
                # repeats_5 = 1 if repeats >= 5 else 0
                # past5 = 1 if i >= 5 else 0
                # past10 = 1 if i >= 10 else 0
                past20 = 1 if i >= 20 else 0
                x = [max_zscore, ans_score, doc_score, repeats, past20]
                # DMatrix expects 2-D input: wrap the single feature vector as one row
                feature_mat = xgboost.DMatrix(np.array([x]))
                stop_prob = bst.predict(feature_mat)

                if stop_prob > args.stop_threshold:
                    if metric_max_over_ground_truths(match_fn, normalize(span),
                                                     answer):
                        stop_correct += 1
                    stop_count += 1
                    print(stop_prob, 'stopped at:', i + 1, stop_count,
                          processed)
                    break

            processed += 1
            of.write(json.dumps(out_predictions) + '\n')
            print('processed', stop_correct, stop_count, processed)
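
# The running z-score used as a feature above, isolated as a small helper for clarity;
# a sketch whose edge-case behaviour (fewer than two previous scores, zero std) matches
# the inline logic in the loop:
import numpy as np


def answer_zscore(prev_scores, ans_score):
    """Z-score of the current answer score w.r.t. previously seen answer scores."""
    if len(prev_scores) <= 1:
        return 0.0
    sample_mean = np.mean(prev_scores)
    sample_std = np.std(prev_scores)
    if sample_std <= 0.0:
        return 0.0
    return (ans_score - sample_mean) / sample_std


# e.g. answer scores of earlier candidate spans, then a much stronger new candidate:
print(answer_zscore([2.1, 2.4, 1.9], 5.0))  # a large positive z-score suggests stopping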
Example #5
def process_record(data_line_, prediction_line_, neg_gap_, match_fn):
    records_ = []
    stop_count_ = 0
    data = json.loads(data_line_)
    # question = data['question']
    # q_id = slugify(question)

    answer = [normalize(a) for a in data['answer']]
    prediction = json.loads(prediction_line_)
    # MAKE SURE REVERSE IS TRUE
    ranked_prediction = sorted(prediction,
                               key=lambda k: k['doc_score'],
                               reverse=True)
    correct_rank = get_rank(prediction, answer, match_fn)
    if correct_rank > 150:
        #  if correct_rank < 50 or correct_rank > 150:
        return records_, stop_count_

    all_p_scores = []
    all_a_scores = []
    all_a_zscores = []
    all_spans = []
    repeats = 0
    for i, entry in enumerate(ranked_prediction):
        # doc_id = entry['doc_id']
        # start = int(entry['start'])
        # end = int(entry['end'])
        doc_score = entry['doc_score']
        ans_score = entry['span_score']
        span = entry['span']

        if span in all_spans:
            repeats += 1

        all_spans.append(span)

        # Calculate the sample z-score (t statistic) for the answer score
        if len(all_a_scores) <= 1:
            # skip the z-score feature until at least two previous scores exist
            a_zscore = 0
        else:
            # z-score of the current answer score w.r.t. the previous scores' sample mean/std
            sample_mean = np.mean(all_a_scores)
            sample_std = np.std(all_a_scores)
            if sample_std <= 0.0:
                a_zscore = 0
            else:
                a_zscore = (ans_score - sample_mean) / sample_std

        # THESE ARE FOR STATISTICS OVER THE ENTIRE DATA SET, IGNORE
        # all_doc_scores.append(doc_score)

        all_a_zscores.append(a_zscore)
        max_zscore = max(all_a_zscores)
        # corr_doc_score = (doc_score - DOC_MEAN) / DOC_STD
        # corr_ans_mean_score = (np.mean(all_a_scores + [ans_score]) - ANS_MEAN) / ANS_STD

        all_p_scores.append(doc_score)
        all_a_scores.append(ans_score)
        # corr_doc_score = (doc_score - DOC_MEAN) / DOC_STD

        record = OrderedDict()

        # record['a_zscore'] = a_zscore
        record['max_zscore'] = max_zscore
        record['corr_doc_score'] = doc_score
        repeats_2 = 1 if repeats == 2 else 0
        repeats_3 = 1 if repeats == 3 else 0
        repeats_4 = 1 if repeats == 4 else 0
        repeats_5 = 1 if repeats >= 5 else 0
        past20 = 1 if i >= 20 else 0
        # record['i'] = i
        record['repeats_2'] = repeats_2
        record['repeats_3'] = repeats_3
        record['repeats_4'] = repeats_4
        record['repeats_5'] = repeats_5
        record['past20'] = past20

        # record['prob_avg'] = sum(all_probs) / len(all_probs)
        # record['prob'] = prob
        record['repeats'] = repeats
        # record['ans_avg'] = corr_ans_mean_score
        # record['question'] = question

        #        if i + 1 == correct_rank:
        match = metric_max_over_ground_truths(match_fn, normalize(span),
                                              answer)
        # if i + 1 >= correct_rank:
        if match:
            record['stop'] = 1

            stop_count_ += 1
            # if stop_count_ > 10:
            #     should_return = True
            # else:
            #     should_return = False
            should_return = False
            write_record = True
            # if i % neg_gap_ == 0 or i + 1 == correct_rank:
            #     stop_count_ += 1
            #     write_record = True
            # else:
            #     write_record = False
            #
            # if i + 1 - correct_rank > 30:
            #     should_return = True
            # else:
            #     should_return = False
        else:
            should_return = False
            if i % neg_gap_ == 0:
                record['stop'] = 0
                write_record = True
            else:
                write_record = False
        if write_record:
            records_.append(record)
            # record_path = os.path.join(record_dir_, '%s_%s.pkl' % (q_id, doc_id))
            # with open(record_path, 'wb') as f:
            #     pk.dump(record, f)
        if should_return:
            return records_, stop_count_
    return records_, stop_count_
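
# A minimal sketch of turning the records produced by process_record into an xgboost
# training matrix, assuming every record shares the same keys and 'stop' is the label;
# records_to_dmatrix and the commented training call are illustrative, not from the source.
import numpy as np
import xgboost


def records_to_dmatrix(records):
    labels = np.array([r['stop'] for r in records], dtype=float)
    feature_names = [k for k in records[0] if k != 'stop']
    features = np.array([[r[k] for k in feature_names] for r in records], dtype=float)
    return xgboost.DMatrix(features, label=labels, feature_names=feature_names)


# dtrain = records_to_dmatrix(all_records)
# bst = xgboost.train({'objective': 'binary:logistic'}, dtrain, num_boost_round=200)
# bst.save_model('stop_classifier.xgb')  # later loaded in eval_end2end via bst.load_model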