Example #1
def validate_official(args, data_loader, model, global_stats, offsets, texts,
                      answers):
    """Run one full official validation. Uses exact spans and same
    exact match/F1 score computation as in the SQuAD script.

    Extra arguments:
        offsets: The character start/end indices for the tokens in each context.
        texts: Map of qid --> raw text of each example's context (matches offsets).
        answers: Map of qid --> list of accepted answers.
    """
    clean_id_file = open(os.path.join(DATA_DIR, "clean_qids.txt"), "w+")
    eval_time = utils.Timer()
    f1 = utils.AverageMeter()
    exact_match = utils.AverageMeter()

    # Run through examples
    examples = 0
    bad_examples = 0
    for ex in data_loader:
        ex_id, batch_size = ex[-1], ex[0].size(0)
        chosen_offset = ex[-2]
        pred_s, pred_e, _ = model.predict(ex)

        for i in range(batch_size):
            if pred_s[i][0] >= len(offsets[ex_id[i]]) or pred_e[i][0] >= len(
                    offsets[ex_id[i]]):
                bad_examples += 1
                continue
            if args.use_sentence_selector:
                s_offset = chosen_offset[i][pred_s[i][0]][0]
                e_offset = chosen_offset[i][pred_e[i][0]][1]
            else:
                s_offset = offsets[ex_id[i]][pred_s[i][0]][0]
                e_offset = offsets[ex_id[i]][pred_e[i][0]][1]
            prediction = texts[ex_id[i]][s_offset:e_offset]

            # Compute metrics
            ground_truths = answers[ex_id[i]]
            exact_match.update(
                utils.metric_max_over_ground_truths(utils.exact_match_score,
                                                    prediction, ground_truths))
            f1_example = utils.metric_max_over_ground_truths(
                utils.f1_score, prediction, ground_truths)
            f1.update(f1_example)

            # Record the ids of examples the model got at least partially right
            if f1_example != 0:
                clean_id_file.write(ex_id[i] + "\n")

        examples += batch_size

    clean_id_file.close()
    logger.info('dev valid official: Epoch = %d | EM = %.2f | ' %
                (global_stats['epoch'], exact_match.avg * 100) +
                'F1 = %.2f | examples = %d | valid time = %.2f (s)' %
                (f1.avg * 100, examples, eval_time.time()))
    logger.info('Bad Offset Examples during official eval: %d' % bad_examples)
    return {'exact_match': exact_match.avg * 100, 'f1': f1.avg * 100}
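These validation loops lean on a handful of utils helpers. Below is a minimal sketch of what they are assumed to provide, modeled on the official SQuAD evaluation script and a standard running-average meter; the project's real utils module may differ.

import re
import string
from collections import Counter


def normalize_answer(s):
    """Lowercase, drop punctuation/articles, collapse whitespace (SQuAD convention)."""
    s = ''.join(ch for ch in s.lower() if ch not in set(string.punctuation))
    s = re.sub(r'\b(a|an|the)\b', ' ', s)
    return ' '.join(s.split())


def exact_match_score(prediction, ground_truth):
    return float(normalize_answer(prediction) == normalize_answer(ground_truth))


def f1_score(prediction, ground_truth):
    pred_tokens = normalize_answer(prediction).split()
    gt_tokens = normalize_answer(ground_truth).split()
    common = Counter(pred_tokens) & Counter(gt_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)


def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
    """Score a prediction against every accepted answer and keep the best."""
    return max(metric_fn(prediction, gt) for gt in ground_truths)


class AverageMeter(object):
    """Running average; update(val, n) adds n observations with value val."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.sum, self.count, self.avg = 0.0, 0, 0.0

    def update(self, val, n=1):
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count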
Example #2
def eval_accuracies_rc(pred_s, target_s, pred_e, target_e):
    """An unofficial evalutation helper.
    Compute exact start/end/complete match accuracies for a batch.
    """
    # Convert 1D tensors to lists of lists (compatibility)
    if torch.is_tensor(target_s):
        target_s = [[e] for e in target_s]
        target_e = [[e] for e in target_e]

    # Compute accuracies from targets
    batch_size = len(pred_s)
    start = utils.AverageMeter()
    end = utils.AverageMeter()
    em = utils.AverageMeter()
    for i in range(batch_size):
        # Start matches
        if pred_s[i] in target_s[i]:
            start.update(1)
        else:
            start.update(0)

        # End matches
        if pred_e[i] in target_e[i]:
            end.update(1)
        else:
            end.update(0)

        # Both start and end match
        if any(_s == pred_s[i] and _e == pred_e[i]
               for _s, _e in zip(target_s[i], target_e[i])):
            em.update(1)
        else:
            em.update(0)
    return start.avg * 100, end.avg * 100, em.avg * 100
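A hypothetical call, to make the expected shapes concrete: predictions are one index per example, targets are lists of acceptable indices (or 1D tensors, which the helper wraps).

pred_s, pred_e = [3, 9], [5, 12]
target_s = [[3, 7], [9]]
target_e = [[5, 8], [14]]
start_acc, end_acc, em_acc = eval_accuracies_rc(pred_s, target_s, pred_e, target_e)
# start_acc == 100.0, end_acc == 50.0, em_acc == 50.0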
Example #3
def validate_unofficial(args, data_loader, model, global_stats, mode):
    """Run one full unofficial validation.
    Unofficial = doesn't use SQuAD script.
    """
    eval_time = utils.Timer()
    start_acc = utils.AverageMeter()
    end_acc = utils.AverageMeter()
    exact_match = utils.AverageMeter()

    # Make predictions
    examples = 0
    for ex in data_loader:
        batch_size = ex[0].size(0)
        pred_s, pred_e, _ = model.predict(ex)
        target_s, target_e = ex[-3:-1]

        # We get metrics for independent start/end and joint start/end
        accuracies = eval_accuracies(pred_s, target_s, pred_e, target_e)
        start_acc.update(accuracies[0], batch_size)
        end_acc.update(accuracies[1], batch_size)
        exact_match.update(accuracies[2], batch_size)

        # If getting train accuracies, sample max 10k
        examples += batch_size
        if mode == 'train' and examples >= 1e4:
            break

    logger.info('%s valid unofficial: Epoch = %d | start = %.2f | ' %
                (mode, global_stats['epoch'], start_acc.avg) +
                'end = %.2f | exact = %.2f | examples = %d | ' %
                (end_acc.avg, exact_match.avg, examples) +
                'valid time = %.2f (s)' % eval_time.time())

    return {'exact_match': exact_match.avg}
Example #4
def validate_unofficial(args, data_loader, model, global_stats, mode):
    """Run one full unofficial validation.
    Unofficial = doesn't use SQuAD script.
    """
    from sklearn.metrics import roc_auc_score, f1_score
    eval_time = utils.Timer()
    trigger_acc = utils.AverageMeter()
    start_acc = utils.AverageMeter()
    end_acc = utils.AverageMeter()
    exact_match = utils.AverageMeter()

    # Make predictions
    all_pred = []
    all_pred_label = []
    all_gt = []
    examples = 0
    for ex in data_loader:
        batch_size = ex[0].size(0)

        pred_score, pred_label, pred_s, pred_e = model.predict(ex)

        target_s, target_e = ex[-4:-2]

        accuracies = eval_accuracies_rc(pred_s, target_s, pred_e, target_e)
        start_acc.update(accuracies[0], batch_size)
        end_acc.update(accuracies[1], batch_size)
        exact_match.update(accuracies[2], batch_size)

        gt_label = ex[-1]
        all_pred.extend([x[1] for x in pred_score])
        all_gt.extend(gt_label)
        all_pred_label.extend(pred_label)
        # Trigger classification accuracy over the batch
        accuracies = eval_accuracies(pred_label, gt_label)
        trigger_acc.update(accuracies, batch_size)

        # If getting train accuracies, sample max 10k
        examples += batch_size
        # only test train top 10000
        if mode == 'train' and examples >= 1e4:
            break
    auc_score = roc_auc_score(all_gt, all_pred)
    f1_scores = f1_score(all_gt, all_pred_label, average=None)

    logger.info('%s valid unofficial: Epoch = %d | ' %
                (mode, global_stats['epoch'], ) +
                'neg_f1 = %.2f | pos_f1 = %.2f |  trigger_auc = %.2f | trigger_acc = %.2f | examples = %d | ' %
                (f1_scores[0], f1_scores[1], auc_score, trigger_acc.avg, examples) +
                'valid time = %.2f (s)' % eval_time.time())

    return {'auc': auc_score, 'trigger_acc': trigger_acc.avg}
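The sklearn metrics above expect all_gt to hold binary gold labels, all_pred the positive-class score (x[1] of each pred_score row), and all_pred_label hard 0/1 predictions; with average=None, f1_score returns per-class F1, indexed here as [neg_f1, pos_f1]. A tiny worked example:

from sklearn.metrics import roc_auc_score, f1_score

y_true = [0, 0, 1, 1]
y_score = [0.1, 0.4, 0.35, 0.8]   # predicted P(label == 1)
y_pred = [0, 0, 0, 1]             # hard predictions
print(roc_auc_score(y_true, y_score))          # 0.75
print(f1_score(y_true, y_pred, average=None))  # [0.8, 0.667] -> [neg_f1, pos_f1]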
Example #5
def validate_official(args,
                      data_loader,
                      model,
                      global_stats,
                      offsets,
                      texts,
                      answers,
                      mode="dev"):
    """Run one full official validation. Uses exact spans and same
    exact match/F1 score computation as in the SQuAD script.

    Extra arguments:
        offsets: The character start/end indices for the tokens in each context.
        texts: Map of qid --> raw text of each example's context (matches offsets).
        answers: Map of qid --> list of accepted answers.
    """
    eval_time = utils.Timer()
    f1 = utils.AverageMeter()
    exact_match = utils.AverageMeter()

    # Run through examples
    examples = 0
    for ex in data_loader:
        ex_id, batch_size = ex[-1], ex[0].size(0)
        pred_s, pred_e, _ = model.predict(ex)

        for i in range(batch_size):
            s_offset = offsets[ex_id[i]][pred_s[i][0]][0]
            e_offset = offsets[ex_id[i]][pred_e[i][0]][1]
            prediction = texts[ex_id[i]][s_offset:e_offset]

            # Compute metrics
            ground_truths = answers[ex_id[i]]
            exact_match.update(
                utils.metric_max_over_ground_truths(utils.exact_match_score,
                                                    prediction, ground_truths))
            f1.update(
                utils.metric_max_over_ground_truths(utils.f1_score, prediction,
                                                    ground_truths))

        examples += batch_size

    logger.info(mode + ' valid official: Epoch = %d | EM = %.2f | ' %
                (global_stats['epoch'], exact_match.avg * 100) +
                'F1 = %.2f | examples = %d | valid time = %.2f (s)' %
                (f1.avg * 100, examples, eval_time.time()))

    return {'exact_match': exact_match.avg * 100, 'f1': f1.avg * 100}
Example #6
def train(args, data_loader, model, global_stats):
    """Run through one epoch of model training with the provided data loader."""
    # Initialize meters + timers
    train_loss = utils.AverageMeter()
    epoch_time = utils.Timer()

    # Run one epoch
    for idx, ex in enumerate(data_loader):
        train_loss.update(*model.update(ex))

        if idx % args.display_iter == 0:
            logger.info('train: Epoch = %d | iter = %d/%d | ' %
                        (global_stats['epoch'], idx, len(data_loader)) +
                        'loss = %.2f | elapsed time = %.2f (s)' %
                        (train_loss.avg, global_stats['timer'].time()))
            train_loss.reset()

    logger.info('train: Epoch %d done. Time for epoch = %.2f (s)' %
                (global_stats['epoch'], epoch_time.time()))

    # Checkpoint
    if args.checkpoint:
        model.checkpoint(args.model_file + '.checkpoint',
                         global_stats['epoch'] + 1)
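The unpacking train_loss.update(*model.update(ex)) assumes the model's update step returns a (loss value, batch size) pair. A hedged sketch of that contract, mirroring the forward/loss code in example #10 (the method body and attribute names are assumptions, not the project's actual implementation):

import torch.nn.functional as F


def update(self, ex):
    """One optimizer step on a batch; returns the scalar loss and the batch size."""
    inputs, target_s, target_e = ex[:5], ex[5], ex[6]
    score_s, score_e = self.network(*inputs)
    loss = F.nll_loss(score_s, target_s) + F.nll_loss(score_e, target_e)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return loss.item(), ex[0].size(0)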
Example #7
File: train.py  Project: himanshu746/DrQA
def train(args, data_loader, data_loader_source, data_loader_target,
          train_loader_source_Q, train_loader_target_Q, model, global_stats):
    """Run through one epoch of model training with the provided data loader."""
    # Initialize meters + timers
    train_loss = utils.AverageMeter()
    epoch_time = utils.Timer()

    # Run one epoch
    for idx, ex in enumerate(data_loader_source):

        # Calculate n_critic
        epoch = global_stats['epoch']
        n_critic = args.n_critic
        if n_critic > 0 and ((epoch == 0 and idx <= 25) or (idx % 500 == 0)):
            n_critic = 10

        train_loss.update(*model.update(ex, n_critic, epoch))

        if idx % args.display_iter == 0:
            logger.info('train: Epoch = %d | iter = %d/%d | ' %
                        (global_stats['epoch'], idx, len(data_loader_source)) +
                        'loss = %.2f | elapsed time = %.2f (s)' %
                        (train_loss.avg, global_stats['timer'].time()))
            train_loss.reset()

    logger.info('train: Epoch %d done. Time for epoch = %.2f (s)' %
                (global_stats['epoch'], epoch_time.time()))

    # Checkpoint
    if args.checkpoint:
        model.checkpoint(args.model_file + '.checkpoint',
                         global_stats['epoch'] + 1)
Example #8
def train(args, data_loader, model, global_stats):
    """Run through one epoch of model training with the provided data loader."""
    # Initialize meters + timers
    train_loss = utils.AverageMeter()
    epoch_time = utils.Timer()

    # Run one epoch
    for idx, ex in enumerate(data_loader):
        train_loss.update(*model.update(ex))

        if idx % args.display_iter == 0:
            logger.info('train: Epoch = %d | iter = %d/%d | ' %
                        (global_stats['epoch'], idx, len(data_loader)) +
                        'loss = %.2f | elapsed time = %.2f (s)' %
                        (train_loss.avg, global_stats['timer'].time()))
            train_loss.reset()

        if args.indexcheckpoint != -1 and idx != 0 and idx % args.indexcheckpoint == 0:
            # Integer division so the name reads e.g. "model2:3.checkpoint" on Python 3
            checkpointName = (args.model_file + str(idx // args.indexcheckpoint) +
                              ':' + str(global_stats['epoch']) + '.checkpoint')
            model.checkpoint(checkpointName, global_stats['epoch'])
            logger.info('New checkpoint at: %s' % checkpointName)

    logger.info('train: Epoch %d done. Time for epoch = %.2f (s)' %
                (global_stats['epoch'], epoch_time.time()))

    # Checkpoint
    if args.checkpoint:
        model.checkpoint(args.model_file + '.checkpoint',
                         global_stats['epoch'] + 1)
Example #9
def eval_accuracies(pred_label, gt_label):
    """An unofficial evalutation helper.
    Compute exact start/end/complete match accuracies for a batch.
    """
    trigger_acc = utils.AverageMeter()
    for i in range(len(pred_label)):
        trigger_acc.update(1 if pred_label[i] == gt_label[i] else 0)
    return trigger_acc.avg * 100
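For example, with four predicted trigger labels scored against the gold labels (hypothetical values):

acc = eval_accuracies(pred_label=[1, 0, 1, 1], gt_label=[1, 0, 0, 1])
# acc == 75.0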
Example #10
def train(args, data_loader, model, global_stats, dev_loader):
    """Run through one epoch of model training with the provided data loader."""
    # Initialize meters + timers
    train_loss = utils.AverageMeter()
    epoch_time = utils.Timer()

    # Run one epoch
    for idx, ex in enumerate(data_loader):
        train_loss.update(*model.update(ex))
        global_stats['Loss_Train'] = float(train_loss.avg)
        if idx % args.display_iter == 0:
            logger.info('train: Epoch = %d | iter = %d/%d | ' %
                        (global_stats['epoch'], idx, len(data_loader)) +
                        'loss = %.2f | elapsed time = %.2f (s)' %
                        (train_loss.avg, global_stats['timer'].time()))
            train_loss.reset()

    ####### Fix this later
    if args.show_dev_loss:
        dev_loss = utils.AverageMeter()
        for idx, ex in enumerate(dev_loader):
            if args.cuda:
                # non_blocking= replaces the old async= keyword, which is a
                # syntax error on Python 3.7+
                inputs = [
                    e if e is None else Variable(e.cuda(non_blocking=True))
                    for e in ex[:5]
                ]
                target_s = Variable(ex[5].cuda(non_blocking=True))
                target_e = Variable(ex[6].cuda(non_blocking=True))
            else:
                inputs = list(ex[:5])
                target_s = ex[5]
                target_e = ex[6]
            score_s, score_e = model.network(*inputs)
            loss = F.nll_loss(score_s, target_s) + F.nll_loss(
                score_e, target_e)
            dev_loss.update(loss.item(), ex[0].size(0))
Example #11
def eval_accuracies(pred_s, target_s):
    """An unofficial evalutation helper.
    Compute exact start/end/complete match accuracies for a batch.
    """
    # Convert 1D tensors to lists of lists (compatibility)
    if torch.is_tensor(target_s):
        target_s = [[e] for e in target_s]
    # Compute accuracies from targets
    batch_size = len(pred_s)
    start = utils.AverageMeter()
    em = utils.AverageMeter()
    for i in range(batch_size):
        # Start matches
        if pred_s[i] in target_s[i]:
            start.update(1)
        else:
            start.update(0)

        # Complete match (only the start index is predicted in this variant)
        if pred_s[i] in target_s[i]:
            em.update(1)
        else:
            em.update(0)
    return start.avg * 100, em.avg * 100
Example #12
def train(args, data_loader, model, global_stats):
    """Run through one epoch of model training with the provided data loader."""
    # Initialize meters + timers
    train_loss = utils.AverageMeter()
    epoch_time = utils.Timer()

    # Run one epoch
    for idx, ex in enumerate(data_loader):
        train_loss.update(*model.update(ex))

        if idx % args.display_iter == 0:
            logger.info("train: Epoch = %d | iter = %d/%d | " %
                        (global_stats["epoch"], idx, len(data_loader)) +
                        "loss = %.2f | elapsed time = %.2f (s)" %
                        (train_loss.avg, global_stats["timer"].time()))
            train_loss.reset()

    logger.info("train: Epoch %d done. Time for epoch = %.2f (s)" %
                (global_stats["epoch"], epoch_time.time()))

    # Checkpoint
    if args.checkpoint:
        model.checkpoint(args.model_file + ".checkpoint",
                         global_stats["epoch"] + 1)
Example #13
def validate_adversarial(args, model, global_stats, mode="dev"):
    # Create a dataloader for each adversarial dev set, load its JSON, and run the evaluation

    for idx, dataset_file in enumerate(args.adv_dev_json):

        predictions = {}

        logger.info("Validating Adversarial Dataset %s" % dataset_file)
        exs = utils.load_data(args, args.adv_dev_file[idx])
        logger.info('Num dev examples = %d' % len(exs))
        # Create dataloader
        dev_dataset = reader_data.ReaderDataset(exs,
                                                model,
                                                single_answer=False)
        if args.sort_by_len:
            dev_sampler = reader_data.SortedBatchSampler(dev_dataset.lengths(),
                                                         args.test_batch_size,
                                                         shuffle=False)
        else:
            dev_sampler = torch.utils.data.sampler.SequentialSampler(
                dev_dataset)
        if args.use_sentence_selector:
            dev_batcher = reader_vector.sentence_batchifier(
                model, single_answer=False)
            #batching_function = dev_batcher.batchify
            batching_function = reader_vector.batchify
        else:
            batching_function = reader_vector.batchify
        dev_loader = torch.utils.data.DataLoader(
            dev_dataset,
            batch_size=args.test_batch_size,
            sampler=dev_sampler,
            num_workers=args.data_workers,
            collate_fn=batching_function,
            pin_memory=args.cuda,
        )

        texts = utils.load_text(dataset_file)
        offsets = {ex['id']: ex['offsets'] for ex in exs}
        answers = utils.load_answers(dataset_file)

        eval_time = utils.Timer()
        f1 = utils.AverageMeter()
        exact_match = utils.AverageMeter()

        examples = 0
        bad_examples = 0
        for ex in dev_loader:
            ex_id, batch_size = ex[-1], ex[0].size(0)
            chosen_offset = ex[-2]
            pred_s, pred_e, _ = model.predict(ex)

            for i in range(batch_size):
                if pred_s[i][0] >= len(
                        offsets[ex_id[i]]) or pred_e[i][0] >= len(
                            offsets[ex_id[i]]):
                    bad_examples += 1
                    continue
                if args.use_sentence_selector:
                    s_offset = chosen_offset[i][pred_s[i][0]][0]
                    e_offset = chosen_offset[i][pred_e[i][0]][1]
                else:
                    s_offset = offsets[ex_id[i]][pred_s[i][0]][0]
                    e_offset = offsets[ex_id[i]][pred_e[i][0]][1]
                prediction = texts[ex_id[i]][s_offset:e_offset]

                if args.select_k > 1:
                    prediction = ""
                    offset_subset = chosen_offset[i][pred_s[i][0]:pred_e[i][0]]
                    for o in offset_subset:
                        prediction += texts[ex_id[i]][o[0]:o[1]] + " "
                    prediction = prediction.strip()

                predictions[ex_id[i]] = prediction

                ground_truths = answers[ex_id[i]]
                exact_match.update(
                    utils.metric_max_over_ground_truths(
                        utils.exact_match_score, prediction, ground_truths))
                f1.update(
                    utils.metric_max_over_ground_truths(
                        utils.f1_score, prediction, ground_truths))

            examples += batch_size

        logger.info(
            'dev valid official for dev file %s : Epoch = %d | EM = %.2f | ' %
            (dataset_file, global_stats['epoch'], exact_match.avg * 100) +
            'F1 = %.2f | examples = %d | valid time = %.2f (s)' %
            (f1.avg * 100, examples, eval_time.time()))

        orig_f1_score = 0.0
        orig_exact_match_score = 0.0
        adv_f1_scores = {}  # Map from original ID to F1 score
        adv_exact_match_scores = {}  # Map from original ID to exact match score
        adv_ids = {}
        all_ids = set()  # Set of all original IDs
        f1 = exact_match = 0
        dataset = json.load(open(dataset_file))['data']
        for article in dataset:
            for paragraph in article['paragraphs']:
                for qa in paragraph['qas']:
                    orig_id = qa['id'].split('-')[0]
                    all_ids.add(orig_id)
                    if qa['id'] not in predictions:
                        message = ('Unanswered question ' + qa['id'] +
                                   ' will receive score 0.')
                        # logger.info(message)
                        continue
                    ground_truths = list(
                        map(lambda x: x['text'], qa['answers']))
                    prediction = predictions[qa['id']]
                    cur_exact_match = utils.metric_max_over_ground_truths(
                        utils.exact_match_score, prediction, ground_truths)
                    cur_f1 = utils.metric_max_over_ground_truths(
                        utils.f1_score, prediction, ground_truths)
                    if orig_id == qa['id']:
                        # This is an original example
                        orig_f1_score += cur_f1
                        orig_exact_match_score += cur_exact_match
                        if orig_id not in adv_f1_scores:
                            # Haven't seen adversarial example yet, so use original for adversary
                            adv_ids[orig_id] = orig_id
                            adv_f1_scores[orig_id] = cur_f1
                            adv_exact_match_scores[orig_id] = cur_exact_match
                    else:
                        # This is an adversarial example
                        if (orig_id not in adv_f1_scores
                                or adv_ids[orig_id] == orig_id
                                or adv_f1_scores[orig_id] > cur_f1):
                            # Always override if the adversary slot currently uses orig_id
                            adv_ids[orig_id] = qa['id']
                            adv_f1_scores[orig_id] = cur_f1
                            adv_exact_match_scores[orig_id] = cur_exact_match
        orig_f1 = 100.0 * orig_f1_score / len(all_ids)
        orig_exact_match = 100.0 * orig_exact_match_score / len(all_ids)
        adv_exact_match = 100.0 * sum(
            adv_exact_match_scores.values()) / len(all_ids)
        adv_f1 = 100.0 * sum(adv_f1_scores.values()) / len(all_ids)
        logger.info(
            "For the file %s Original Exact Match : %.4f ; Original F1 : : %.4f | "
            % (dataset_file, orig_exact_match, orig_f1) +
            "Adversarial Exact Match : %.4f ; Adversarial F1 : : %.4f " %
            (adv_exact_match, adv_f1))
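The aggregation above keeps, per original question id, the score of the worst-scoring adversarial variant (falling back to the original question when no adversary was answered). A condensed sketch of that rule over a per-question score map (names hypothetical, not part of the project):

def worst_case_scores(per_question_f1):
    """per_question_f1: {orig_id: {variant_qid: f1}} -> {orig_id: worst adversarial f1}."""
    out = {}
    for orig_id, variants in per_question_f1.items():
        adversaries = {qid: s for qid, s in variants.items() if qid != orig_id}
        pool = adversaries if adversaries else variants
        out[orig_id] = min(pool.values())
    return out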
Example #14
def validate_official(args, data_loader, model, global_stats, offsets, texts,
                      questions, answers):
    """Run one full official validation. Uses exact spans and same
    exact match/F1 score computation as in the SQuAD script.

    Extra arguments:
        offsets: The character start/end indices for the tokens in each context.
        texts: Map of qid --> (context id, raw text of the example's context).
        questions: Map of qid --> question text.
        answers: Map of qid --> list of accepted answers.
    """
    eval_time = utils.Timer()
    f1 = utils.AverageMeter()
    exact_match = utils.AverageMeter()

    # Run through examples
    examples = 0
    em_false = {}  # cid -> (context, [(qid, question, answer)...])
    predictions = {}  # qid -> prediction
    for ex in data_loader:
        ex_id, batch_size = ex[-1], ex[0].size(0)
        pred_s, pred_e, _ = model.predict(ex)

        for i in range(batch_size):
            s_offset = offsets[ex_id[i]][pred_s[i][0]][0]
            e_offset = offsets[ex_id[i]][pred_e[i][0]][1]
            prediction = texts[ex_id[i]][1][s_offset:e_offset]
            cid = texts[ex_id[i]][0]
            predictions[ex_id[i]] = prediction

            # Compute metrics
            ground_truths = answers[ex_id[i]]

            em_score = utils.metric_max_over_ground_truths(
                utils.exact_match_score, prediction, ground_truths)
            if em_score < 1:
                if cid not in em_false:
                    em_false[cid] = {
                        'text':
                        texts[ex_id[i]][1],
                        'qa': [{
                            'qid': ex_id[i],
                            'question': questions[ex_id[i]],
                            'answers': answers[ex_id[i]],
                            'prediction': prediction
                        }]
                    }
                else:
                    em_false[cid]['qa'].append({
                        'qid': ex_id[i],
                        'question': questions[ex_id[i]],
                        'answers': answers[ex_id[i]],
                        'prediction': prediction
                    })

            exact_match.update(em_score)
            f1.update(
                utils.metric_max_over_ground_truths(utils.f1_score, prediction,
                                                    ground_truths))

        examples += batch_size

    logger.info('dev valid official: Epoch = %d | EM = %.2f | ' %
                (global_stats['epoch'], exact_match.avg * 100) +
                'F1 = %.2f | examples = %d | valid time = %.2f (s)' %
                (f1.avg * 100, examples, eval_time.time()))

    return {
        'exact_match': exact_match.avg * 100,
        'f1': f1.avg * 100
    }, em_false, predictions
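The extra em_false and predictions return values lend themselves to error analysis; a caller might dump them to JSON, for instance (file names and surrounding variables are hypothetical):

import json

results, em_false, predictions = validate_official(
    args, dev_loader, model, global_stats, offsets, texts, questions, answers)
with open('em_false.json', 'w') as f:
    json.dump(em_false, f, indent=2)
with open('predictions.json', 'w') as f:
    json.dump(predictions, f, indent=2)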