Example #1
def main():
    log.info('[program starts.]')
    checkpoint = torch.load(args.model)
    opt = checkpoint['config']
    opt['task_name'] = 'CoQA'
    opt['cuda'] = args.cuda
    opt['seed'] = args.seed
    opt['mask_prev_ans'] = args.mask_prev_ans
    opt['no_yes_no'] = args.no_yes_no
    opt['remove_indicator'] = args.remove_indicator
    if opt.get('do_hierarchical_query') is None:
        opt['do_hierarchical_query'] = False
    state_dict = checkpoint['state_dict']
    log.info('[model loaded.]')

    test, test_embedding = load_dev_data(opt)
    model = QAModel(opt, state_dict=state_dict)
    CoQAEval = CoQAEvaluator(os.path.join(args.dev_dir, 'dev.json'))
    log.info('[Data loaded.]')

    model.setup_eval_embed(test_embedding)

    if args.cuda:
        model.cuda()

    batches = BatchGen_CoQA(test,
                            batch_size=args.batch_size,
                            evaluation=True,
                            gpu=args.cuda,
                            dialog_ctx=opt['explicit_dialog_ctx'],
                            precompute_elmo=16 // args.batch_size)
    sample_idx = random.sample(range(len(batches)), args.show)

    with open(os.path.join(args.dev_dir, 'dev.json'), "r",
              encoding="utf8") as f:
        dev_data = json.load(f)

    list_of_ids = []
    for article in dev_data['data']:
        id = article["id"]
        for Qs in article["questions"]:
            tid = Qs["turn_id"]
            list_of_ids.append((id, tid))

    gradients = []
    for i, batch in enumerate(batches):
        grad = model.calc_grad(batch)
        gradients.append(grad.cpu())
        if i > 10:
            break

    output_path = os.path.join(args.output_dir, 'grad.pkl')
    print('Saving to {}...'.format(output_path))
    with open(output_path, 'wb') as f:
        pickle.dump(gradients, f)
Example #2
def judge_yesno(gold_list):
    yesno_dict = Counter()
    for gold in gold_list:
        norm_text = CoQAEvaluator.normalize_answer(gold)
        if norm_text == 'yes':
            yesno_dict['y'] += 1
        elif norm_text == 'no':
            yesno_dict['n'] += 1
        elif norm_text == 'unknown':
            yesno_dict['u'] += 1
        else:
            yesno_dict['x'] += 1
    return yesno_dict.most_common(1)[0][0]
Example #3
import argparse
import json
from collections import Counter

import numpy as np

from CoQA_eval import CoQAEvaluator

parser = argparse.ArgumentParser()
parser.add_argument('--predict_file')
parser.add_argument('--dev_file')
args = parser.parse_args()

opt = vars(args)

with open(opt['predict_file'], 'r') as f:
    predictions = json.load(f)

with open(opt['dev_file'], 'r') as f:
    dev = json.load(f)

evaluator = CoQAEvaluator(dev)

span_start_dis_cnt = Counter()
span_end_dis_cnt = Counter()
wrong_type = 0

dis_diff_cnt = dict()
f1_length = dict()

matching_more_set = []

for data in predictions:
    spans = data['spans']
    truth_s = spans[0]
    truth_e = spans[1]
    predict_s = spans[2]
Example #4
def main():
    log.info('[program starts.]')
    checkpoint = torch.load(args.model)
    opt = checkpoint['config']
    opt['task_name'] = 'CoQA'
    opt['cuda'] = args.cuda
    opt['seed'] = args.seed
    if opt.get('do_hierarchical_query') is None:
        opt['do_hierarchical_query'] = False
    state_dict = checkpoint['state_dict']
    log.info('[model loaded.]')

    test, test_embedding = load_dev_data(opt)
    model = QAModel(opt, state_dict=state_dict)
    CoQAEval = CoQAEvaluator("CoQA/coqa-dev-v1.0.json")
    log.info('[Data loaded.]')

    model.setup_eval_embed(test_embedding)

    if args.cuda:
        model.cuda()

    batches = BatchGen_CoQA(test,
                            batch_size=args.batch_size,
                            evaluation=True,
                            gpu=args.cuda,
                            dialog_ctx=opt['explicit_dialog_ctx'],
                            precompute_elmo=16 // args.batch_size)
    sample_idx = random.sample(range(len(batches)), args.show)

    with open("CoQA/coqa-dev-v1.0.json", "r", encoding="utf8") as f:
        dev_data = json.load(f)

    list_of_ids = []
    for article in dev_data['data']:
        id = article["id"]
        for Qs in article["questions"]:
            tid = Qs["turn_id"]
            list_of_ids.append((id, tid))

    predictions = []
    for i, batch in enumerate(batches):
        prediction = model.predict(batch)
        predictions.extend(prediction)

        if i not in sample_idx:
            continue

        print("Story: ", batch[-4][0])
        for j in range(len(batch[-2][0])):
            print("Q: ", batch[-2][0][j])
            print("A: ", prediction[j])
            print("Gold A: ", batch[-1][0][j])
            print("---")
        print("")

    assert (len(list_of_ids) == len(predictions))
    official_predictions = []
    for ids, pred in zip(list_of_ids, predictions):
        official_predictions.append({
            "id": ids[0],
            "turn_id": ids[1],
            "answer": pred
        })
    with open("model_prediction.json", "w", encoding="utf8") as f:
        json.dump(official_predictions, f)

    f1 = CoQAEval.compute_turn_score_seq(predictions)
    log.warning("Test F1: {:.3f}".format(f1 * 100.0))
Example #5
    for article in data:
        if turn_id > len(article['questions']):
            continue
        gold = [article['answers'][turn_id - 1]['input_text']]
        gold += [
            article['additional_answers'][key][turn_id - 1]['input_text']
            for key in article['additional_answers']
        ]
        golds.append(gold)

        pds.append(preds[article['id']][turn_id])

        print('gold_answer: %s\nprediction: %s\n' % (gold, pds[-1]))

    evals[turn] = dict()
    evals[turn]['F1'] = CoQAEvaluator.compute_turn_score_seq(golds, pds)

    output = get_yesno_recall_precision(pds, golds)

    evals[turn].update(output)
    # print(json.dumps(evals[turn_id], indent=4))

    predicted_answers.extend(pds)
    gold_answers.extend(golds)

print(
    '=============================Single Turn Evaluation=========================================='
)

print(json.dumps(evals, indent=4))
Example #6
def main():
    log.info('[program starts.]')
    opt = vars(args)  # changing opt will change args
    train, train_embedding, opt = load_train_data(opt)
    dev, dev_embedding = load_dev_data(opt)
    opt['num_features'] += args.explicit_dialog_ctx * 3  # dialog_act + previous answer
    if not opt['use_elmo']:
        opt['elmo_batch_size'] = 0
    CoQAEval = CoQAEvaluator("CoQA/coqa-dev-v1.0.json")
    log.info('[Data loaded.]')

    if args.resume:
        log.info('[loading previous model...]')
        checkpoint = torch.load(args.resume)
        if args.resume_options:
            opt = checkpoint['config']
        state_dict = checkpoint['state_dict']
        model = QAModel(opt, train_embedding, state_dict)
        epoch_0 = checkpoint['epoch'] + 1
        for i in range(checkpoint['epoch']):
            random.shuffle(list(range(len(train))))  # synchronize random seed
        if args.reduce_lr:
            lr_decay(model.optimizer, lr_decay=args.reduce_lr)
    else:
        model = QAModel(opt, train_embedding)
        epoch_0 = 1

    if args.pretrain:
        pretrain_model = torch.load(args.pretrain)
        state_dict = pretrain_model['state_dict']['network']

        model.get_pretrain(state_dict)

    model.setup_eval_embed(dev_embedding)
    log.info("[dev] Total number of params: {}".format(model.total_param))

    if args.cuda:
        model.cuda()

    if args.resume:
        batches = BatchGen_CoQA(dev,
                                batch_size=args.batch_size,
                                evaluation=True,
                                gpu=args.cuda,
                                dialog_ctx=args.explicit_dialog_ctx)
        predictions = []
        for batch in batches:
            phrases, noans = model.predict(batch)
            predictions.extend(phrases)
        f1 = CoQAEval.compute_turn_score_seq(predictions)
        log.info("[dev F1: {:.3f}]".format(f1))
        best_val_score = f1
    else:
        best_val_score = 0.0

    for epoch in range(epoch_0, epoch_0 + args.epoches):
        log.warning('Epoch {}'.format(epoch))

        # train
        batches = BatchGen_CoQA(train,
                                batch_size=args.batch_size,
                                gpu=args.cuda,
                                dialog_ctx=args.explicit_dialog_ctx,
                                precompute_elmo=args.elmo_batch_size //
                                args.batch_size)
        start = datetime.now()
        for i, batch in enumerate(batches):
            model.update(batch)
            if i % args.log_per_updates == 0:
                log.info(
                    'updates[{0:6}] train loss[{1:.5f}] remaining[{2}]'.format(
                        model.updates, model.train_loss.avg,
                        str((datetime.now() - start) / (i + 1) *
                            (len(batches) - i - 1)).split('.')[0]))

        # eval
        if epoch % args.eval_per_epoch == 0:
            batches = BatchGen_CoQA(dev,
                                    batch_size=args.batch_size,
                                    evaluation=True,
                                    gpu=args.cuda,
                                    dialog_ctx=args.explicit_dialog_ctx,
                                    precompute_elmo=args.elmo_batch_size //
                                    args.batch_size)
            predictions = []
            for batch in batches:
                phrases = model.predict(batch)
                predictions.extend(phrases)
            f1 = CoQAEval.compute_turn_score_seq(predictions)

        # save
        if args.save_best_only:
            if f1 > best_val_score:
                best_val_score = f1
                model_file = os.path.join(model_dir, 'best_model.pt')
                model.save(model_file, epoch)
                log.info('[new best model saved.]')
        else:
            model_file = os.path.join(model_dir,
                                      'checkpoint_epoch_{}.pt'.format(epoch))
            model.save(model_file, epoch)
            if f1 > best_val_score:
                best_val_score = f1
                # model_file already includes model_dir
                copyfile(model_file,
                         os.path.join(model_dir, 'best_model.pt'))
                log.info('[new best model saved.]')

        log.warning("Epoch {} - dev F1: {:.3f} (Best F1: {:.3f})".format(
            epoch, f1 * 100.0, best_val_score * 100.0))
Example #7
def main():
    log.info(
        "\n\n\nScores on the dev set are meaningless.\nDev data has been added to train data for pretraining.\nDo not use this script to train CoQA.\n\n\n"
    )
    log.info('[program starts.]')
    log.info('seed: {}'.format(args.seed))
    log.info(str(vars(args)))
    opt = vars(args)  # changing opt will change args
    train, train_embedding, opt = load_train_data(opt)
    dev, dev_embedding = load_dev_data(opt)
    opt['num_features'] += args.explicit_dialog_ctx * 3  # dialog_act + previous answer
    if not opt['use_elmo']:
        opt['elmo_batch_size'] = 0
    CoQAEval = CoQAEvaluator("CoQA/dev.json")
    log.info('[Data loaded.]')

    if args.resume:
        log.info('[loading previous model...]')
        if args.cuda:
            checkpoint = torch.load(args.resume,
                                    map_location={'cpu': 'cuda:0'})
        else:
            checkpoint = torch.load(args.resume,
                                    map_location={'cuda:0': 'cpu'})
        if args.resume_options:
            opt = checkpoint['config']
        state_dict = checkpoint['state_dict']
        model = QAModel(opt, train_embedding, state_dict)
        epoch_0 = checkpoint['epoch'] + 1
        for i in range(checkpoint['epoch']):
            random.shuffle(list(range(len(train))))  # synchronize random seed
        if args.reduce_lr:
            lr_decay(model.optimizer, lr_decay=args.reduce_lr)
    else:
        model = QAModel(opt, train_embedding)
        epoch_0 = 1

    if args.pretrain:
        pretrain_model = torch.load(args.pretrain)
        state_dict = pretrain_model['state_dict']['network']

        model.get_pretrain(state_dict)

    model.setup_eval_embed(dev_embedding)
    log.info("[dev] Total number of params: {}".format(model.total_param))

    if args.cuda:
        model.cuda()

    if args.resume:
        batches = BatchGen_CoQA(dev,
                                batch_size=args.batch_size,
                                evaluation=True,
                                gpu=args.cuda,
                                dialog_ctx=args.explicit_dialog_ctx,
                                use_bert=args.use_bert)
        predictions = []
        for batch in batches:
            if batch is None:
                continue
            phrases, noans = model.predict(batch)
            predictions.extend(phrases)
        f1 = CoQAEval.compute_turn_score_seq(predictions)
        log.info("[dev F1: {:.3f}]".format(f1))
        best_val_score = f1
    else:
        best_val_score = 0.0

    aggregate_grad_steps = 1
    if opt['use_bert']:
        aggregate_grad_steps = opt['aggregate_grad_steps']

    for epoch in range(epoch_0, epoch_0 + args.epoches):
        log.warning('Epoch {}'.format(epoch))

        # train
        batches = BatchGen_CoQA(train,
                                batch_size=args.batch_size,
                                gpu=args.cuda,
                                dialog_ctx=args.explicit_dialog_ctx,
                                precompute_elmo=args.elmo_batch_size //
                                args.batch_size,
                                use_bert=args.use_bert)
        start = datetime.now()
        total_batches = len(batches)
        loss = 0
        model.optimizer.zero_grad()
        if opt['finetune_bert']:
            model.bertadam.zero_grad()

        for i, batch in enumerate(batches):
            if batch is None:
                continue
            model.update(batch)
            if (i + 1) % aggregate_grad_steps == 0 or total_batches == (i + 1):
                # Update the gradients
                model.take_step()
                loss = 0
            if i % args.log_per_updates == 0:
                log.info(
                    'updates[{0:6}] train loss[{1:.5f}] remaining[{2}]'.format(
                        model.updates, model.train_loss.avg,
                        str((datetime.now() - start) / (i + 1) *
                            (len(batches) - i - 1)).split('.')[0]))

        # eval
        if epoch % args.eval_per_epoch == 0:
            batches = BatchGen_CoQA(dev,
                                    batch_size=args.batch_size,
                                    evaluation=True,
                                    gpu=args.cuda,
                                    dialog_ctx=args.explicit_dialog_ctx,
                                    precompute_elmo=args.elmo_batch_size //
                                    args.batch_size,
                                    use_bert=args.use_bert)
            predictions = []
            for batch in batches:
                if batch is None:
                    continue
                phrases = model.predict(batch)
                predictions.extend(phrases)
            f1 = CoQAEval.compute_turn_score_seq(predictions)

        # save
        if args.save_best_only:
            if f1 > best_val_score:
                best_val_score = f1
                model_file = os.path.join(model_dir, 'best_model.pt')
                model.save(model_file, epoch)
                log.info('[new best model saved.]')
        else:
            model_file = os.path.join(model_dir,
                                      'checkpoint_epoch_{}.pt'.format(epoch))
            model.save(model_file, epoch)
            if f1 > best_val_score:
                best_val_score = f1
                # model_file already includes model_dir
                copyfile(model_file,
                         os.path.join(model_dir, 'best_model.pt'))
                log.info('[new best model saved.]')

        log.warning("Epoch {} - dev F1: {:.3f} (Best F1: {:.3f})".format(
            epoch, f1 * 100.0, best_val_score * 100.0))
Example #8
        e_tmp = list()
        for row in rows:
            if row[2] == 'Unknown':
                e_tmp.append('unknown')
            elif row[2] == 'Yes':
                e_tmp.append('yes')
            elif row[2] == 'No':
                e_tmp.append('no')
            else:
                e_tmp.append(row[2])
        ex.extend(e_tmp)

        q_text = [row[1] for row in rows]
        answer = [row[2] for row in rows]
        answer_start = [row[3] for row in rows]
        answer_end = [row[4] for row in rows]
        rationale = [row[5] for row in rows]
        rationale_start = [row[6] for row in rows]
        rationale_end = [row[7] for row in rows]
        answer_choice = [row[8] for row in rows]
        out.append({
            'context': context,
            'story_id': article['id'],
            'q_text': q_text,
            'answer': answer,
            'answer_start': answer_start,
            'answer_end': answer_end,
            'rationale': rationale,
            'rationale_start': rationale_start,
            'rationale_end': rationale_end,
            'answer_choice': answer_choice
        })

    F1 = CoQAEvaluator.compute_turn_score_seq(golds, ex)
    print('F1: %f' % F1)

with open(args.output_file, 'w') as f:
    json.dump(out, f, indent=2)
Example #9
import json
import argparse
from CoQA_eval import CoQAEvaluator
from collections import Counter

parser = argparse.ArgumentParser()
parser.add_argument('-p', '--predict_file')
parser.add_argument('-d', '--dev_file')

args = parser.parse_args()
opt = vars(args)

with open(opt['dev_file'], 'r') as f:
    dev_file = json.load(f)
evaluator = CoQAEvaluator(dev_file)


def judge_yesno(gold_list):
    yesno_dict = Counter()
    for gold in gold_list:
        norm_text = CoQAEvaluator.normalize_answer(gold)
        if norm_text == 'yes':
            yesno_dict['y'] += 1
        elif norm_text == 'no':
            yesno_dict['n'] += 1
        elif norm_text == 'unknown':
            yesno_dict['u'] += 1
        else:
            yesno_dict['x'] += 1
    return yesno_dict.most_common(1)[0][0]
Example #10
with open(args.data_file, 'r') as f:
    data_file = json.load(f)['data']
    data = []
    for article in data_file:
        story_id = article['id']
        answers = article['answers']
        additional_answers = article['additional_answers']
        for i, answer in enumerate(answers):
            tmp = [answer['input_text']]
            tmp.extend([
                additional_answers[key][i]['input_text']
                for key in additional_answers
            ])
            data.append(tmp)

with open(args.pred_file, 'r') as f:
    pred_file = json.load(f)

output = []
for i, (gold, pre) in enumerate(zip(data, pred_file)):
    f1 = CoQAEvaluator._compute_turn_score(gold, pre['answer'])['f1']
    if f1 <= 0.7:
        output.append({
            'id': pre['id'],
            'turn_id': pre['turn_id'],
            'gold': gold,
            'pred': pre['answer']
        })

print(json.dumps(output, indent=2))
Example #11
import argparse
import json
import logging

from CoQA_eval import CoQAEvaluator

logger = logging.getLogger()

parser = argparse.ArgumentParser(description='experiments on datasets')
parser.add_argument('--pred_file')
parser.add_argument('--data_file')
args = parser.parse_args()

with open(args.pred_file, 'r') as f:
    data = json.load(f)
    predictions = []
    for answer in data:
        predictions.append(data[answer])

with open(args.data_file, 'r') as f:
    data = json.load(f)['data']
    ground_truth = []
    for article in data:
        answers = [[answer['input_text']] for answer in article['answers']]
        add_answers = article['additional_answers']
        for key in add_answers:
            for i, additional_answer in enumerate(add_answers[key]):
                answers[i].append(additional_answer['input_text'])
        ground_truth.extend(answers)

F1 = CoQAEvaluator.compute_turn_score_seq(ground_truth, predictions)

print("F1: %f" % F1)
Example #12
parser = argparse.ArgumentParser()
parser.add_argument('--with_flow')
parser.add_argument('--no_flow')
parser.add_argument('--dev')
parser.add_argument('--output_dir')
args = parser.parse_args()

opt = vars(args)

with open(opt['with_flow'], 'r') as f:
    with_flow = json.load(f)

with open(opt['no_flow'], 'r') as f:
    no_flow = json.load(f)

with open(opt['dev'], 'r') as f:
    dev_set = json.load(f)

evaluator = CoQAEvaluator(dev_set)

dev_map = dict()

for data in dev_set['data']:
    questions = data['questions']
    answers = data['answers']
    for question, answer in zip(questions, answers):
        dev_map[(data['id'], question['turn_id'])] = (question['input_text'],
                                                      answer['input_text'])

flow_higher = []
no_flow_higher = []
equality = []

max_f1 = AverageMeter()