Example #1
def save_submission_file(predictions, path):
    out = list()
    for pred in predictions:
        out.append({
            "id": pred["id"],
            "predicted_label": pred["predicted_label"],
            "predicted_evidence": pred["predicted_evidence"]
        })
    save_jsonl(out, path)
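Every example on this page calls a save_jsonl helper that is not shown. A minimal sketch of what it might look like, assuming one JSON object per line; the skip_if_exists flag (passed explicitly in Example #8) and its default value are guesses, not taken from the source repository:

import json
import os

def save_jsonl(dictionaries, path, skip_if_exists=False):
    # Sketch only: write one JSON object per line; the real helper may differ.
    if skip_if_exists and os.path.exists(path):
        return
    with open(path, "w") as f:
        for d in dictionaries:
            f.write(json.dumps(d) + "\n")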
Example #2
def save_predictions(instances,
                     preds_list,
                     path,
                     scores_for_all_candidates=True):
    '''
    Save the prediction results.
    '''
    store = list()
    assert len(instances) == len(preds_list)

    for instance, preds in zip(instances, preds_list):
        cid = instance['id']
        claim = instance['claim']
        pred_sents = instance['evidence']

        # Take all predicted labels and their scores for each instance
        if scores_for_all_candidates:
            pred_labels_list = [[pred.text for pred in preds_instance]
                                for preds_instance in preds]
            scores = [[float(pred.score) for pred in preds_instance]
                      for preds_instance in preds]
        # Take only the top label and its score for each instance
        else:
            pred_labels = [pred[0].text for pred in preds]
            scores = [float(pred[0].score) for pred in preds]

        # Assemble the output record as a dictionary
        dic = {
            'id': cid,
            'scores': scores,
            'claim': claim,
            'predicted_sentences': pred_sents
        }

        if 'label' in instance:
            dic['label'] = instance['label']

        if scores_for_all_candidates:
            dic['predicted_labels'] = [[
                convert_label(pred_label, inverse=True)
                for pred_label in pred_labels
            ] for pred_labels in pred_labels_list]
        else:
            dic['predicted_labels'] = [
                convert_label(pred_label, inverse=True)
                for pred_label in pred_labels
            ]

        # scores from the IR (sentence retrieval) stage
        if 'scored_sentences' in instance:
            dic['ev_scores'] = instance['scored_sentences']

        store.append(dic)
    save_jsonl(store, path)
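These functions map model labels back to FEVER labels via convert_label(..., inverse=True), a helper that is not included here. A hypothetical sketch, assuming SNLI-style model labels; the actual mapping in the source repository may differ:

# Hypothetical FEVER <-> SNLI label mapping; not taken from the source repo.
FEVER_TO_SNLI = {
    "SUPPORTS": "entailment",
    "REFUTES": "contradiction",
    "NOT ENOUGH INFO": "neutral",
}
SNLI_TO_FEVER = {v: k for k, v in FEVER_TO_SNLI.items()}

def convert_label(label, inverse=False):
    # inverse=True converts a model label back to a FEVER label
    return SNLI_TO_FEVER[label] if inverse else FEVER_TO_SNLI[label]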
Example #3
def save_predictions_preprocessed(instances,
                                  all_settings,
                                  preds_list,
                                  path,
                                  n_sentences=5,
                                  scores_for_all_candidates=True):
    store = {}
    print('prepare dictionary...')
    for instance in instances:
        id = instance["id"]
        claim = instance["claim"]
        pred_sents = instance["evidence"]
        if scores_for_all_candidates:
            # one placeholder entry per retrieved sentence; build the inner
            # lists with a comprehension so they are independent objects
            scores = [[0.0] * 3 for _ in pred_sents]
            pred_labels_list = [["NOT ENOUGH INFO"] * 3 for _ in pred_sents]
        else:
            scores = [0.0] * len(pred_sents)
            pred_labels_list = ["NOT ENOUGH INFO" for _ in pred_sents]
        dic = {
            "id": id,
            "scores": scores,
            "claim": claim,
            "predicted_sentences": pred_sents,
            "predicted_labels": pred_labels_list
        }
        if "label" in instance:
            dic["label"] = instance["label"]

        if "scored_sentences" in instance:
            dic["ev_scores"] = instance["scored_sentences"]

        store[cid] = dic

    assert len(all_settings) == len(preds_list)
    print('index entries...')
    for (setting, _), pred in zip(all_settings, preds_list):
        # setting ids encode "<question_id>-<sentence_id>"
        q_id, s_id = (int(part) for part in setting.id.split('-'))
        if s_id >= n_sentences:
            continue
        store[q_id]["scores"][s_id] = [float(p.score) for p in pred]

        if scores_for_all_candidates:
            store[q_id]["predicted_labels"][s_id] = [
                convert_label(p.text, inverse=True) for p in pred
            ]
        else:
            store[q_id]["predicted_labels"][s_id] = convert_label(
                pred[0].label)

    store = list(store.values())
    save_jsonl(store, path)
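The indexing loop above assumes each QASetting id encodes the question id and sentence index as "<question_id>-<sentence_id>"; this scheme is inferred from the parsing code, for example:

# inferred from the parsing code above; the id scheme is an assumption
q_id, s_id = (int(part) for part in "229289-3".split('-'))
# q_id == 229289, s_id == 3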
Example #4
def save_predictions(instances,
                     preds_list,
                     path,
                     scores_for_all_candidates=True):
    store = list()
    assert len(instances) == len(preds_list)
    for instance, preds in zip(instances, preds_list):
        id = instance["id"]
        claim = instance["claim"]
        pred_sents = instance["evidence"]  # refer to read_ir_result
        if scores_for_all_candidates:
            pred_labels_list = [[pred.text for pred in preds_instance]
                                for preds_instance in preds]
            scores = [[float(pred.score) for pred in preds_instance]
                      for preds_instance in preds]
        else:
            pred_labels = [pred[0].text for pred in preds]
            scores = [float(pred[0].score) for pred in preds]

        dic = {
            "id": id,
            "scores": scores,
            "claim": claim,
            "predicted_sentences": pred_sents
        }
        if "label" in instance:
            dic["label"] = instance["label"]

        if scores_for_all_candidates:
            dic["predicted_labels"] = [[
                convert_label(pred_label, inverse=True)
                for pred_label in pred_labels
            ] for pred_labels in pred_labels_list]
        else:
            dic["predicted_labels"] = [
                convert_label(pred_label, inverse=True)
                for pred_label in pred_labels
            ]

        # scores from the IR (sentence retrieval) stage
        if "scored_sentences" in instance:
            dic["ev_scores"] = instance["scored_sentences"]

        store.append(dic)

    save_jsonl(store, path)
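For reference, a record written by save_predictions with scores_for_all_candidates=True would have roughly this shape. All numeric values below are invented for illustration; only the claim text is taken from the test data shown later on this page:

{"id": 15812,
 "scores": [[0.91, 0.05, 0.04], [0.40, 0.35, 0.25]],
 "claim": "Peggy Sue Got Married is a Egyptian film released in 1986.",
 "predicted_sentences": [["Peggy_Sue_Got_Married", 0], ["Peggy_Sue", 0]],
 "label": "REFUTES",
 "predicted_labels": [["SUPPORTS", "REFUTES", "NOT ENOUGH INFO"],
                      ["NOT ENOUGH INFO", "REFUTES", "SUPPORTS"]]}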
Example #5
from collections import Counter

import torch
from torch import nn, optim
from torch.utils.data import DataLoader

# Net, Predicted_Labels_Dataset, label2idx, simple_test, predict and
# save_jsonl are project-level helpers assumed to be in scope.
def run_aggregator(config):
    train_set = Predicted_Labels_Dataset(config['train_file'], config['n_sentences'], sampling=config['sampling'], use_ev_scores=config['evi_scores'])
    dev_set = Predicted_Labels_Dataset(config['dev_file'], config['n_sentences'], use_ev_scores=config['evi_scores'])

    train_dataloader = DataLoader(train_set, batch_size=64, shuffle=True, num_workers=0)
    dev_dataloader = DataLoader(dev_set, batch_size=64, shuffle=False, num_workers=0)

    model = Net(layers=[int(width) for width in config['layers']])

    # Inverse-frequency class weights: rarer labels get a larger weight
    class_weights = [1.0, 1.0, 1.0]
    label2freq = Counter(instance['label'] for instance in train_set.instances)
    total = sum(label2freq.values())
    for label in label2freq:
        class_weights[label2idx[label]] = total / label2freq[label]

    criterion = nn.CrossEntropyLoss(weight=torch.tensor(class_weights))
    optimizer = optim.Adam(model.parameters())

    dev_results = []

    for epoch in range(config['epochs']):
        running_loss = 0.0

        for i, (labels, inputs) in enumerate(train_dataloader):
            optimizer.zero_grad()

            outputs = model(inputs.float())
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            if i % 1000 == 999:
                print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 1000))
                running_loss = 0.0
        dev_results.append(simple_test(dev_dataloader, model))

    print('Finished Training.')
    performance = max(dev_results)
    print('dev set:', performance)

    train_result = predict(train_dataloader, model)
    dev_result = predict(dev_dataloader, model)
    save_jsonl(train_result, config['train_predicted_labels'])
    save_jsonl(dev_result, config['dev_predicted_labels'])
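run_aggregator expects a plain dict. The keys below are exactly those read in the function body; the values and file paths are hypothetical:

config = {
    "train_file": "data/train.predictions.jsonl",   # hypothetical path
    "dev_file": "data/dev.predictions.jsonl",       # hypothetical path
    "n_sentences": 5,
    "sampling": False,
    "evi_scores": True,
    "layers": [15, 10, 3],   # hidden layer widths for Net (guess)
    "epochs": 10,
    "train_predicted_labels": "out/train.labels.jsonl",
    "dev_predicted_labels": "out/dev.labels.jsonl",
}
run_aggregator(config)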
Example #6

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("src")
    parser.add_argument("tar")
    parser.add_argument("--use_ir_pred", action="store_true")
    parser.add_argument("--prependlinum", action="store_true")
    parser.add_argument("--prependtitle", action="store_true")
    parser.add_argument("--convert_test", action="store_true")
    args = parser.parse_args()
    print(args)

    if args.convert_test:
        test_in = '''[{"id": 15812, "verifiable": "VERIFIABLE", "label": "REFUTES", "claim": "Peggy Sue Got Married is a Egyptian film released in 1986.", "evidence": [[[31205, 37902, "Peggy_Sue_Got_Married", 0], [31205, 37902, "Francis_Ford_Coppola", 0]], [[31211, 37908, "Peggy_Sue_Got_Married", 0]]], "predicted_pages": ["Peggy_Sue_Got_Married_-LRB-musical-RRB-", "Peggy_Sue_Got_Married_-LRB-song-RRB-", "Peggy_Sue_Got_Married", "Peggy_Sue", "Peggy_Sue_-LRB-band-RRB-"], "predicted_sentences": [["Peggy_Sue_Got_Married", 0], ["Peggy_Sue_Got_Married_-LRB-musical-RRB-", 0], ["Peggy_Sue_Got_Married_-LRB-song-RRB-", 0], ["Peggy_Sue", 0], ["Peggy_Sue_Got_Married_-LRB-musical-RRB-", 2]]}, {"id": 229289, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Neal Schon was named in 1954.", "evidence": [[[273626, null, null, null]]], "predicted_pages": ["Neal_Schon", "Neal", "Named", "Was_-LRB-Not_Was-RRB-", "Was"], "predicted_sentences": [["Neal_Schon", 0], ["Neal_Schon", 6], ["Neal_Schon", 5], ["Neal_Schon", 1], ["Neal_Schon", 2]]}]'''

        print("input:\n", test_in)
        fever_format = json.loads(test_in)
        snli_format_instances = convert(fever_format, prependlinum=args.prependlinum, prependtitle=args.prependtitle, use_ir_prediction=args.use_ir_pred)
        print("\noutput:\n", json.dumps(snli_format_instances, indent=4))

    else:
        assert not os.path.exists(args.tar), "file {} already exists".format(
            args.tar)
        keyerr_count = 0

        instances = read_jsonl(args.src)
        snli_format_instances = convert(instances, prependlinum=args.prependlinum, prependtitle=args.prependtitle, use_ir_prediction=args.use_ir_pred)
        save_jsonl(snli_format_instances, args.tar)
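Given the argparse definition above, a typical invocation might look like this (the script name is hypothetical; the positional arguments and flags are the ones defined above):

python convert_to_snli.py data/dev.ir.jsonl data/dev.snli.jsonl --prependtitle --use_ir_pred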
Example #7
                # (excerpt: this fragment sits inside nested loops over
                # n_sentences settings, epochs, and mini-batches)
                # forward + backward + optimize
                outputs = net(inputs.float())
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                # print statistics
                running_loss += loss.item()
                if i % 1000 == 999:  # print every 1000 mini-batches
                    print('[%d, %5d] loss: %.3f' %
                          (epoch + 1, i + 1, running_loss / 1000))
                    running_loss = 0.0
            # monitor dev loss throughout training
            dev_results_throughout_training.append(simple_test(dev_dataloader))

        print('Finished Training')

        print("dev set:")
        performance = simple_test(dev_dataloader)
        #hyperparameter2performance[n_sentences] = performance
        hyperparameter2performance[n_sentences] = max(
            dev_results_throughout_training)

    for k, v in sorted(hyperparameter2performance.items()):
        print(k, v)

    dev_results = predict(dev_dataloader)
    test_results = predict(test_dataloader)
    save_jsonl(dev_results, args.predicted_labels)
    save_jsonl(test_results, args.test_predicted_labels)
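simple_test and predict are project helpers not shown on this page. A minimal sketch of a compatible simple_test, assuming it returns a single accuracy figure (Example #5 passes the model explicitly; this fragment presumably closes over a global net instead):

import torch

def simple_test(dataloader, model):
    # Sketch only: accuracy over a dataloader of (label, feature) batches,
    # consistent with the return value being compared via max() above.
    correct, total = 0, 0
    with torch.no_grad():
        for labels, inputs in dataloader:
            preds = model(inputs.float()).argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
    return correct / total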
Example #8
    # match word tokens or single punctuation characters
    pattern = re.compile(r'\w+|[^\w\s]')

    if args.convert_test:
        test_in = '''[{"id": 15812, "verifiable": "VERIFIABLE", "label": "REFUTES", "claim": "Peggy Sue Got Married is a Egyptian film released in 1986.", "evidence": [[[31205, 37902, "Peggy_Sue_Got_Married", 0], [31205, 37902, "Francis_Ford_Coppola", 0]], [[31211, 37908, "Peggy_Sue_Got_Married", 0]]], "predicted_pages": ["Peggy_Sue_Got_Married_-LRB-musical-RRB-", "Peggy_Sue_Got_Married_-LRB-song-RRB-", "Peggy_Sue_Got_Married", "Peggy_Sue", "Peggy_Sue_-LRB-band-RRB-"], "predicted_sentences": [["Peggy_Sue_Got_Married", 0], ["Peggy_Sue_Got_Married_-LRB-musical-RRB-", 0], ["Peggy_Sue_Got_Married_-LRB-song-RRB-", 0], ["Peggy_Sue", 0], ["Peggy_Sue_Got_Married_-LRB-musical-RRB-", 2]]}, {"id": 229289, "verifiable": "NOT VERIFIABLE", "label": "NOT ENOUGH INFO", "claim": "Neal Schon was named in 1954.", "evidence": [[[273626, null, null, null]]], "predicted_pages": ["Neal_Schon", "Neal", "Named", "Was_-LRB-Not_Was-RRB-", "Was"], "predicted_sentences": [["Neal_Schon", 0], ["Neal_Schon", 6], ["Neal_Schon", 5], ["Neal_Schon", 1], ["Neal_Schon", 2]]}, {"id": 15711, "verifiable": "VERIFIABLE", "label": "SUPPORTS", "claim": "Liverpool F.C. was valued at $1.55 billion at one point.", "evidence": [[[31112, 37788, "Liverpool_F.C.", 11]]], "predicted_pages": ["Liverpool_F.C.", "Liverpool_F.C._-LRB-Montevideo-RRB-", "Liverpool_F.C._-LRB-Superleague_Formula_team-RRB-", "Liverpool_F.C._-LRB-disambiguation-RRB-", "Liverpool"], "predicted_sentences": [["Liverpool_F.C.", 11], ["Liverpool", 0], ["Liverpool", 9], ["Liverpool", 10], ["Liverpool", 8]]}]'''

        print("input:\n", test_in)
        fever_format = json.loads(test_in)
        snli_format_instances = convert(
            fever_format,
            prependlinum=args.prependlinum,
            prependtitle=args.prependtitle,
            use_ir_prediction=args.use_ir_pred,
            n_sentences=args.n_sentences,
            depparse_batch_size=args.depparse_batch_size)
        print("\noutput:\n", json.dumps(snli_format_instances, indent=4))

    else:
        if os.path.exists(args.tar):
            print("WARNING: file {} alreadly exists".format(args.tar))
        keyerr_count = 0

        instances = read_jsonl(args.src)
        snli_format_instances = convert(
            instances,
            prependlinum=args.prependlinum,
            prependtitle=args.prependtitle,
            use_ir_prediction=args.use_ir_pred,
            n_sentences=args.n_sentences,
            depparse_batch_size=args.depparse_batch_size)
        save_jsonl(snli_format_instances, args.tar, skip_if_exists=False)
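The compiled pattern tokenizes text into word runs and single punctuation marks, for example:

import re

pattern = re.compile(r'\w+|[^\w\s]')
print(pattern.findall("Peggy Sue Got Married (1986)."))
# ['Peggy', 'Sue', 'Got', 'Married', '(', '1986', ')', '.']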
Example #9
    all_settings = list()
    instances = read_ir_result(args.in_file, prependlinum=args.prependlinum, concatev=args.concatev)
    for instance in instances:
        evidence_list = instance["evidence"]
        claim = instance["claim"]
        settings = [QASetting(question=claim, support=[evidence]) for evidence in evidence_list]
        all_settings.append(settings)

    # process the settings in batches of args.batch_size
    preds_list = list()
    for pointer in tqdm(range(0, len(all_settings), args.batch_size)):
        batch_settings = all_settings[pointer: pointer + args.batch_size]
        n_settings = [len(settings_) for settings_ in batch_settings]
        preds_list.extend(reshape(dam_reader(flatten(batch_settings)), n_settings))

    results = list()
    for instance, preds in zip(instances, preds_list):
        prediction, scores, prediction_list = aggregate_preds(preds, args.only_use_topev)
        results.append({
            "actual": instance["label"],
            "predicted": convert_label(prediction, inverse=True),
            "scores": scores,
            "prediction_list": [convert_label(pred, inverse=True)
                                for pred in prediction_list]
        })
    save_jsonl(results, abs_path(args.out_file))
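flatten and reshape are small utilities not shown here. A sketch consistent with their use above, where per-instance candidate lists are flattened for batched reading and the flat predictions are then regrouped by the per-instance counts:

def flatten(list_of_lists):
    # [[a, b], [c]] -> [a, b, c]
    return [item for sub in list_of_lists for item in sub]

def reshape(flat_list, counts):
    # regroup a flat list into consecutive chunks of the given lengths
    out, pos = [], 0
    for n in counts:
        out.append(flat_list[pos:pos + n])
        pos += n
    return out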