Example #1
import json
import os

from prettytable import PrettyTable
from fever.scorer import fever_score  # FEVER scorer; the exact import path may differ per project


def main(prediction_file, golden_file):
    path = os.getcwd()
    prediction_file = os.path.join(path, prediction_file)
    golden_file = os.path.join(path, golden_file)

    actual = []
    with open(golden_file, "r") as f:
        for line in f:
            actual.append(json.loads(line))

    predictions = []
    with open(prediction_file, "r") as f:
        for line in f:
            predictions.append(json.loads(line))

    assert len(predictions) == len(
        actual), "The two files provided do not have the same number of lines"

    score, acc, precision, recall, f1 = fever_score(predictions, actual)

    tab = PrettyTable()
    tab.field_names = [
        "FEVER Score", "Label Accuracy", "Evidence Precision",
        "Evidence Recall", "Evidence F1"
    ]
    tab.add_row((round(score, 4), round(acc, 4), round(precision, 4),
                 round(recall, 4), round(f1, 4)))
    print(tab)
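For orientation, the sketch below shows the instance format these examples share and the five values fever_score returns. The field layout mirrors the unit tests in Examples #5-#8; the import path and the page name are illustrative assumptions rather than something taken from a specific example.

# Minimal sketch of the FEVER scoring interface used throughout this page.
from fever.scorer import fever_score  # assumed import path (fever-scorer package)

gold = [{
    "label": "supports",
    # one gold evidence group; each item is [annotation_id, evidence_id, page, sentence_id]
    "evidence": [[[None, None, "Some_Page", 0]]],
}]
predicted = [{
    "predicted_label": "supports",
    # flat list of [page, sentence_id] pairs
    "predicted_evidence": [["Some_Page", 0]],
}]

strict_score, label_accuracy, precision, recall, f1 = fever_score(predicted, gold)
print(strict_score, label_accuracy, precision, recall, f1)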
Example #2
def main(args):

    # load the actual evaluation set - ground truth
    jlr = JSONLineReader()
    data_lines = jlr.read(args.actual_data_file)

    predictions_file_path = os.path.join(args.out_dir_rte,
                                         "refined_predictions.jsonl")
    submission_lines = jlr.read(predictions_file_path)
    sorted_lines = []
    for g1, line in enumerate(data_lines[:10]):  # debug
        instance = {}
        instance["id"] = line["id"]
        instance.update(submission_lines[g1])
        sorted_lines.append(instance)

    score, acc, precision, recall, f1 = fever_score(sorted_lines,
                                                    data_lines[:10])  # debug
    tab = PrettyTable()
    tab.field_names = [
        "FEVER Score",
        "Label Accuracy",
        "Evidence Precision",
        "Evidence Recall",
        "Evidence F1",
    ]
    tab.add_row((
        round(score, 4),
        round(acc, 4),
        round(precision, 4),
        round(recall, 4),
        round(f1, 4),
    ))
    print(tab)
Example #3
    def evaluate_sentence_selection(self, loader, labels):
        """ Evaluate model on validation data

        Parameters
        ----------
        loader : data.DataLoader
            Data loader class containing validation data

        labels : dict
            Index to output class

        Returns
        -------
        list of dict:
            Prediction instances with label indices mapped to class names

        """
        jsons = []
        for i, batch in tqdm(enumerate(loader), total=len(loader)):
            X, y, json_list = batch
            for instance in json_list:
                instance["predicted_label"] = labels[instance["predicted_label"]]
                instance["label"] = labels[instance["label"]]
                jsons.append(instance)

        # print(f"Evaluation loss: {running_loss}")
        # print("Classification report after epoch:")
        strict_score, label_accuracy, precision, recall, f1 = fever_score(jsons)
        print(f"Fever score: {strict_score}")
        print(f"Label accuracy: {label_accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1: {f1}")
        return jsons
Example #4
def main(evidence_file, golden_file):
    path = os.getcwd()
    evidence_file = os.path.join(path, evidence_file)
    golden_file = os.path.join(path, golden_file)

    actual = []
    with open(golden_file, "r") as f:
        for line in f:
            actual.append(json.loads(line))

    predictions = []
    with open(evidence_file, "r") as f:
        for i, line in enumerate(f):
            line = json.loads(line)
            # copy the gold label so the strict score reduces to an oracle (OFEVER) evidence score
            line["predicted_label"] = actual[i]["label"]
            line["predicted_evidence"] = list(
                map(lambda e: e[1][:2], line["predicted_sentences"]))
            predictions.append(line)

    assert len(predictions) == len(
        actual), "The two files provided do not have the same number of lines"

    score, _, precision, recall, f1 = fever_score(predictions, actual)

    tab = PrettyTable()
    tab.field_names = [
        "OFEVER Score", "Evidence Precision", "Evidence Recall", "Evidence F1"
    ]
    tab.add_row((round(score, 4), round(precision, 4), round(recall, 4),
                 round(f1, 4)))
    print(tab)
Example #5
    def test_global_recall_partial_two_sents(self):
        instance = {
            "label": "supports",
            "predicted_label": "supports",
            "evidence": [[[None, None, "page", 0], [None, None, "page", 1]]],
            "predicted_evidence": [["page", 0], ["page", 1]]
        }

        _, _, _, r, _ = fever_score([instance], max_evidence=2)
        self.assertEqual(r, 1)
Example #6
    def test_strict_partial_zero(self):
        instance = {
            "label": "supports",
            "predicted_label": "supports",
            "evidence": [[[None, None, "page", 0], [None, None, "page", 1]]],
            "predicted_evidence": [["page", 0], ["page", 1]]
        }

        strict, _, _, _, _ = fever_score([instance], max_evidence=1)
        self.assertEqual(strict, 0)
Example #7
    def test_global_precision_partial_one_sent(self):
        instance = {
            "label": "supports",
            "predicted_label": "supports",
            "evidence": [[[None, None, "page", 0], [None, None, "page", 2]]],
            "predicted_evidence": [["page", 0], ["page", 1]]
        }

        _, _, p, _, _ = fever_score([instance], max_evidence=1)
        self.assertEqual(p, 1)
Example #8
    def test_non_modification(self):
        instance = {
            "label": "supports",
            "predicted_label": "supports",
            "evidence": [[[None, None, "page", 0], [None, None, "page", 1]]],
            "predicted_evidence": [["page", 0], ["page", 1]]
        }
        instance_copy = instance.copy()
        _, _, _, _, _ = fever_score([instance], max_evidence=0)
        self.assertEqual(instance_copy, instance)
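The four tests above pin down how max_evidence interacts with a two-sentence gold evidence group. Purely as an illustration, the snippet below replays the same instance at two cut-offs and prints the resulting metrics; it reuses the exact fields from the tests and copies the instance defensively, since the tests only assert non-modification for max_evidence=0.

import copy

from fever.scorer import fever_score  # assumed import path

instance = {
    "label": "supports",
    "predicted_label": "supports",
    "evidence": [[[None, None, "page", 0], [None, None, "page", 1]]],
    "predicted_evidence": [["page", 0], ["page", 1]],
}

for k in (1, 2):
    strict, acc, p, r, f1 = fever_score([copy.deepcopy(instance)], max_evidence=k)
    print(f"max_evidence={k}: strict={strict}, precision={p}, recall={r}")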
Example #9
def train(model, optimizer, criterion, path, best_f1, epoch_num):

    if not model.training:
        model.train()

    epoch_loss = 0
  
    for i,batch in enumerate(train_iterator):
        model.train()
        
        optimizer.zero_grad()
        claims, sentences = batch.claim, batch.sentence
        predictions = model(claims, sentences)
        
        loss = criterion(predictions, batch.sent_label)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        
        if (i+1)%5000 == 0:
            print(f'BATCH:  {i+1}')

    # if (i+1)%10000 == 0:
    #   pdb.set_trace()
    print("--------------------------------")  
    print(f'BATCH: {i+1}')
    print("loss", epoch_loss/(i+1))
    file_data, fever_data = evaluate(model, dev_iterator, dev_path)
    fever_val, accuracy, precision, recall, f1score = fever_score(fever_data)
    print(f'Fever Score: {fever_val} | Accuracy: {accuracy}')
    print(f'Precision: {precision} | Recall: {recall} | F1Score: {f1score}')

    # if f1score > best_f1:
    # best_f1 = f1score
    print('Saving Model...')
    torch.save(model.state_dict(), model_path + f'{epoch_num}_{f1score:0.3f}.pt')
    print('Model Saved Successfully!')
    pd.DataFrame(fever_data).to_csv("/content/gdrive/My Drive/NLPWikiData/fever_data_output_E" + f'{epoch_num}.csv')
    pd.DataFrame(file_data).to_json("/content/gdrive/My Drive/NLPWikiData/sen_pred_train_E" + f'{epoch_num}.jsonl', orient='records', lines=True)
    print('jsonl file saved for RTE')
    print("--------------------------------")  
    print("--------------------------------")  
    

    return epoch_loss / len(train_iterator), best_f1
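A hedged sketch of how this train function might be driven from an outer loop; the model, optimizer, criterion, model_path and num_epochs names are assumptions about the surrounding script, not shown in the example itself.

# Hypothetical driver loop; each call evaluates on dev and saves a checkpoint.
best_f1 = 0.0
for epoch in range(num_epochs):
    epoch_loss, best_f1 = train(model, optimizer, criterion, model_path, best_f1, epoch)
    print(f"Epoch {epoch}: mean training loss {epoch_loss:.4f}")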
Example #10
def run_score(config):
    ids = []
    predicted_labels = []
    predicted_evidence = []
    actual = []

    with open(config['predicted_labels'], "r") as predictions_file:
        for line in predictions_file:
            predicted_labels.append(json.loads(line)["predicted"])

    with open(config['predicted_evidence'], "r") as predictions_file:
        for line in predictions_file:
            parsed = json.loads(line)
            predicted_evidence.append(parsed["predicted_sentences"][:5])
            ids.append(parsed["id"])

    predictions = []
    for id, ev, label in zip(ids, predicted_evidence, predicted_labels):
        predictions.append({
            "id": id,
            "predicted_evidence": ev,
            "predicted_label": label
        })

    save_submission_file(predictions, config['submission'])

    with open(config['actual_file'], "r") as actual_file:
        for line in actual_file:
            actual.append(json.loads(line))

    score, acc, precision, recall, f1 = fever_score(predictions, actual)
    save_simple_result(config['score_file'], score, acc, precision, recall)
    print_confusion_mat(predictions, actual)

    tab = PrettyTable()
    tab.field_names = [
        "FEVER Score", "Label Accuracy", "Evidence Precision",
        "Evidence Recall", "Evidence F1"
    ]
    tab.add_row((round(score, 4), round(acc, 4), round(precision, 4),
                 round(recall, 4), round(f1, 4)))

    print(tab)
Example #11
def check(file, threshold, max_evidence=5):
    instances = []

    with open(file, 'rb') as fin:
        for line in fin:
            instance = json.loads(line.decode(ENCODING))
            # keep only predicted sentences whose score clears the threshold and
            # strip each entry down to [page, sentence_id]
            evidences = []
            for evidence in instance['predicted_evidence']:
                if float(evidence[2]) < threshold:
                    continue
                evidences.append([evidence[0], evidence[1]])
            instance['predicted_evidence'] = evidences
            instances.append(instance)

    strict_score, label_accuracy, precision, recall, f1 = fever_score(
        instances, actual=None, max_evidence=max_evidence)
    print('Evidence precision:', precision)
    print('Evidence recall:', recall)
    print('Evidence f1:', f1)
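check() assumes every predicted_evidence entry carries a retrieval score in position 2. A hypothetical input record (page names, sentence ids and scores are invented purely to show the shape) would look like this:

# Hypothetical shape of one JSON line consumed by check(): entries whose score
# falls below the threshold are dropped and the trailing score is stripped
# before the instances are passed to fever_score.
example_line = {
    "label": "SUPPORTS",
    "predicted_label": "SUPPORTS",
    "evidence": [[[None, None, "Some_Page", 0]]],
    "predicted_evidence": [["Some_Page", 0, 0.93], ["Other_Page", 4, 0.12]],
}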
Example #12
def score_submission(predicted_labels_file, predicted_evidence_file,
                     actual_labels_file):
    predicted_labels = []
    predicted_evidence = []
    actual = []

    flatten = lambda l: [item for sublist in l for item in sublist]

    with open(predicted_labels_file, "r") as predictions_file:
        for line in predictions_file:
            predicted_labels.append(json.loads(line)["predicted_label"])

    with open(actual_labels_file, "r") as actual_file:
        for line in actual_file:
            actual.append(json.loads(line))

    with open(predicted_evidence_file, "r") as predictions_file:
        for line in predictions_file:
            line = json.loads(line)

            if "predicted_evidence" in line:
                predicted_evidence.append(line["predicted_evidence"])
            elif "predicted_sentences" in line:
                predicted_evidence.append(line["predicted_sentences"])
            else:
                predicted_evidence.append([[e[2], e[3]]
                                           for e in flatten(line["evidence"])])

    predictions = []

    for ev, label in zip(predicted_evidence, predicted_labels):
        predictions.append({
            "predicted_evidence": ev,
            "predicted_label": label
        })

    sdata = list(fever_score(predictions, actual))
    sdata.append(len(predictions))
    return tuple(sdata)
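A hedged usage sketch for score_submission; the file names below are placeholders. The function returns the five fever_score values followed by the number of scored claims.

# Hypothetical file paths.
strict, acc, precision, recall, f1, n = score_submission(
    "predicted_labels.jsonl", "predicted_evidence.jsonl", "gold.jsonl")
print(f"FEVER {strict:.4f} | label accuracy {acc:.4f} | evidence F1 {f1:.4f} over {n} claims")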
Example #13
            fever_dict['predicted_evidence'] = [[x[3], int(x[1])] for x in sorted_predicted_sentences][:5]
            fever_dict['evidence'] = org_dev_evidence_list
            # fever_dict = "No fever Data"

        file_data.append(temp_data)
        fever_data.append(fever_dict)
    # pd.DataFrame(file_data).to_json(sen_pred_test_path, orient='records', lines=True)
    print('prob_count', prob_count)
    return file_data, fever_data

dev_file_data, dev_fever_data = evaluate(model, dev_iterator, dev_path)
pd.DataFrame(dev_file_data).to_json(sen_pred_dev_path, orient='records', lines=True)
pd.DataFrame(dev_fever_data).to_csv(dev_fever_data_path)
print("Done!")

dev_fever_val, dev_accuracy, dev_precision, dev_recall, f1score = fever_score(dev_fever_data)
print(f'Fever Score: {dev_fever_val} | Accuracy: {dev_accuracy}')
print(f'Precision: {dev_precision} | Recall: {dev_recall} | F1Score: {f1score}')

x = pd.DataFrame(dev_fever_data)

x

i = x[x['predicted_label'] == 'NOT ENOUGH INFO'].index

x.loc[i, 'predicted_evidence'] = [[]]

y = x.to_dict('records')

y[0]
Example #14
# with open(args.predicted_evidence,"r") as predictions_file:
#     for line in predictions_file:
#         actual_labels2.append(json.loads(line)["label"])

# with open(args.actual, "r") as actual_file:
#     for line in actual_file:
#         actual_labels3.append(json.loads(line)["label"])
#     for actual1, actual2, actual3 in zip(actual_labels1, actual_labels2, actual_labels3):
#         assert actual1 == actual2 == actual3, "{}, {}, {}".format(actual1, actual2, actual3)

with open(args.actual, "r") as actual_file:
    for line in actual_file:
        actual.append(json.loads(line))

score, acc, precision, recall, f1 = fever_score(predictions, actual)
save_simple_result(args.score_file, score, acc, precision, recall)
print_confusion_mat(predictions, actual)
if args.err_analysis:
    save_wrong_instances(args.actual, args.predicted_labels,
                         args.predicted_evidence, args.err_analysis,
                         args.predicted_labels_supplement)

tab = PrettyTable()
tab.field_names = [
    "FEVER Score", "Label Accuracy", "Evidence Precision", "Evidence Recall",
    "Evidence F1"
]
tab.add_row((round(score, 4), round(acc, 4), round(precision, 4),
             round(recall, 4), round(f1, 4)))
Example #15
def score2(all_expts):
    predictions, actual = list(zip(*all_expts))
    sdata = list(fever_score(predictions, actual))
    sdata.append(len(predictions))
    return tuple(sdata)
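score2 expects a list of (prediction, gold) pairs and, like score_submission above, appends the number of predictions to the fever_score tuple; a hypothetical call (assuming predictions and actual lists as in the earlier examples) looks like this:

# Hypothetical inputs: each element pairs a prediction dict with its gold instance.
strict, acc, precision, recall, f1, n = score2(list(zip(predictions, actual)))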
Example #16
    def evaluate(self, loader, labels):
        """ Evaluate model on validation data

        Parameters
        ----------
        loader : data.DataLoader
            Data loader class containing validation data

        labels : dict
            Index to output class

        Returns
        -------
        ([float], [float]):
            Loss history and running loss history

        """
        self.model.eval()

        jsons = []

        loss_history = []
        running_loss = 0.0
        running_loss_history = []

        # don't compute gradient
        with torch.no_grad():
            for i, batch in tqdm(enumerate(loader), total=len(loader)):
                # Split up the batch
                X, y, json_list = batch

                # Forward
                logits = self.model(X.to(self.device))
                og_shape = logits.shape

                # Reshape to be (sent len * batch size, output dim)
                logits = logits.view(-1, logits.shape[-1])

                # Compute loss & add to history
                loss = self.loss_fn(logits, y.view(-1).to(self.device))

                # no backprop
                loss_history.append(loss.item())

                running_loss += (loss_history[-1] - running_loss) / (i + 1)
                running_loss_history.append(running_loss)

                # softmax to normalize the logits into class probabilities
                probs = torch.softmax(logits, dim=-1)

                # get the output class from the probs
                # also, reshape the prediction back to sentences
                predictions = torch.argmax(probs, dim=-1).reshape(og_shape[:-1])

                for pred, instance in zip(predictions.tolist(), json_list):
                    c = Counter(pred)
                    # the more frequent of classes 2 and 0, or 1 (NEI) on a tie
                    most_common = 2 if c[2] > c[0] else 0 if c[0] > c[2] else 1
                    instance["predicted_label"] = labels[most_common]
                    instance["label"] = labels[instance["label"]]
                    jsons.append(instance)

        # print(f"Evaluation loss: {running_loss}")
        # print("Classification report after epoch:")
        strict_score, label_accuracy, precision, recall, f1 = fever_score(jsons)
        print(f"Fever score: {strict_score}")
        print(f"Label accuracy: {label_accuracy}")
        print(f"Precision: {precision}")
        print(f"Recall: {recall}")
        print(f"F1: {f1}")
        return loss_history, running_loss_history
Example #17
                for e in evidence_set:
                    unique_evidence.add((None, None, e[2], 0))
                new_evidence.append([list(i) for i in unique_evidence])
            j['evidence'] = new_evidence
        actual.append(j)
        if 'attack' in j:
            attacks[j['attack']].append(idx)

#for pe, j in zip(predicted_evidence, actual):
#    print(pe, j['evidence'])

predictions = []
for ev, label in zip(predicted_evidence, predicted_labels):
    predictions.append({"predicted_evidence": ev, "predicted_label": label})

score, acc, precision, recall, f1 = fever_score(predictions, actual)

tab = PrettyTable()
tab.field_names = [
    "FEVER Score", "Label Accuracy", "Evidence Precision", "Evidence Recall",
    "Evidence F1"
]
tab.add_row((round(score, 4), round(acc, 4), round(precision, 4),
             round(recall, 4), round(f1, 4)))

print(tab)

actually = [i['label'] for i in actual]
predicted = [i['predicted_label'] for i in predictions]
print(classification_report(actually, predicted))
Example #18
    "∆ Evidence Recall", "∆ Evidence F1"
]

for i in range(1, args.num_subs + 1):

    predictions = []

    with open(args.prediction + " " + str(i), "r") as predictions_file:
        for line in predictions_file:
            predictions.append(json.loads(line))

    print("Team {0}".format(i))
    p1 = deepcopy(predictions)
    p2 = deepcopy(predictions)

    oscore, oacc, oprecision, orecall, of1 = fever_score(p1, actual_original)
    nscore, nacc, nprecision, nrecall, nf1 = fever_score(p2, actual_rescore)
    dscore, dacc, dprecision, drecall, df1 = (nscore - oscore, nacc - oacc,
                                              nprecision - oprecision,
                                              nrecall - orecall, nf1 - of1)

    tab = PrettyTable()
    tab.field_names = [
        "", "FEVER Score", "Label Accuracy", "Evidence Precision",
        "Evidence Recall", "Evidence F1"
    ]
    tab.add_row(("Original", round(oscore, 4), round(oacc, 4),
                 round(oprecision, 4), round(orecall, 4), round(of1, 4)))
    tab.add_row(("Rescore", round(nscore, 4), round(nacc, 4),
                 round(nprecision, 4), round(nrecall, 4), round(nf1, 4)))
    tab.add_row(("∆", round(dscore, 4), round(dacc, 4), round(dprecision, 4),
                 round(drecall, 4), round(df1, 4)))
    deltatab.add_row((round(dscore, 4), round(dacc, 4), round(dprecision, 4),
Example #19
        predicted_labels.append(json.loads(line)["predicted_label"])


with open(args.predicted_evidence,"r") as predictions_file:
    for line in predictions_file:
        line = json.loads(line)
        if "predicted_sentences" in line:
            predicted_evidence.append(line["predicted_sentences"])
        elif "predicted_evidence" in line:
            predicted_evidence.append(line["predicted_evidence"])
        elif "evidence" in line:
            all_evidence = []
            for evidence_group in line["evidence"]:
                all_evidence.extend(evidence_group)

            predicted_evidence.append(list(set([(evidence[2],evidence[3]) for evidence in all_evidence])))


predictions = []
for ev, label in zip(predicted_evidence, predicted_labels):
    predictions.append({"predicted_evidence": ev, "predicted_label": label})

fever, acc, pr, rec, f1 = fever_score(predictions, actual)
print("FEVER Score: {}\n".format(fever))
print("Label Accuracy: {}\n".format(acc))
print("Evidence Precision: {}\n".format(pr))
print("Evidence Recall: {}\n".format(rec))
print("Evidence F1: {}\n".format(f1))