Example #1
 def evalb(self, gold, pred):
     gold = evalb_parser.create_from_bracket_string(gold)
     pred = evalb_parser.create_from_bracket_string(pred)
     result = scorer.Scorer().score_trees(gold, pred)
     prec, recall = result.prec, result.recall
     fscore = 2 * (prec * recall) / (prec + recall)
     return prec, recall, fscore
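A note on the snippet above: the F-score line divides by prec + recall, which raises a ZeroDivisionError when both are zero (Example #3 below catches exactly that case). Below is a minimal self-contained sketch of the same pattern with that guard added; it uses only the PYEVALB calls already shown on this page, and the helper name safe_evalb is purely illustrative.

from PYEVALB import parser, scorer

def safe_evalb(gold_bracketed, pred_bracketed):
    # Parse the two bracketed strings into PYEVALB tree objects.
    gold_tree = parser.create_from_bracket_string(gold_bracketed)
    pred_tree = parser.create_from_bracket_string(pred_bracketed)
    result = scorer.Scorer().score_trees(gold_tree, pred_tree)
    prec, recall = result.prec, result.recall
    # Guard the degenerate case where no brackets match at all.
    fscore = 0.0 if prec + recall == 0 else 2 * prec * recall / (prec + recall)
    return prec, recall, fscore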
Example #2
    def get_results(self):
        results = []
        for i in range(len(self.true_parsed)):
            sentence_true = self.true_parsed[i]
            sentence_test = self.test_parsed[i]
            back, a, b, c = self.Cyk.cyk(sentence_test)
            sentence = sentence_test.split(' ')
            result_test = "".join(
                ['((SENT (',
                 get_parsed(sentence, back, a, b, c), ')))'])
            result_test = result_test[1:-1]
            print("Result sentence:")
            print(result_test)

            target = parser.create_from_bracket_string(sentence_true)
            predicted = parser.create_from_bracket_string(result_test)

            s = scorer.Scorer()
            result = s.score_trees(target, predicted)

            print('The recall is: ' + str(result.recall))
            print('The precision is: ' + str(result.prec))

            results.append(result_test)

        return (results)
Example #3
def compute_f1(f_gold, f_test):
    try:
        f1 = summary.summary(scorer.Scorer().score_corpus(
            f_gold, f_test)).bracker_fmeasure
        return f1
    except ZeroDivisionError:
        return 0.0
Example #4
def evalb(parse1, parse2):
    pyparse1 = pyparser.create_from_bracket_string(str(parse1))
    pyparse2 = pyparser.create_from_bracket_string(str(parse2))
    score = pyscorer.Scorer().score_trees(pyparse1, pyparse2)
#     cross_brackets = score.cross_brackets
    f1 = 2 * (score.recall * score.prec) / (score.recall + score.prec)
    # 'tag_accracy' is the attribute name as spelled in PYEVALB's result object.
    return f1 * score.tag_accracy
Example #5
def evaluate(sentence, reference):
    gold_tree = evalbparser.create_from_bracket_string(sentence[1:-1])
    test_tree = evalbparser.create_from_bracket_string(reference[1:-1])

    s = scorer.Scorer()
    result = s.score_trees(gold_tree, test_tree)

    return result.tag_accracy
Example #6
 def parse_instance(idx: int):
     scorer = evalscorer.Scorer()
     sent_ = ins_sents[idx]
     target_ = ins_trees[idx][2:-1]
     
     print("Parsing %s set sentence #%d/%d" % (dataset_choice, idx+1, num_sents))
     # Perform CYK prediction
     res_, pred_string = evaluate_predict(sent_, target_, cyk_module, scorer)
     print(res_, end='\n\n')
     return res_, pred_string
Example #7
def evalb(parse1, parse2):
    from PYEVALB import scorer as pyscorer
    from PYEVALB import parser as pyparser
    pyparse1 = pyparser.create_from_bracket_string(str(parse1))
    pyparse2 = pyparser.create_from_bracket_string(str(parse2))
    try:
        score = pyscorer.Scorer().score_trees(pyparse1, pyparse2)
    except Exception as e:
        print("Exception!")
        print(e)
        print(pyparse1)
        print(pyparse2)
        return 0

    f1 = 2 * (score.recall * score.prec) / (score.recall + score.prec)
    return f1 * score.tag_accracy
Example #8
    def check_action2treeseq(self):
        instance = next(iter(self.train_iterator))
        action_str_lst = self.id2original(self.ACTIONS, instance.actions)
        pos_tags = self.id2original(self.POS_TAGS, instance.pos_tags)
        converted_seq = utils.action2treestr(action_str_lst, instance.raws[0], pos_tags)

        measure = scorer.Scorer()
        golden_seq = instance.raw_seq[0]

        gold_tree = parser.create_from_bracket_string(golden_seq)
        converted_tree = parser.create_from_bracket_string(converted_seq)
        ret = measure.score_trees(gold_tree, converted_tree)
        match_num = ret.matched_brackets
        gold_num = ret.gold_brackets
        pred_num = ret.test_brackets
        assert match_num == gold_num
        assert match_num == pred_num
Example #9
    def get_eval_metrics(self, instance, pred_action_ids):
        assert type(pred_action_ids) == list
        pred_actions = self.id2original(self.ACTIONS, pred_action_ids)

        tokens = instance.raws[0]
        pos_tags = self.id2original(self.POS_TAGS, instance.pos_tags)

        measure = scorer.Scorer()
        golden_tree_seq = instance.raw_seq[0]
        gold_tree = parser.create_from_bracket_string(golden_tree_seq)
        try:
            pred_tree_seq = utils.action2treestr(pred_actions, tokens, pos_tags)
            pred_tree = parser.create_from_bracket_string(pred_tree_seq)
            ret = measure.score_trees(gold_tree, pred_tree)
        except:
            return -1
        else:
            match_num = ret.matched_brackets
            gold_num = ret.gold_brackets
            pred_num = ret.test_brackets
            return match_num, gold_num, pred_num
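Examples #8 and #9 return raw bracket counts (matched, gold, test) rather than per-sentence scores. The sketch below shows how such counts are typically pooled into corpus-level (micro-averaged) precision, recall and F1, using the same arithmetic as Example #27 at the end of this page (without the factor of 100); micro_f1 and its input format are illustrative, not PYEVALB API.

def micro_f1(counts):
    # counts: a list of (matched_brackets, gold_brackets, test_brackets) triples,
    # e.g. the tuples returned by get_eval_metrics in Example #9.
    counts = list(counts)
    matched = sum(m for m, _, _ in counts)
    gold = sum(g for _, g, _ in counts)
    test = sum(t for _, _, t in counts)
    recall = matched / gold if gold else 0.0
    prec = matched / test if test else 0.0
    f1 = 2 * prec * recall / (prec + recall) if prec + recall else 0.0
    return prec, recall, f1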
Example #10
def score(true_parse, proposed_parse):
    """
    Description
    -----------------
    Evaluate parses with the whole non terminals precision and recall, and on only POS tags
    
    Parameters
    -----------------
    true_parse, proposed_parse : Bracketed strings, the true and proposed parse trees.
    
    Returns
    -----------------
    parse_recall, parse_precision, pos_recall, pos_precision
    """

    true_parse = true_parse[2:-1]
    proposed_parse = proposed_parse[2:-1]

    gold_tree = parser.create_from_bracket_string(true_parse)
    test_tree = parser.create_from_bracket_string(proposed_parse)

    # Compute recall and precision for POS tags
    y_true = np.array(gold_tree.poss)
    y_pred = np.array(test_tree.poss)

    y_pred = (y_true == y_pred).astype(int)
    y_true = np.ones(len(y_true)).astype(int)

    (POS_precision, POS_recall, POS_f_score,
     beta) = precision_recall_fscore_support(y_true, y_pred, labels=[1])

    # Compute recall and precision for the whole parse
    thescorer = scorer.Scorer()
    result = thescorer.score_trees(gold_tree, test_tree)

    return result.recall * 100, result.prec * 100, POS_recall[
        0] * 100, POS_precision[0] * 100
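In Example #10 above, the sklearn detour labels every gold position as 1, so the reported POS recall is just the fraction of positions where the two tag sequences agree (and the POS precision is 1.0 whenever at least one tag matches). A minimal sketch of that quantity computed directly from the poss attribute the example already relies on; pos_accuracy is an illustrative helper, not part of PYEVALB.

def pos_accuracy(gold_tree, test_tree):
    # Both PYEVALB trees expose their POS tag sequences via .poss.
    gold_tags, test_tags = gold_tree.poss, test_tree.poss
    assert len(gold_tags) == len(test_tags)
    if not len(gold_tags):
        return 0.0
    return sum(g == t for g, t in zip(gold_tags, test_tags)) / len(gold_tags)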
Example #11
def evalb(evalb_dir,
          gold_trees,
          predicted_trees,
          ref_gold_path=None,
          is_train=True):
    """
    assert os.path.exists(evalb_dir)
    evalb_program_path = os.path.join(evalb_dir, "evalb")
    evalb_spmrl_program_path = os.path.join(evalb_dir, "evalb_spmrl")
    assert os.path.exists(evalb_program_path) or os.path.exists(evalb_spmrl_program_path)

    if os.path.exists(evalb_program_path):
        evalb_param_path = os.path.join(evalb_dir, "nk.prm")
    else:
        evalb_program_path = evalb_spmrl_program_path
        evalb_param_path = os.path.join(evalb_dir, "spmrl.prm")

    assert os.path.exists(evalb_program_path)
    assert os.path.exists(evalb_param_path)
    """

    temp_dir = tempfile.TemporaryDirectory(prefix="evalb-")
    print("Temporary dir", temp_dir)

    assert len(gold_trees) == len(predicted_trees)
    for gold_tree, predicted_tree in zip(gold_trees, predicted_trees):
        assert isinstance(gold_tree, trees.TreebankNode)
        assert isinstance(predicted_tree, trees.TreebankNode)
        gold_leaves = list(gold_tree.leaves())
        predicted_leaves = list(predicted_tree.leaves())
        assert len(gold_leaves) == len(predicted_leaves)
        assert all(gold_leaf.word == predicted_leaf.word
                   for gold_leaf, predicted_leaf in zip(
                       gold_leaves, predicted_leaves))

    gold_path = os.path.join(temp_dir.name, "gold.txt")
    predicted_path = os.path.join(temp_dir.name, "predicted.txt")
    output_path = os.path.join(temp_dir.name, "output.txt")

    with open(gold_path, "w") as outfile:
        if ref_gold_path is None:
            for tree in gold_trees:
                outfile.write("{}\n".format(tree.linearize()))
        else:
            # For the SPMRL dataset our data loader performs some modifications
            # (like stripping morphological features), so we compare to the
            # raw gold file to be certain that we haven't spoiled the evaluation
            # in some way.
            with open(ref_gold_path) as goldfile:
                outfile.write(goldfile.read())

    with open(predicted_path, "w") as outfile:
        for tree in predicted_trees:
            try:
                outfile.write("{}\n".format(tree.linearize()))
            except:
                import sys
                sys.setrecursionlimit(10**6)
                outfile.write("{}\n".format(tree.linearize()))
    """
    data_dir = '/afs/inf.ed.ac.uk/group/project/prosody/prosody_nlp/data/input_features'
    perm_gold_path = os.path.join(data_dir, "sent_based_gold.txt")
    perm_predicted_path = os.path.join(data_dir, "sent_based_predicted.txt")

    with open(perm_gold_path, "w") as outfile:
        for tree in gold_trees:
            outfile.write("{}\n".format(tree.linearize()))
    with open(perm_predicted_path, "w") as outfile:
        for tree in predicted_trees:
            outfile.write("{}\n".format(tree.linearize()))
            
    command = "{} -p {} {} {} > {}".format(
        evalb_program_path,
        evalb_param_path,
        gold_path,
        predicted_path,
        output_path,
    )
    print(f'evalb shell command: {command}')
    #subprocess.run(command, shell=True)
    """

    scr = scorer.Scorer()
    scr.evalb(gold_path, predicted_path, output_path)

    # debug:
    subprocess.run("wc {}".format(predicted_path), shell=True)
    subprocess.run("wc {}".format(output_path), shell=True)

    fscore = FScore(math.nan, math.nan, math.nan, math.nan)
    """
    with open(output_path) as infile:
        for line in infile:
            match = re.match(f"Number of sentence\s+=\s+(\d+\.\d+)", line)
            if match:
                print(f'Number of sentences evaled: {match.group(1)}')
            match = re.match(r"Bracketing Recall\s+=\s+(\d+\.\d+)", line)
            if match:
                print("MATCH")
                fscore.recall = float(match.group(1))
            match = re.match(r"Bracketing Precision\s+=\s+(\d+\.\d+)", line)
            if match:
                fscore.precision = float(match.group(1))
            match = re.match(r"Bracketing FMeasure\s+=\s+(\d+\.\d+)", line)
            if match:
                fscore.fscore = float(match.group(1))
            match = re.match(r"Complete match\s+=\s+(\d+\.\d+)", line)
            if match:
                fscore.complete_match = float(match.group(1))
            match = re.match(r"Tagging accuracy\s+=\s+(\d+\.\d+)", line)
            if match:
                fscore.tagging_accuracy = float(match.group(1))
                break
    """
    with open(output_path) as infile:
        for line in infile:
            match = re.match(r"Number of sentence:\s+(\d+\.\d+)", line)
            if match:
                print(f'Number of sentences evaled: {match.group(1)}')
            match = re.match(r"Bracketing Recall:\s+(\d+\.\d+)", line)
            if match:
                print("MATCH")
                fscore.recall = float(match.group(1))
            match = re.match(r"Bracketing Precision:\s+(\d+\.\d+)", line)
            if match:
                fscore.precision = float(match.group(1))
            match = re.match(r"Bracketing FMeasure:\s+(\d+\.\d+)", line)
            if match:
                fscore.fscore = float(match.group(1))
            match = re.match(r"Complete match:\s+(\d+\.\d+)", line)
            if match:
                fscore.complete_match = float(match.group(1))
            match = re.match(r"Tagging accuracy:\s+(\d+\.\d+)", line)
            if match:
                fscore.tagging_accuracy = float(match.group(1))
                break
    #"""

    success = (not math.isnan(fscore.fscore) or fscore.recall == 0.0
               or fscore.precision == 0.0)

    if success:
        #temp_dir.cleanup()
        print("Successfully parsed in:", predicted_path)
    else:
        print("Error reading EVALB results.")
        print("Gold path: {}".format(gold_path))
        print("Predicted path: {}".format(predicted_path))
        print("Output path: {}".format(output_path))
        import pdb
        pdb.set_trace()
    return fscore
Example #12
    def eval(self, insts):

        gold_path = 'tmp/gold.txt'
        pred_path = 'tmp/pred.txt'
        result_path = 'tmp/result.txt'

        if not os.path.exists('tmp'):
            os.makedirs('tmp')

        fgold = open(gold_path, 'w', encoding='utf-8')
        fpred = open(pred_path, 'w', encoding='utf-8')
        golds = []
        preds = []
        for inst in insts:
            gold = inst.get_output()
            pred = inst.get_prediction()

            golds.append(gold)
            preds.append(pred)

            fgold.write(gold.linearize() + '\n')
            fpred.write(pred.linearize() + '\n')

        fgold.close()
        fpred.close()

        # NOTE: this early return delegates to evalb(); the PYEVALB-based scoring below is unreachable.
        return self.evalb('./EVALB', golds, preds)

        evalb = scorer.Scorer()

        fscore = FScore(0.0, 0.0, 0.0)

        try:
            evalb.evalb(gold_path, pred_path, result_path)
            with open(result_path) as infile:
                for line in infile:
                    match = re.match(r"Bracketing Recall:\s+(\d+\.\d+)", line)
                    if match:
                        fscore.recall = float(match.group(1)) / 100
                    match = re.match(r"Bracketing Precision:\s+(\d+\.\d+)",
                                     line)
                    if match:
                        fscore.precision = float(match.group(1)) / 100
                    match = re.match(r"Bracketing FMeasure:\s+(\d+\.\d+)",
                                     line)
                    if match:
                        fscore.fscore = float(match.group(1)) / 100
                        break

        except:
            pass

        # success = (
        #         not math.isnan(fscore.fscore) or
        #         fscore.recall == 0.0 or
        #         fscore.precision == 0.0)
        #
        # if success:
        #     pass
        #     # temp_dir.cleanup()
        # else:
        #     print("Error reading EVALB results.")
        #     print("Gold path: {}".format(gold_path))
        #     print("Predicted path: {}".format(pred_path))
        #     print("Output path: {}".format(result_path))

        return fscore
Example #13
with open('evaluation_data.parser_output_ter.txt') as f:
    for sent in f:
        chom_gold.append(sent.rstrip())

#%%

# Removing the 'unparsed' sentences
gold = []
test = []

f = open('test_ter.txt', 'w+')
g = open('gold_ter.txt', 'w+')
for i in range(len(chom_gold)):
    print(chom_gold[i])
    if chom_gold[i] != chom_gold[3]:
        gold.append(chom_gold[i].rstrip())

        g.write(chom_gold[i] + '\n')

        test.append(chom_test[i].rstrip())

        f.write(chom_test[i] + '\n')

f.close()
g.close()

# Create scorer
s = scorer.Scorer()

# Perform the comparison
s.evalb('gold_ter.txt', 'test_ter.txt', 'results_ter.txt')
Example #14
def evaluate_parser_multiprocess(pcfg,
                                 test_trees,
                                 filepath="parser_output.txt",
                                 write=True):
    """
    Method to evaluate the parser using multiprocessing
    :param pcfg: parser pcfg to evaluate
    """

    y_true = []
    y_pred = []

    y_true_non_chomsky = []
    y_pred_non_chomsky = []

    y_true_parsable = []
    y_pred_parsable = []

    y_true_parsable_non_chomsky = []
    y_pred_parsable_non_chomsky = []

    recall_list = []
    precision_list = []
    lines = []

    test_trees = test_trees[:5]
    if write:
        with open(filepath, 'w') as file:
            file.write("")
        with open("non-parsable", 'w') as file:
            file.write("")

    list_sentence = []
    for c, tree in enumerate(test_trees):
        list_sentence.append(list(tree.flatten()))

    # Parsing multi_process :
    n_job = multiprocessing.cpu_count()
    start = time.time()
    with Pool(n_job) as p:
        result_trees = p.map(pcfg.CYK, list_sentence)
    print(f"Parsing time is {time.time()-start}")

    # Analysis of the result
    nb_non_parsable = 0
    list_non_parsable = []
    for (c, tree) in enumerate(test_trees):
        test_sentence = list(tree.flatten())
        parsed_tree = result_trees[c]
        test_sentence_str = ' '.join(str(tree).split())

        # If the sentence is parsable
        if parsed_tree:

            y_true.extend(get_leaves(tree))
            y_pred.extend(get_leaves(parsed_tree))
            y_true_parsable.extend(get_leaves(tree))
            y_pred_parsable.extend(get_leaves(parsed_tree))

            tree.un_chomsky_normal_form(unaryChar="&")
            parsed_tree.un_chomsky_normal_form(unaryChar="&")
            y_true_non_chomsky.extend(get_leaves(tree))
            y_pred_non_chomsky.extend(get_leaves(parsed_tree))
            y_true_parsable_non_chomsky.extend(get_leaves(tree))
            y_pred_parsable_non_chomsky.extend(get_leaves(parsed_tree))
            lines.append('( ' + ' '.join(str(parsed_tree).split()) + ')')
            parsed_tree_str = ' '.join(str(parsed_tree).split())
            test_sentence_str = ' '.join(str(tree[0]).split())

            target_tree = parser.create_from_bracket_string(test_sentence_str)
            predicted_tree = parser.create_from_bracket_string(parsed_tree_str)
            s = scorer.Scorer()
            try:
                result = s.score_trees(target_tree, predicted_tree)
                recall_list.append(result.recall)
                precision_list.append(result.prec)
            except:
                print("No Recall or precision")

            if write:
                with open(filepath, 'a') as file:
                    file.write(lines[-1] + "\n")

        # if the sentence is not parsable
        else:
            aux = get_leaves(tree)
            y_true.extend(aux)
            y_pred.extend(["None" for k in range(len(aux))])

            tree.un_chomsky_normal_form(unaryChar="&")
            y_true_non_chomsky.extend(get_leaves(tree))
            y_pred_non_chomsky.extend(
                ["None" for k in range(len(get_leaves(tree)))])

            nb_non_parsable += 1
            list_non_parsable.append(test_sentence)

            if write:
                with open(filepath, 'a') as file:
                    file.write("\n")
                with open("non-parsable", 'a') as file:
                    file.write('( ' + ' '.join(str(tree).split()) + ')' + "\n")

    print('Nb Non parsable {}'.format(nb_non_parsable))
    print('Accuracy total chomsky on dev set {}:'.format(
        accuracy(y_pred, y_true)))
    print("Accuracy total non chomsky on dev set {}:".format(
        accuracy(y_true_non_chomsky, y_pred_non_chomsky)))
    print('Accuracy parsable chomsky on dev set {}:'.format(
        accuracy(y_pred_parsable, y_true_parsable)))
    print("Accuracy parsable non chomsky on dev set {}:".format(
        accuracy(y_true_parsable_non_chomsky, y_pred_parsable_non_chomsky)))
    print("Mean recall {} and mean precision {}".format(
        np.mean(recall_list), np.mean(precision_list)))
Example #15
from PYEVALB import scorer as PYEVALB_scorer

# evaluation on the whole corpus
PYEVALB_scorer.Scorer().evalb(
    'results/real_parsings_test_for_eval.txt',
    'results/my_parsings_test_for_eval.txt',
    'results/results_pyevalb.txt',
)
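Example #15 above, like Examples #19 and #25 further down, uses the corpus-level entry point Scorer().evalb(gold_file, test_file, result_file), which, as the other examples on this page show, takes files with one bracketed parse per line and writes a plain-text report. Below is a minimal sketch of preparing such a pair of files from two aligned lists of bracketed strings; the function and file names are placeholders.

from PYEVALB import scorer

def score_corpus_files(gold_parses, pred_parses,
                       gold_path='gold.txt', pred_path='pred.txt',
                       result_path='result.txt'):
    # gold_parses and pred_parses are assumed to be aligned lists of bracketed strings.
    with open(gold_path, 'w') as g, open(pred_path, 'w') as p:
        for gold, pred in zip(gold_parses, pred_parses):
            g.write(gold + '\n')
            p.write(pred + '\n')
    # The report (Bracketing Recall/Precision/FMeasure, tagging accuracy, ...) is written to result_path.
    scorer.Scorer().evalb(gold_path, pred_path, result_path)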
Example #16
def evalb(evalb_dir, gold_trees, predicted_trees):
    assert os.path.exists(evalb_dir)
    evalb_program_path = os.path.join(evalb_dir, "evalb")
    evalb_param_path = os.path.join(evalb_dir, "COLLINS.prm")
    assert os.path.exists(evalb_program_path)
    assert os.path.exists(evalb_param_path)

    assert len(gold_trees) == len(predicted_trees)
    for gold_tree, predicted_tree in zip(gold_trees, predicted_trees):
        assert isinstance(gold_tree, trees.TreebankNode)
        assert isinstance(predicted_tree, trees.TreebankNode)
        gold_leaves = list(gold_tree.leaves())
        predicted_leaves = list(predicted_tree.leaves())
        assert len(gold_leaves) == len(predicted_leaves)
        assert all(gold_leaf.word == predicted_leaf.word
                   for gold_leaf, predicted_leaf in zip(
                       gold_leaves, predicted_leaves))

    temp_dir = tempfile.TemporaryDirectory(prefix="evalb-")
    # tempfile.TemporaryDirectory(suffix=None, prefix=None, dir=None)
    # This function securely creates a temporary directory. The resulting object
    # can be used as a context manager.
    output_dir = "outputs"
    gold_path = os.path.join(temp_dir.name, "gold.txt")
    predicted_path = os.path.join(temp_dir.name, "predicted.txt")
    output_path = os.path.join(output_dir, "output_test.txt")

    num = 0
    with open(gold_path, "w+") as outfile:
        for tree in gold_trees:
            if num < 5:
                print("Gold tree #{}: {}".format(num + 1, tree.linearize()))
                num += 1
            outfile.write("{}\n".format(tree.linearize()))

    num = 0
    with open(predicted_path, "w+") as outfile:
        for tree in predicted_trees:
            if num < 5:
                print("Predicted tree #{}: {}".format(num + 1,
                                                      tree.linearize()))
                num += 1
            outfile.write("{}\n".format(tree.linearize()))

    s = scorer.Scorer()
    s.evalb(gold_path, predicted_path, output_path)
    # command = "{} -p {} {} {} > {}".format(
    #     evalb_program_path,
    #     evalb_param_path,
    #     gold_path,
    #     predicted_path,
    #     output_path,
    # )
    # subprocess.run(command, shell=True)

    fscore = FScore(math.nan, math.nan, math.nan)

    with open(output_path) as infile:
        for line in infile:
            match = re.match(r"Bracketing Recall:\s*(\d+\.\d+)", line)
            if match:
                fscore.recall = float(match.group(1))
            match = re.match(r"Bracketing Precision:\s*(\d+\.\d+)", line)
            if match:
                fscore.precision = float(match.group(1))
            match = re.match(r"Bracketing FMeasure:\s*(\d+\.\d+)", line)
            if match:
                fscore.fscore = float(match.group(1))
                break

    success = (not math.isnan(fscore.fscore) or fscore.recall == 0.0
               or fscore.precision == 0.0)

    if success:
        with open("outputs/fscore_test.txt", 'a', encoding='utf-8') as f:
            f.write(fscore.print_score() + "\n")
        temp_dir.cleanup()
    else:
        print("Error reading EVALB results.")
        print("Gold path: {}".format(gold_path))
        print("Predicted path: {}".format(predicted_path))
        print("Output path: {}".format(output_path))

    return fscore
Example #17
def get_diff_prods_no_span():
    print('Getting diff between', test_seqs_file, 'and', pred_seqs_file)
    diff = set()
    id = 0
    from collections import Counter
    diff_prods_counter = Counter()
    diff_heights = defaultdict(list)

    for test_line, pred_line in zip(test_seqs, pred_seqs):
        # print ('true =', true_line)
        # print ('pred =', pred_line)
        measure = scorer.Scorer()
        gold_tree = parser.create_from_bracket_string(test_line)
        pred_tree = parser.create_from_bracket_string(pred_line)

        # print (id)
        # print(test_line, pred_line)
        # print (gold_tree.sentence)
        # print (pred_tree.sentence)
        # id += 1
        ret = measure.score_trees(gold_tree, pred_tree)
        match_num = ret.matched_brackets
        gold_num = ret.gold_brackets
        pred_num = ret.test_brackets

        if match_num < gold_num or match_num < pred_num:
            pred_grammar, pred_heights = gold_tree.productions(skip_XX=False,
                                                               skip_span=False)
            true_grammar, _ = pred_tree.productions(skip_XX=False,
                                                    skip_span=False)

            # print(pred_grammar)
            # print(true_grammar)
            # diff_prods = set(pred_grammar) - set(true_grammar)
            diff_prods = []
            diff_prods_heights = []
            for id, prod in enumerate(pred_grammar):
                if prod not in true_grammar:
                    diff_prods.append(prod)
                    diff_prods_heights.append(pred_heights[id])

            for id, prod in enumerate(diff_prods):
                diff_heights[no_span_prod(prod)].append(diff_prods_heights[id])
                # if pred_heights[id] == 0:
                # print (test_line)
                # print (pred_line)
                # print ('Height 0 =', prod, no_span_prod(prod))
                # sys.exit(0)

            diff_no_span_prods = set(
                [no_span_prod(prod) for prod in diff_prods])
            diff.update(diff_no_span_prods)
            diff_prods_counter.update(diff_no_span_prods)

            # pred_tree_nltk.pretty_print()
            # true_tree_nltk.pretty_print()

    # diff_rule_count = dict([e for e in pred_rule_count.items() if e[0] in diff])
    # print ('Wrong rules')
    # print (diff_rule_count)
    # print ('Len wrong rules = ', len(diff))
    # assert len(diff) == len(diff_rule_count)

    print(diff_prods_counter.most_common(10))
    print('There are', len(diff), 'distinct productions')
    print('Done')
    print('')
    return diff, diff_prods_counter, diff_heights
Example #18
available in the PYEVALB python package

@author: Víctor Manuel Tenorio
"""

import nltk
import stanza
from stanza.server import CoreNLPClient
from sacremoses import MosesDetokenizer
from PYEVALB import scorer, parser

import numpy as np
import re

detok = MosesDetokenizer()
evalb_scorer = scorer.Scorer()

recalls_corenlp = []
precs_corenlp = []
accs_corenlp = []
parsed_sents = nltk.corpus.treebank.parsed_sents()
skipped_sents = 0
sents_analyzed = 0
with CoreNLPClient(annotators=['tokenize', 'ssplit', 'pos', 'parse'],
                   output_format="json",
                   timeout=3000001,
                   endpoint='http://localhost:9001') as client:
    for i, s in enumerate(nltk.corpus.treebank.sents()):
        sent = detok.detokenize(s)
        corenlp_model = client.annotate(sent)
        gold_sent = parser.create_from_bracket_string(
Example #19
def save_scores(in_file, truth_file, out_file="out.txt"):
    scorer.Scorer().evalb(in_file,
                          truth_file,
                          out_file)
Example #20
    if turn in turn2subturn:
        turn2subturn[turn].append(subturn)
    else:
        turn2subturn[turn] = [subturn]

#for turn in turn2subturn:
#    turn2subturn[turn] = sorted(turn2subturn[turn])
    
temp_dir = tempfile.TemporaryDirectory(prefix="evalb-")
predicted_path = os.path.join(temp_dir.name,'pred.txt')
output_path = os.path.join(temp_dir.name,'out.txt')

with open(predicted_path,'w') as f:
    for turn in turn_ids:
        subturns = sort_subturns(turn2subturn[turn])
        trees = [subturn2predtree[subturn] for subturn in subturns]
        out_tree = '(TURN '+' '.join(trees)+')'
        outstr = tree2str(out_tree)
        goldstr = tree2str(turn2gold[turn])
        try:
            assert outstr==goldstr
        except:
            import pdb;pdb.set_trace()
        
        f.write(out_tree)
        f.write('\n')

scr = scorer.Scorer()
scr.evalb(gold_tree_file,predicted_path,output_path)
import pdb;pdb.set_trace()
Example #21
    test_output = f.read().splitlines()

# Compute metrics
precisions = []
recalls = []
lengths = []
failures = 0
bugs = 0
for gold, test, sent in zip(test_output, parsed_output, test_input):
    if test == 'No parsing found':
        failures += 1
    else:
        try:
            gold_tree = parser.create_from_bracket_string(gold[2:-1])
            test_tree = parser.create_from_bracket_string(test[2:-1])
            result = scorer.Scorer().score_trees(gold_tree, test_tree)
            
            len_sentence = len(sent.split())
            lengths.append(len_sentence)
            print('')
            print('Sentence length: ' + str(len_sentence))
            print('Recall =' + str(result.recall))
            print('Precision =' + str(result.prec))
            recalls.append(result.recall)
            precisions.append(result.prec)
        except:
            bugs +=1

print('')
print('Parsing failures for ' + str(failures + bugs) + ' sentences')
Example #22
        parsed_sentence = cyk_parser.parse(sentence)
        if parsed_sentence is not None:
            test_sentences_bis.append(sentence)
            f.write('%s\n' % parsed_sentence)

print('Done')

# Get accuracy
# Get sentences parsed by our parser
with open('data/evaluation_data.parser.txt', 'r') as f:
    file = f.read()
    parsed_sentences = file.split('\n')

# Remove first two and last brackets to use parser from PYEVALB
initial_parsed_sentences = []
parsed_sentences_final = []

for sent in test_sentences_bis:
    initial_parsed_sentences.append(sent[2:-1])

for sent in parsed_sentences:
    parsed_sentences_final.append(sent[2:-1])

# Put in tree form
initial_tree = parser.create_from_bracket_string(initial_parsed_sentences)
my_tree = parser.create_from_bracket_string(parsed_sentences_final)

# Get accuracy
result = scorer.Scorer().score_trees(initial_tree, my_tree)
print('Accuracy on Evaluation set: ' + str(result.tag_accracy))
Example #23
def evaluation():
    #####################################################################
    #                              Load data                            #
    #####################################################################
    with codecs.open("output.txt", 'r', 'UTF-8') as file:
        result = file.read()
    file.close()
    result = result.split()
    result_tree = []
    i=-1
    for r in result:
        if 'None' in r :
            result_tree.append('(SENT (NC <UNKNOWN>))')
            i += 1
        elif 'SENT' in r :
            result_tree.append(r)
            i += 1
        else :
            result_tree[i] = result_tree[i] + ' ' + r

    with codecs.open("sequoia_test_tree.txt", 'r', 'UTF-8') as file:
        truth = file.read()
    file.close()
    truth = truth.split()
    truth_tree = []
    i=-1
    for t in truth:
        if 'SENT' in t:
            truth_tree.append(t)
            i += 1
        else :
            truth_tree[i] = truth_tree[i] + ' ' + t
    
    assert(len(result_tree)==len(truth_tree))
    N = len(result_tree)
    
    #####################################################################
    #                            Evaluation                             #
    #####################################################################
    recall = []
    precision = []
    Fscore=[]
    tag_accuracy=[]
    
    S = scorer.Scorer()
    fileOut = codecs.open("evaluation_data.parser_output", 'w', 'UTF-8')
    
    for i in range(N):
        t = parser.create_from_bracket_string(truth_tree[i])
        r = parser.create_from_bracket_string(result_tree[i])
        
        fileOut.write(" ".join(str(t.non_terminal_labels)))
        fileOut.write('\n')
        
        if t.sentence == r.sentence :
            scores = S.score_trees(t, r)
            recall.append(scores.recall)
            precision.append(scores.prec)
            Fscore.append(2*scores.recall*scores.prec/(scores.prec+scores.recall))
            tag_accuracy.append(scores.tag_accracy)
    
    print('Average recall : ', np.mean(recall))
    print('Average precision : ', np.mean(precision))
    print('Average F-score: ', np.mean(Fscore))
    print('Average tag accuracy: ', np.mean(tag_accuracy))

    return()
Example #24
    with open('results/evaluation_data.parser_output', 'a') as f:
        if my_parsing is None:
            f.write("Found no viable parsing." + "\n")
        else:
            f.write(my_parsing + "\n")

    if my_parsing is not None:
        # PYEVALB works if we remove first and last brackets of the SEQUOIA format and the extra spaces that come with it
        real_parsing = real_parsing[2:-1]
        my_parsing = my_parsing[2:-1]

        print("Score PYEVALB:")
        real_tree = parser.create_from_bracket_string(real_parsing)
        test_tree = parser.create_from_bracket_string(my_parsing)
        result = scorer.Scorer().score_trees(real_tree, test_tree)
        print('accuracy ' + str(result.tag_accracy))

        # for evaluation on the whole corpus, we save real_parsing
        # and_my_parsing in new files without first and last brackets
        with open('results/real_parsings_test_for_eval.txt', 'a') as f:
            f.write(real_parsing + "\n")

        with open('results/my_parsings_test_for_eval.txt', 'a') as f:
            f.write(my_parsing + "\n")

save_scores(
    'results/real_parsings_test_for_eval.txt',
    'results/my_parsings_test_for_eval.txt',
    'results/results_pyevalb.txt',
)
Example #25
def pyevalb(pred_path, gold_path, result_path):
    """Use PYEVALB to score trees."""
    scorer.Scorer().evalb(gold_path, pred_path, result_path)
Example #26
        if not s_output:
            continue

        print('input --> ', s_input)
        print('input labels:', s_target)
        print('output -->', extract_sentence(s_output))
        print('output labels:', s_output)

        target_tree = evalb_parser.create_from_bracket_string(s_target[1:-1])
        output_tree = evalb_parser.create_from_bracket_string(s_output[1:-1])

        # print(target_tree)
        # print(output_tree)

        try:
            s = evalb_scorer.Scorer()
            result = s.score_trees(target_tree, output_tree)

            print(
                f'sentence {k}, precision={result.prec}, recall={result.recall}'
            )
            total_precision += result.prec
            total_recall += result.recall
            print(
                f'average so far: precision={total_precision/(k+1)}, recall={total_recall/(k+1)}'
            )
        except:
            print(f'sentence {k}, scorer failed')

        # break
Example #27
def get_p_value(baseline_f,gold_f,experiment_f,ids):
    print(experiment_f.split('/')[-1])
    gold_lines = [l.strip() for l in open(gold_f).readlines()]
    experiment_lines = [l.strip() for l in open(experiment_f).readlines()]
    baseline_lines = [l.strip() for l in open(baseline_f).readlines()]

    assert len(gold_lines)==len(ids)
    assert len(gold_lines)==len(experiment_lines)
    assert len(gold_lines)==len(baseline_lines)

    num_lines = len(gold_lines)


    scr = scorer.Scorer()
    print('Calculate baseline...')
    fullset_experiment_f1 = scr.get_f1_from_list(gold_lines,experiment_lines)
    fullset_baseline_f1 = scr.get_f1_from_list(gold_lines,baseline_lines)

    fullset_delta = abs(fullset_experiment_f1 - fullset_baseline_f1)

    big_diffs = 0

    id2matched = {'experiment':{},
                  'baseline':{}}
    id2gold = {'experiment':{},
               'baseline':{}}
    id2test = {'experiment':{},
               'baseline':{}}

    model2output = {'experiment':experiment_lines,
                    'baseline':baseline_lines}

    print('Store bracket scores ...')
    for model in ('experiment','baseline'):
        test_lines = model2output[model]
        for i,turn in enumerate(ids):
            gold_line = gold_lines[i]
            test_line = test_lines[i]
            assert tree2str(gold_line)==tree2str(test_line)
            mat,gol,tes = scr.get_bracket_counts_from_tree(gold_line,test_line)
            id2matched[model][turn] = mat
            id2gold[model][turn] = gol
            id2test[model][turn] = tes
        
    idxs = [i for i in range(len(gold_lines))]
    print('Resample ...')
    for i in range(num_resamples):
        if i%100000==1:
            print(i)
        gold_resamples = []
        experiment_resamples = []
        baseline_resamples = []
        resample_idx = random.choices(idxs,k=len(idxs))
        resampled_turns = []
        for idx in resample_idx:
            resampled_turns.append(ids[idx])

        gold_brackets = sum([id2gold['experiment'][turn] for turn in resampled_turns])
        experiment_matched_brackets = sum([id2matched['experiment'][turn] for turn in resampled_turns])
        experiment_test_brackets = sum([id2test['experiment'][turn] for turn in resampled_turns])
        experiment_rec = experiment_matched_brackets/gold_brackets
        experiment_prec = experiment_matched_brackets/experiment_test_brackets
        experiment_f1 = ((2*experiment_rec*experiment_prec)/(experiment_rec+experiment_prec))*100

    
        baseline_matched_brackets = sum([id2matched['baseline'][turn] for turn in resampled_turns])
        baseline_test_brackets = sum([id2test['baseline'][turn] for turn in resampled_turns])
        baseline_rec = baseline_matched_brackets/gold_brackets
        baseline_prec = baseline_matched_brackets/baseline_test_brackets
        baseline_f1 = ((2*baseline_rec*baseline_prec)/(baseline_rec+baseline_prec))*100

        #if experiment_f1 - baseline_f1 > (2*fullset_delta):
        curr_delta = abs(experiment_f1 - baseline_f1)
        if curr_delta > (2*fullset_delta):
            big_diffs += 1
    print(f'p-value estimate: {big_diffs/num_resamples}')