Example #1
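import copy

# Note: PyCandidateScoreValidator, PyDerivationManager, DCP_evaluator,
# ConstituentTree, dcp_to_hybridtree and construct_constituent_token are
# imports from the surrounding project and are assumed to be in scope.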
def build_score_validator(baseline_grammar, grammarInfo, nont_map,
                          storageManager, term_labelling, parser,
                          corpus_validation, validationMethod):
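    """Parse each gold tree of the validation corpus, score the k-best
    derivations against the gold labelled spans (precision, recall, or
    F1, depending on validationMethod), and register the scored
    candidates with the returned validator."""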
    validator = PyCandidateScoreValidator(grammarInfo, storageManager,
                                          validationMethod)

    # Alternatively, the parser could be constructed here:
    # parser = GFParser(baseline_grammar)
    tree_count = 0
    der_count = 0
    for gold_tree in corpus_validation:
        tree_count += 1
        parser.set_input(
            term_labelling.prepare_parser_input(gold_tree.token_yield()))
        parser.parse()
        derivations = [der for _, der in parser.k_best_derivation_trees()]
        manager = PyDerivationManager(baseline_grammar, nont_map)
        manager.convert_hypergraphs(derivations)
        scores = []

        relevant = {tuple(t) for t in gold_tree.labelled_spans()}

        for der in derivations:
            der_count += 1

            h_tree = ConstituentTree()
            cleaned_tokens = copy.deepcopy(gold_tree.full_token_yield())
            dcp = DCP_evaluator(der).getEvaluation()
            dcp_to_hybridtree(h_tree, dcp, cleaned_tokens, False,
                              construct_constituent_token)

            retrieved = {tuple(t) for t in h_tree.labelled_spans()}
            inters = retrieved & relevant

            # In case of a parse failure there are two options:
            #   - no spans at all, which would give precision = 1
            #   - a dummy tree with all spans wrong, i.e. precision = 0
            # The code below effectively takes the second option: an
            # empty retrieved set scores precision 0.

            precision = 1.0 * len(inters) / len(retrieved) \
                if len(retrieved) > 0 else 0.0
            recall = 1.0 * len(inters) / len(relevant) \
                if len(relevant) > 0 else 0.0
            fmeasure = 2.0 * precision * recall / (precision + recall) \
                if precision + recall > 0 else 0.0
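            # Worked example (hypothetical counts): with 4 retrieved
            # spans, 5 relevant spans, and 3 spans in the intersection,
            # precision = 3/4 = 0.75, recall = 3/5 = 0.6, and
            # F1 = 2 * 0.75 * 0.6 / (0.75 + 0.6) = 2/3.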

            if validationMethod == "F1":
                scores.append(fmeasure)
            elif validationMethod == "Precision":
                scores.append(precision)
            elif validationMethod == "Recall":
                scores.append(recall)
            else:
                raise ValueError(
                    "unknown validation method: " + validationMethod)

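        # The third argument is presumably the best attainable score for
        # this sentence: 0.0 if the gold tree contributes no labelled spans.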
        validator.add_scored_candidates(manager, scores,
                                        1.0 if len(relevant) > 0 else 0.0)
        parser.clear()

    print("trees used for validation ", tree_count, "with",
          der_count * 1.0 / tree_count, "derivations on average")

    return validator
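
A minimal usage sketch for Example #1, assuming a grammar and validation corpus have already been built elsewhere; it uses the GFParser mentioned in the commented line inside the function, and all variable names below are placeholders from the surrounding project:

# Hypothetical wiring; all objects come from the surrounding project.
parser = GFParser(baseline_grammar)
validator = build_score_validator(
    baseline_grammar, grammarInfo, nont_map, storageManager,
    term_labelling, parser, corpus_validation,
    validationMethod="F1")  # or "Precision" / "Recall"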
Example #2
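import os
import time

# Note: terminal_labeling, max_length, parse_results,
# parse_results_prefix, parse_results_suffix, test_start, and the helper
# functions used below (ParseAccuracyPenalizeFailures, ConstituentTree,
# dummy_constituent_tree, construct_constituent_token, ...) are
# module-level definitions in the surrounding project.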
def do_parsing(parser, corpus):
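    """Parse every suitable tree of the corpus, compare the labelled
    spans of the best derivation against the gold tree, and accumulate
    PARSEVAL-style precision/recall/F1, penalizing parse failures."""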
    accuracy = ParseAccuracyPenalizeFailures()
    system_trees = []

    start_at = time.time()

    n = 0

    for tree in corpus:

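        # Skip trees that are incomplete, have an empty fringe, or whose
        # yield is empty or longer than max_length.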
        if not tree.complete() \
                or tree.empty_fringe() \
                or not 0 < len(tree.word_yield()) <= max_length:
            continue

        parser.set_input(
            terminal_labeling.prepare_parser_input(tree.token_yield()))
        parser.parse()
        if not parser.recognized():
            relevant = tree.labelled_spans()
            accuracy.add_failure(relevant)

            system_trees.append(
                dummy_constituent_tree(tree.token_yield(),
                                       tree.full_token_yield(), "NP", "S"))
        else:
            n += 1
            dcp_tree = ConstituentTree()
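            # 1-based positions of tokens that appear in the full yield
            # but not in the tree's id yield (typically punctuation).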
            punctuation_positions = [
                i + 1 for i, idx in enumerate(tree.full_yield())
                if idx not in tree.id_yield()
            ]
            dcp_tree = parser.dcp_hybrid_tree_best_derivation(
                dcp_tree,
                tree.full_token_yield(),
                False,
                construct_constituent_token,
                punctuation_positions=punctuation_positions)

            retrieved = dcp_tree.labelled_spans()
            relevant = tree.labelled_spans()
            accuracy.add_accuracy(retrieved, relevant)

            system_trees.append(dcp_tree)

        parser.clear()

    end_at = time.time()

    print('Parsed:', n)
    if accuracy.n() > 0:
        print('Recall:', accuracy.recall())
        print('Precision:', accuracy.precision())
        print('F-measure:', accuracy.fmeasure())
        print('Parse failures:', accuracy.n_failures())
    else:
        print('No successful parsing')
    print('time:', end_at - start_at)
    print('')

    name = parse_results
    # do not overwrite existing result files
    i = 1
    while os.path.isfile(
            os.path.join(parse_results_prefix, name + parse_results_suffix)):
        i += 1
        name = parse_results + '_' + str(i)

    path = os.path.join(parse_results_prefix, name + parse_results_suffix)
    # Export of system trees is currently disabled:
    # with open(path, 'w') as result_file:
    #     print('Exporting parse trees of length <=', max_length, 'to', str(path))
    #     for system_tree in system_trees:
    #         system_tree.strip_vroot()  # a bare map() would be lazy in Python 3
    #     result_file.writelines(hybridtrees_to_sentence_names(system_trees, test_start, max_length))

    return accuracy
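
A minimal invocation sketch for Example #2; it assumes the module-level globals listed above are defined, and grammar and corpus are a hypothetical grammar object and corpus reader from the surrounding project:

# Hypothetical driver code.
parser = GFParser(grammar)
accuracy = do_parsing(parser, corpus)
if accuracy.n() > 0:
    print('Overall F-measure:', accuracy.fmeasure())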