def test_extract_system_mentions(self):
    """Spans returned by extract_system_mentions, with and without filtering.

    Index 0 of the returned mention list is skipped before comparing
    against the expected, sorted span lists (mirrors the original slicing).
    """

    def extracted_spans(document, do_filter):
        # Drop the leading entry of the mention list, as the original did.
        mentions = mention_extractor.extract_system_mentions(
            document, filter_mentions=do_filter)
        return [mention.span for mention in mentions[1:]]

    self.assertEqual(
        sorted([
            spans.Span(0, 1), spans.Span(0, 5), spans.Span(3, 5),
            spans.Span(5, 5), spans.Span(8, 10), spans.Span(8, 11),
            spans.Span(13, 16), spans.Span(13, 20), spans.Span(14, 14),
            spans.Span(18, 20), spans.Span(22, 23), spans.Span(25, 25),
            spans.Span(33, 34)
        ]),
        extracted_spans(self.real_document, False))

    self.assertEqual(
        sorted([
            spans.Span(2, 2), spans.Span(4, 4), spans.Span(6, 7),
            spans.Span(6, 11), spans.Span(9, 10), spans.Span(9, 11)
        ]),
        extracted_spans(self.another_real_document, False))

    self.assertEqual(
        sorted([
            spans.Span(2, 2), spans.Span(4, 4), spans.Span(6, 11),
            spans.Span(9, 10), spans.Span(9, 11)
        ]),
        extracted_spans(self.another_real_document, True))
Example #2
0
    def test_extract_system_mentions(self):
        """Verify extracted mention spans for both documents and filter modes.

        Each case lists (document, filter_mentions flag, expected spans);
        the first entry of the extractor's mention list is skipped, as in
        the original assertions.
        """
        test_cases = [
            (self.real_document, False,
             [spans.Span(0, 1), spans.Span(0, 5), spans.Span(3, 5),
              spans.Span(5, 5), spans.Span(8, 10), spans.Span(8, 11),
              spans.Span(13, 16), spans.Span(13, 20), spans.Span(14, 14),
              spans.Span(18, 20), spans.Span(22, 23), spans.Span(25, 25),
              spans.Span(33, 34)]),
            (self.another_real_document, False,
             [spans.Span(2, 2), spans.Span(4, 4), spans.Span(6, 7),
              spans.Span(6, 11), spans.Span(9, 10), spans.Span(9, 11)]),
            (self.another_real_document, True,
             [spans.Span(2, 2), spans.Span(4, 4), spans.Span(6, 11),
              spans.Span(9, 10), spans.Span(9, 11)]),
        ]

        for document, do_filter, expected in test_cases:
            mentions = mention_extractor.extract_system_mentions(
                document, filter_mentions=do_filter)
            self.assertEqual(sorted(expected),
                             [mention.span for mention in mentions[1:]])
Example #3
0
    def do_coreference(self):
        """Run coreference on the text widget contents and visualize errors.

        Builds a one-document corpus from the raw text in the text box,
        extracts system mentions, predicts coreference with the configured
        extractor/perceptron and all_ante clustering, assigns an
        ``annotated_set_id`` to every mention, and launches the error
        visualization.
        """
        # Wrap the text widget contents ("0.0" to END) in a file-like
        # object and preprocess it into a single-document corpus.
        testing_corpus = corpora.Corpus("input", [
            self.p.run_on_doc(io.StringIO(self.txt.get("0.0", tki.END)),
                              "input")
        ])

        logging.info("Extracting system mentions.")
        for doc in testing_corpus:
            doc.system_mentions = mention_extractor.extract_system_mentions(
                doc)

        # Predict coreference decisions for the corpus and merge them back.
        mention_entity_mapping, antecedent_mapping = experiments.predict(
            testing_corpus, self.extractor, self.perceptron,
            clusterer.all_ante)

        testing_corpus.read_coref_decisions(mention_entity_mapping,
                                            antecedent_mapping)

        logging.info("Visualize")

        for doc in testing_corpus:
            # Find the highest existing set id so fresh ids assigned to
            # unclustered mentions cannot collide with predicted ones.
            max_id = 0

            # system_mentions[1:] skips the first entry, mirroring the
            # slicing used elsewhere in this file.
            for mention in doc.system_mentions[1:]:
                set_id = mention.attributes["set_id"]

                # NOTE(review): truthiness check -- a set_id of 0 would be
                # treated as "no cluster"; confirm ids are None or >= 1.
                if set_id:
                    max_id = max(set_id, max_id)

            max_id += 1

            doc.annotated_mentions = []

            for i, mention in enumerate(doc.system_mentions[1:]):
                if mention.attributes["set_id"]:
                    mention.attributes[
                        "annotated_set_id"] = mention.attributes["set_id"]
                else:
                    # Unclustered mentions get a unique id >= max_id.
                    mention.attributes["annotated_set_id"] = max_id + i
                doc.annotated_mentions.append(mention)

        # Extract recall/precision errors with respect to the system output.
        ex = error_extractors.ErrorExtractor(
            testing_corpus, spanning_tree_algorithms.recall_accessibility,
            spanning_tree_algorithms.precision_system_output)

        ex.add_system(testing_corpus)

        decisions = ex.get_errors()

        visualizer = visualization.Visualizer(decisions,
                                              "input",
                                              for_raw_input=True)

        visualizer.run()
Example #4
0
    def do_coreference(self):
        """Resolve coreference for the text box contents and visualize.

        Preprocesses the widget text into a one-document corpus, extracts
        system mentions, predicts coreference, labels every mention with an
        ``annotated_set_id``, and starts the error visualizer.
        """
        raw_text = io.StringIO(self.txt.get("0.0", tki.END))
        testing_corpus = corpora.Corpus(
            "input", [self.p.run_on_doc(raw_text, "input")])

        logging.info("Extracting system mentions.")
        for document in testing_corpus:
            document.system_mentions = \
                mention_extractor.extract_system_mentions(document)

        mention_entity_mapping, antecedent_mapping = experiments.predict(
            testing_corpus, self.extractor, self.perceptron,
            clusterer.all_ante)

        testing_corpus.read_coref_decisions(mention_entity_mapping,
                                            antecedent_mapping)

        logging.info("Visualize")

        for document in testing_corpus:
            # Highest predicted set id; fresh singleton ids start above it.
            highest_set_id = 0
            for mention in document.system_mentions[1:]:
                current_id = mention.attributes["set_id"]
                # NOTE(review): truthiness check preserved from the original
                # -- a set_id of 0 counts as unassigned.
                if current_id:
                    highest_set_id = max(current_id, highest_set_id)
            fresh_id_base = highest_set_id + 1

            document.annotated_mentions = []
            for offset, mention in enumerate(document.system_mentions[1:]):
                if mention.attributes["set_id"]:
                    mention.attributes["annotated_set_id"] = \
                        mention.attributes["set_id"]
                else:
                    mention.attributes["annotated_set_id"] = \
                        fresh_id_base + offset
                document.annotated_mentions.append(mention)

        error_extractor = error_extractors.ErrorExtractor(
            testing_corpus,
            spanning_tree_algorithms.recall_accessibility,
            spanning_tree_algorithms.precision_system_output)

        error_extractor.add_system(testing_corpus)

        decisions = error_extractor.get_errors()

        visualizer = visualization.Visualizer(decisions, "input",
                                              for_raw_input=True)

        visualizer.run()
Example #5
0
# Instantiate the perceptron class named on the command line with the
# previously loaded model parameters.  cost_scaling=0 presumably disables
# cost-augmented decoding at prediction time -- TODO confirm against the
# training script.
perceptron = import_helper.import_from_path(args.perceptron)(priors=priors,
                                                             weights=weights,
                                                             cost_scaling=0)

# The instance extractor turns documents into substructures and features
# for the perceptron; null_cost is used since no costs are needed here.
extractor = instance_extractors.InstanceExtractor(
    import_helper.import_from_path(args.extractor), mention_features,
    pairwise_features, cost_functions.null_cost, perceptron.get_labels())

logging.info("Reading in data.")
testing_corpus = corpora.Corpus.from_file(
    "testing", codecs.open(args.input_filename, "r", "utf-8"))

logging.info("Extracting system mentions.")
for doc in testing_corpus:
    doc.system_mentions = mention_extractor.extract_system_mentions(doc)

# Predict coreference and merge the decisions back into the corpus.
mention_entity_mapping, antecedent_mapping = experiments.predict(
    testing_corpus, extractor, perceptron,
    import_helper.import_from_path(args.clusterer))

testing_corpus.read_coref_decisions(mention_entity_mapping, antecedent_mapping)

logging.info("Write corpus to file.")
testing_corpus.write_to_file(codecs.open(args.output_filename, "w", "utf-8"))

# Optionally dump the raw antecedent decisions as well.
if args.ante:
    logging.info("Write antecedent decisions to file")
    testing_corpus.write_antecedent_decisions_to_file(open(args.ante, "w"))

if args.gold:
Example #6
0
def call_cort(text_blob):
    """Run the cort coreference pipeline on a raw text blob.

    Loads a pickled mention-ranking model, preprocesses *text_blob* with
    the Stanford CoreNLP pipeline, extracts system mentions, predicts
    coreference with all-antecedent clustering, and prints one
    simple-format output string per document.

    Args:
        text_blob (str): Raw input text to resolve.

    Returns:
        list: The simple-format output strings of all documents (the
        original built this list and discarded it).
    """
    mention_features = [
        features.fine_type, features.gender, features.number,
        features.sem_class, features.deprel, features.head_ner,
        features.length, features.head, features.first, features.last,
        features.preceding_token, features.next_token, features.governor,
        features.ancestry
    ]

    pairwise_features = [
        features.exact_match, features.head_match, features.same_speaker,
        features.alias, features.sentence_distance, features.embedding,
        features.modifier, features.tokens_contained, features.head_contained,
        features.token_distance
    ]

    # TODO(review): hard-coded, machine-specific paths -- make configurable.
    model_abs = '/Users/ryanpanos/Documents/code/cort_experiments/models/model-pair-train+dev.obj'  #OMG evil!
    corenlp_path = '/Users/ryanpanos/Documents/code/StanfordNLP/stanford-corenlp-full-2016-10-31/'  #OMG evil!

    print("Loading model . ... (this takes a while) ")
    # Context manager so the model file handle is closed (the original
    # leaked the handle returned by open()).
    with open(model_abs, "rb") as model_file:
        priors, weights = pickle.load(model_file)
    print("Model loaded.")

    # cost_scaling=0: no cost-augmented decoding at prediction time.
    perceptron = RankingPerceptron(priors=priors,
                                   weights=weights,
                                   cost_scaling=0)

    extractor = instance_extractors.InstanceExtractor(
        extract_substructures,
        mention_features,
        pairwise_features,
        cost_functions.null_cost,
        perceptron.get_labels())

    logging.info("Reading in and preprocessing data.")
    p = pipeline.Pipeline(corenlp_path)

    testing_corpus = p.run_on_blob("corpus", text_blob)

    logging.info("Extracting system mentions.")
    for doc in testing_corpus:
        doc.system_mentions = mention_extractor.extract_system_mentions(doc)

    mention_entity_mapping, antecedent_mapping = experiments.predict(
        testing_corpus,
        extractor,
        perceptron,
        all_ante)

    testing_corpus.read_coref_decisions(mention_entity_mapping,
                                        antecedent_mapping)

    logging.info("Write output to file.")

    output_ls = []
    for doc in testing_corpus:
        output = doc.to_simple_output()
        # print() call -- the original used a Python 2 print statement,
        # which is a syntax error under Python 3.
        print(" output: \n" + output)
        output_ls.append(output)

    logging.info("Done.")

    # Return the collected outputs instead of dropping them.
    return output_ls
Example #7
0
#     cost_functions.null_cost,
#     perceptron.get_labels()
# )

logging.info("Reading in data.")
training_corpus = corpora.Corpus.from_file(
    "training", codecs.open(args.input_filename, "r", "utf-8"))


logging.info("Extracting system mentions.")
# enumerate replaces the hand-rolled counter; progress is logged for every
# 100th document (indices 99, 199, ...) because extraction over a large
# corpus is slow.  Lazy %-style logging args avoid eager string building.
for dummy_counter_for_train, doc in enumerate(training_corpus):
    if dummy_counter_for_train % 100 == 99:
        logging.info("We are extracting doc %s: %s",
                     dummy_counter_for_train, doc.identifier)
    doc.system_mentions = mention_extractor.extract_system_mentions(doc)


# logging.info("\tVerifying attributes.")
# for doc in training_corpus:
#     doc.antecedent_decisions = {}
#     print(doc, doc.antecedent_decisions)
#     for mention in doc.system_mentions:
#         if not "antecedent" in mention.attributes.keys():
#             mention.attributes["antecedent"] = None
#         if not "set_id" in mention.attributes.keys():
#             mention.attributes["set_id"] = None
#
# logging.info("\tExtracting instances and features.")
# substructures, arc_information = extractor.extract(training_corpus)
#