# Example 1 (0)
    def test_compute_errors(self):
        """Check that the error extractor finds the expected recall errors
        for a fake gold/system corpus pair built from the test fixtures."""
        # Minimal stand-in for a document: only annotated_mentions is read.
        fake_doc = namedtuple("Document", "annotated_mentions")

        gold_corpus = corpora.Corpus(
            "fake gold", [fake_doc(self.first_cluster + self.second_cluster)])
        system_corpus = corpora.Corpus(
            "fake system", [fake_doc(self.system_cluster)])

        extractor = error_extractors.ErrorExtractor(
            gold_corpus,
            spanning_tree_algorithms.recall_closest,
            spanning_tree_algorithms.precision_system_output)
        extractor.add_system(system_corpus)

        first, second = self.first_cluster, self.second_cluster
        expected = data_structures.EnhancedSet([
            (first[1], first[0]),
            (first[3], first[2]),
            (first[5], first[4]),
            (second[1], second[0]),
            (second[2], second[1]),
        ])

        self.assertEqual(
            expected,
            extractor.get_errors()["fake system"]["recall_errors"]["all"])
# Example 2 (0)
    def run_on_docs(self, identifier, docs):
        """Process documents read from files and collect them into a corpus.

        Args:
            identifier: Name for the resulting corpus.
            docs: Iterable of file names of documents to process.

        Returns:
            A corpora.Corpus containing one processed document per input
            file, in input order.
        """
        processed_documents = []

        for doc in docs:
            # Use a context manager so each file handle is closed after
            # processing (previously the handles were leaked).
            with codecs.open(doc, "r", "utf-8") as in_file:
                processed_documents.append(self.run_on_doc(in_file))

        return corpora.Corpus(identifier, processed_documents)
# Example 3 (0)
    def run_on_blob(self, identifier, text_blob):
        """Process a raw text blob and wrap the result in a corpus.

        Args:
            identifier: Name for the resulting corpus.
            text_blob: Raw document text to process as a single document.

        Returns:
            A corpora.Corpus containing the single processed document.
        """
        # Removed leftover commented-out code from the file-based variant;
        # the blob is handed to run_on_doc directly.
        processed_document = self.run_on_doc(text_blob, use_as_blob=True)

        return corpora.Corpus(identifier, [processed_document])
# Example 4 (0)
    def do_coreference(self):
        """Run end-to-end coreference on the text in the GUI text widget
        and visualize the resulting decisions.

        Pipeline: read the raw text from ``self.txt``, build a one-document
        corpus, extract system mentions, predict coreference with the
        loaded perceptron model, copy the predictions into the documents'
        annotated mentions, and render the result with the visualizer.
        """
        # Build a single-document corpus from the full text widget
        # contents ("0.0" .. END).
        testing_corpus = corpora.Corpus("input", [
            self.p.run_on_doc(io.StringIO(self.txt.get("0.0", tki.END)),
                              "input")
        ])

        logging.info("Extracting system mentions.")
        for doc in testing_corpus:
            doc.system_mentions = mention_extractor.extract_system_mentions(
                doc)

        # Predict coreference links with the trained model.
        mention_entity_mapping, antecedent_mapping = experiments.predict(
            testing_corpus, self.extractor, self.perceptron,
            clusterer.all_ante)

        testing_corpus.read_coref_decisions(mention_entity_mapping,
                                            antecedent_mapping)

        logging.info("Visualize")

        for doc in testing_corpus:
            # Find the highest set id the system assigned; mention index 0
            # is skipped (presumably a dummy mention — TODO confirm).
            max_id = 0

            for mention in doc.system_mentions[1:]:
                set_id = mention.attributes["set_id"]

                if set_id:
                    max_id = max(set_id, max_id)

            max_id += 1

            doc.annotated_mentions = []

            # Treat the system output as the annotation: copy predicted set
            # ids; mentions without one get a fresh id above max_id (offset
            # by their position, so the new ids do not collide).
            for i, mention in enumerate(doc.system_mentions[1:]):
                if mention.attributes["set_id"]:
                    mention.attributes[
                        "annotated_set_id"] = mention.attributes["set_id"]
                else:
                    mention.attributes["annotated_set_id"] = max_id + i
                doc.annotated_mentions.append(mention)

        # Compare the corpus against itself so the visualizer can display
        # the system's decisions on raw (unannotated) input.
        ex = error_extractors.ErrorExtractor(
            testing_corpus, spanning_tree_algorithms.recall_accessibility,
            spanning_tree_algorithms.precision_system_output)

        ex.add_system(testing_corpus)

        decisions = ex.get_errors()

        visualizer = visualization.Visualizer(decisions,
                                              "input",
                                              for_raw_input=True)

        visualizer.run()