def test_compute_errors(self):
    """Check the recall errors extracted for a fake gold/system corpus pair.

    The gold corpus holds one document whose annotated mentions are the
    concatenation of ``self.first_cluster`` and ``self.second_cluster``; the
    system corpus holds one document built from ``self.system_cluster``.
    The "all" recall errors reported for the system named "fake system" must
    equal the expected set of mention pairs listed below.
    """
    # A named tuple is enough to stand in for a document here: only the
    # ``annotated_mentions`` field is supplied.
    fake_document = namedtuple("Document", "annotated_mentions")

    gold_document = fake_document(self.first_cluster + self.second_cluster)
    system_document = fake_document(self.system_cluster)

    gold_corpus = corpora.Corpus("fake gold", [gold_document])
    system_corpus = corpora.Corpus("fake system", [system_document])

    extractor = error_extractors.ErrorExtractor(
        gold_corpus,
        spanning_tree_algorithms.recall_closest,
        spanning_tree_algorithms.precision_system_output,
    )
    extractor.add_system(system_corpus)

    first = self.first_cluster
    second = self.second_cluster
    expected_recall_errors = data_structures.EnhancedSet([
        (first[1], first[0]),
        (first[3], first[2]),
        (first[5], first[4]),
        (second[1], second[0]),
        (second[2], second[1]),
    ])

    all_errors = extractor.get_errors()
    self.assertEqual(
        expected_recall_errors,
        all_errors["fake system"]["recall_errors"]["all"],
    )
def run_on_docs(self, identifier, docs):
    """Process several files and bundle the results into one corpus.

    Args:
        identifier: Passed through as the first argument of
            ``corpora.Corpus``.
        docs: Iterable of file paths. Each one is opened for reading as
            UTF-8 and handed to ``self.run_on_doc``.

    Returns:
        A ``corpora.Corpus`` built from ``identifier`` and the list of
        processed documents, in the order of ``docs``.
    """
    processed_documents = []
    for doc in docs:
        # Close every file as soon as it has been processed instead of
        # leaking one open handle per document.
        # NOTE(review): assumes run_on_doc reads everything it needs from
        # the stream before returning -- confirm it does not keep the
        # handle for lazy reading.
        with codecs.open(doc, "r", "utf-8") as doc_file:
            processed_documents.append(self.run_on_doc(doc_file))
    return corpora.Corpus(identifier, processed_documents)
def run_on_blob(self, identifier, text_blob):
    """Process a single piece of text and wrap the result in a corpus.

    Args:
        identifier: Passed through as the first argument of
            ``corpora.Corpus``.
        text_blob: Handed to ``self.run_on_doc`` together with
            ``use_as_blob=True``.

    Returns:
        A ``corpora.Corpus`` built from ``identifier`` and a one-element
        list holding the processed document.
    """
    processed_document = self.run_on_doc(text_blob, use_as_blob=True)
    return corpora.Corpus(identifier, [processed_document])
def do_coreference(self):
    """Resolve coreference on the text in ``self.txt`` and visualize it.

    Steps, in order:
      1. Read the text from ``self.txt`` and run ``self.p.run_on_doc`` on
         it, wrapping the result in a corpus named "input".
      2. Extract system mentions for every document.
      3. Predict with ``self.extractor`` / ``self.perceptron`` and store
         the decisions on the corpus.
      4. Copy the system mentions (skipping the first entry) into
         ``annotated_mentions``, giving each an ``annotated_set_id``.
      5. Run the error extractor with the corpus as both reference and
         system, then start the visualizer on the result.
    """
    raw_text = self.txt.get("0.0", tki.END)
    input_document = self.p.run_on_doc(io.StringIO(raw_text), "input")
    testing_corpus = corpora.Corpus("input", [input_document])

    logging.info("Extracting system mentions.")
    for document in testing_corpus:
        document.system_mentions = (
            mention_extractor.extract_system_mentions(document))

    mention_entity_mapping, antecedent_mapping = experiments.predict(
        testing_corpus,
        self.extractor,
        self.perceptron,
        clusterer.all_ante,
    )
    testing_corpus.read_coref_decisions(mention_entity_mapping,
                                        antecedent_mapping)

    logging.info("Visualize")
    for document in testing_corpus:
        # The first system mention is skipped throughout.
        # NOTE(review): presumably a dummy/root mention -- confirm.
        mentions = document.system_mentions[1:]

        # Only truthy set ids count; a set id of 0 is treated as "no set".
        assigned_ids = [
            mention.attributes["set_id"]
            for mention in mentions
            if mention.attributes["set_id"]
        ]
        # One past the largest assigned id (never below 1), so the ids
        # handed out below cannot collide with an existing set id.
        first_free_id = max([0] + assigned_ids) + 1

        document.annotated_mentions = []
        for position, mention in enumerate(mentions):
            attributes = mention.attributes
            # Keep the predicted set id when there is one; otherwise give
            # the mention its own id, made unique by its position.
            attributes["annotated_set_id"] = (
                attributes["set_id"] or first_free_id + position)
            document.annotated_mentions.append(mention)

    error_extractor = error_extractors.ErrorExtractor(
        testing_corpus,
        spanning_tree_algorithms.recall_accessibility,
        spanning_tree_algorithms.precision_system_output,
    )
    error_extractor.add_system(testing_corpus)
    decisions = error_extractor.get_errors()

    visualization.Visualizer(
        decisions, "input", for_raw_input=True
    ).run()