def build_model(fmt='binary'):
    """Train an NE chunker on the ACE corpora, evaluate it, and pickle it.

    :param fmt: annotation format passed through to ``load_ace_data``.
    :return: the trained ``NEChunkParser``.
    """
    print('Loading training data...')
    train_paths = [
        find('corpora/ace_data/ace.dev'),
        find('corpora/ace_data/ace.heldout'),
        find('corpora/ace_data/bbn.dev'),
        find('corpora/ace_data/muc.dev'),
    ]
    tagged_train = [postag_tree(tree) for tree in load_ace_data(train_paths, fmt)]
    print('Training...')
    chunker = NEChunkParser(tagged_train)
    del tagged_train  # free the training corpus before the evaluation pass
    print('Loading eval data...')
    eval_paths = [find('corpora/ace_data/ace.eval')]
    tagged_eval = [postag_tree(tree) for tree in load_ace_data(eval_paths, fmt)]
    print('Evaluating...')
    chunkscore = ChunkScore()
    for sentence_no, gold in enumerate(tagged_eval):
        guess = chunker.parse(gold.leaves())
        chunkscore.score(gold, guess)
        # Show a detailed comparison for the first few sentences only.
        if sentence_no < 3:
            cmp_chunks(gold, guess)
    print(chunkscore)
    outfilename = '/tmp/ne_chunker_%s.pickle' % fmt
    print('Saving chunker to %s...' % outfilename)
    with open(outfilename, 'wb') as outfile:
        pickle.dump(chunker, outfile, -1)
    return chunker
def build_model(fmt='binary'):
    """Build, evaluate, and save an ACE named-entity chunker.

    :param fmt: annotation format forwarded to ``load_ace_data``.
    :return: the trained ``NEChunkParser``.
    """
    def tagged_corpus(paths):
        # Load ACE trees from *paths* and attach part-of-speech tags.
        return [postag_tree(tree) for tree in load_ace_data(paths, fmt)]

    print('Loading training data...')
    training_files = ('ace.dev', 'ace.heldout', 'bbn.dev', 'muc.dev')
    train_paths = [find('corpora/ace_data/' + basename) for basename in training_files]
    train_data = tagged_corpus(train_paths)
    print('Training...')
    cp = NEChunkParser(train_data)
    del train_data  # no longer needed once the model is trained
    print('Loading eval data...')
    eval_data = tagged_corpus([find('corpora/ace_data/ace.eval')])
    print('Evaluating...')
    chunkscore = ChunkScore()
    for sentence_no, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        # Detailed chunk diff for the first few sentences only.
        if sentence_no < 3:
            cmp_chunks(correct, guess)
    print(chunkscore)
    outfilename = '/tmp/ne_chunker_%s.pickle' % fmt
    print('Saving chunker to %s...' % outfilename)
    with open(outfilename, 'wb') as outfile:
        pickle.dump(cp, outfile, -1)
    return cp
def build_event_chunking_model():
    """Train an ``EventChunkParser`` on parsed training data and evaluate it.

    Token/PoS pairs in every sentence tree are split with
    ``split_tree_tokens`` before training and evaluation.

    :return: the trained ``EventChunkParser``.
    """
    # Assemble training data, splitting token/PoS pairs.
    train_corpus = [
        split_tree_tokens(sentence_tree)
        for tree in load_training_data('parsed')
        for sentence_tree in tree
    ]
    # Train chunker.
    chunker = EventChunkParser(train_corpus)
    del train_corpus  # free the training corpus before evaluation
    # Load evaluation data, splitting token/PoS pairs.
    eval_corpus = [
        split_tree_tokens(sentence_tree)
        for tree in load_evaluation_data('parsed')
        for sentence_tree in tree
    ]
    # Evaluate model.  print() (not the Python 2 print statement) keeps this
    # consistent with the other build functions in this file and valid
    # under Python 3.
    print('Evaluating...')
    chunkscore = ChunkScore()
    for i, correct in enumerate(eval_corpus):
        guessed = chunker.parse(correct.leaves())
        guessed = chunker._parse_to_tagged(guessed)
        chunkscore.score(correct, guessed)
        # Detailed comparison for the first few sentences only.
        if i < 3:
            cmp_chunks(correct, guessed)
    print(chunkscore)
    return chunker
def build_model(fmt="binary"):
    """Train an NE chunker on ACE data, report its ChunkScore, and pickle it.

    :param fmt: annotation format handed to ``load_ace_data``.
    :return: the trained ``NEChunkParser``.
    """
    print("Loading training data...")
    train_paths = [
        find("corpora/ace_data/ace.dev"),
        find("corpora/ace_data/ace.heldout"),
        find("corpora/ace_data/bbn.dev"),
        find("corpora/ace_data/muc.dev"),
    ]
    train_data = list(map(postag_tree, load_ace_data(train_paths, fmt)))
    print("Training...")
    cp = NEChunkParser(train_data)
    del train_data  # release the corpus before the evaluation pass
    print("Loading eval data...")
    eval_paths = [find("corpora/ace_data/ace.eval")]
    eval_data = list(map(postag_tree, load_ace_data(eval_paths, fmt)))
    print("Evaluating...")
    chunkscore = ChunkScore()
    for n, correct in enumerate(eval_data):
        guess = cp.parse(correct.leaves())
        chunkscore.score(correct, guess)
        # Show a chunk-by-chunk diff for the first few sentences only.
        if n < 3:
            cmp_chunks(correct, guess)
    print(chunkscore)
    outfilename = "/tmp/ne_chunker_%s.pickle" % fmt
    print("Saving chunker to %s..." % outfilename)
    with open(outfilename, "wb") as out:
        pickle.dump(cp, out, -1)
    return cp
def build_model(fmt="binary"):
    """Train, evaluate, and pickle an NE chunker built from the ACE corpora.

    :param fmt: annotation format forwarded to ``load_ace_data``.
    :return: the trained ``NEChunkParser``.
    """
    print("Loading training data...")
    train_paths = [
        find("corpora/ace_data/ace.dev"),
        find("corpora/ace_data/ace.heldout"),
        find("corpora/ace_data/bbn.dev"),
        find("corpora/ace_data/muc.dev"),
    ]
    train_data = [postag_tree(parsed) for parsed in load_ace_data(train_paths, fmt)]
    print("Training...")
    cp = NEChunkParser(train_data)
    del train_data  # no longer needed once the model is trained
    print("Loading eval data...")
    eval_trees = load_ace_data([find("corpora/ace_data/ace.eval")], fmt)
    eval_data = [postag_tree(parsed) for parsed in eval_trees]
    print("Evaluating...")
    chunkscore = ChunkScore()
    for sent_index, gold in enumerate(eval_data):
        predicted = cp.parse(gold.leaves())
        chunkscore.score(gold, predicted)
        # Detailed comparison for the first few sentences only.
        if sent_index < 3:
            cmp_chunks(gold, predicted)
    print(chunkscore)
    outfilename = "/tmp/ne_chunker_{0}.pickle".format(fmt)
    print("Saving chunker to {0}...".format(outfilename))
    with open(outfilename, "wb") as outfile:
        pickle.dump(cp, outfile, -1)
    return cp
def evaluate(self, gold):
    """
    Score the accuracy of the chunker against the gold standard.
    Strip the chunks from the gold-standard text, rechunk it using
    the chunker, and return a ``ChunkScore`` object reflecting the
    performance of this chunk parser.

    :type gold: list(Tree)
    :param gold: The list of chunked sentences to score the chunker on.
    :rtype: ChunkScore
    """
    score = ChunkScore()
    for gold_tree in gold:
        score.score(gold_tree, self.parse(gold_tree.leaves()))
    return score
def compare(self, ann2, labels=None):
    """
    Estimate the accuracy of annotation/prediction, treating the
    current Annotation as the gold standard.

    :param ann2: the Annotation to compare against this one.
    :param labels: chunk labels to score; defaults to the labels
        shared by both annotations.
    :return: NLTK ChunkScore Object
    """
    if labels is None:
        # Default to the labels present in both annotations.
        labels = tuple(lab for lab in self.labels if lab in ann2.labels)
    gold_tree = nltk.chunk.conllstr2tree(self.to_iob()[1], chunk_types=labels)
    pred_tree = nltk.chunk.conllstr2tree(ann2.to_iob()[1], chunk_types=labels)
    chunk_score = ChunkScore()
    chunk_score.score(gold_tree, pred_tree)
    return chunk_score
def make_tree(tree):
    """Recursively convert a parse tuple into an ``ImmutableTree``.

    Element 0 is the node label; element 1 is skipped (presumably a
    score -- TODO confirm against the parser that builds these tuples);
    elements 2.. are the children.  Non-tuples and tuples of length
    <= 2 are returned unchanged.
    """
    if isinstance(tree, tuple) and len(tree) > 2:
        return ImmutableTree(tree[0], [make_tree(child) for child in tree[2:]])
    return tree


if __name__ == "__main__":
    if len(sys.argv) != 2:
        # print() with file= replaces the Python 2 `print >> sys.stderr`.
        print("Usage: %s grammar.pickle" % sys.argv[0], file=sys.stderr)
        exit(1)
    # Open in binary mode with a context manager: pickle data is bytes,
    # and pickle.load(open(...)) previously leaked the file handle.
    with open(sys.argv[1], "rb") as grammar_file:
        grammar, sentence_tags = pickle.load(grammar_file)
    score = ChunkScore()
    for gold, s in list(sentences())[:10]:
        print(' '.join(s))
        parses = list(parse(s, grammar, sentence_tags))
        if parses:
            # Pick the highest-scoring parse (score is at index 1).
            guess = make_tree(max(parses, key=operator.itemgetter(1)))
            print(gold)
            print(guess)
            score.score(gold, guess)
    print('Accuracy:', score.accuracy())
    print('Precision:', score.precision())
    print('Recall:', score.recall())
    print('F Measure:', score.f_measure())
# NOTE(review): this statement references names (s, f, grammar) that are not
# defined in this view -- it appears to be a fragment of an enclosing loop or
# function.  Kept as-is apart from the Python 3 print() conversion; verify
# its context before relying on it.
if s not in parse(f, grammar):
    print("[ERROR] " + format(f))


def make_tree(tree):
    """Recursively convert a parse tuple into an ``ImmutableTree``.

    Element 0 is the node label; element 1 is skipped (presumably a
    score -- TODO confirm against the parser that builds these tuples);
    elements 2.. are the children.  Non-tuples and tuples of length
    <= 2 are returned unchanged.
    """
    if isinstance(tree, tuple) and len(tree) > 2:
        return ImmutableTree(tree[0], [make_tree(child) for child in tree[2:]])
    return tree


if __name__ == "__main__":
    if len(sys.argv) != 2:
        # print() with file= replaces the Python 2 `print >> sys.stderr`.
        print("Usage: %s grammar.pickle" % sys.argv[0], file=sys.stderr)
        exit(1)
    # Open in binary mode with a context manager: pickle data is bytes,
    # and pickle.load(open(...)) previously leaked the file handle.
    with open(sys.argv[1], "rb") as grammar_file:
        grammar, sentence_tags = pickle.load(grammar_file)
    score = ChunkScore()
    for gold, s in list(sentences())[:10]:
        print(' '.join(s))
        parses = list(parse(s, grammar, sentence_tags))
        if parses:
            # Pick the highest-scoring parse (score is at index 1).
            guess = make_tree(max(parses, key=operator.itemgetter(1)))
            print(gold)
            print(guess)
            score.score(gold, guess)
    print('Accuracy:', score.accuracy())
    print('Precision:', score.precision())
    print('Recall:', score.recall())
    print('F Measure:', score.f_measure())