if args.trace:
    print('loading %s' % args.corpus)

##############
## counting ##
##############

# Aggregate statistics over the chunked corpus:
#   wc             - total word count
#   tag_counts     - occurrences per part-of-speech tag
#   iob_counts     - occurrences per chunk (IOB) label
#   tag_iob_counts - per-tag breakdown of which chunk labels it appears in
#   word_set       - unique words seen
wc = 0
tag_counts = collections.defaultdict(int)
iob_counts = collections.defaultdict(int)
tag_iob_counts = collections.defaultdict(lambda: collections.defaultdict(int))
word_set = set()

# chunked_words() yields either a Tree (a chunk of tagged words) or a
# bare (word, tag) pair for words outside any chunk.
for item in chunked_corpus.chunked_words():
    if not isinstance(item, Tree):
        # Unchunked token: count word and tag only.
        word, tag = item
        wc += 1
        word_set.add(word)
        tag_counts[tag] += 1
        continue

    # Chunk subtree: count its label once, then every leaf token,
    # also recording which chunk label each tag occurred under.
    chunk_label = node_label(item)
    iob_counts[chunk_label] += 1

    for word, tag in item.leaves():
        wc += 1
        word_set.add(word)
        tag_counts[tag] += 1
        tag_iob_counts[tag][chunk_label] += 1

############
## output ##
# Limit the gold-standard sentences to the requested fraction (cutoff was
# computed earlier in the file), then score the chunker against them.
chunked_sents = chunked_sents[:cutoff]

print(chunker.evaluate(chunked_sents))
print('\n')

if args.trace:
    print('analyzing chunker coverage of %s with %s\n' % (
        args.corpus, chunker.__class__.__name__))

# Count how many chunks of each IOB label the chunker produces on the
# (untagged) corpus sentences, re-tagging each sentence first.
iobs_found = collections.defaultdict(int)
sents = corpus.sents()

if args.fraction != 1.0:
    cutoff = int(math.ceil(len(sents) * args.fraction))
    sents = sents[:cutoff]

for sent in sents:
    tree = chunker.parse(tagger.tag(sent))
    # Every subtree except the top-level 'S' node is a found chunk.
    for child in tree.subtrees(lambda t: node_label(t) != 'S'):
        iobs_found[node_label(child)] += 1

iobs = iobs_found.keys()
# BUG FIX: the original `max(7, *[len(iob) for iob in iobs])` raised
# TypeError when no chunks were found (it degenerated to `max(7)` on a
# bare int). `default=0` makes the empty case safe; the column is still
# at least 7 characters wide.
justify = max(7, max((len(iob) for iob in iobs), default=0))

# Print a two-column table: IOB label, count of chunks found.
print('IOB'.center(justify) + ' Found ')
print('='*justify + ' =========')

for iob in sorted(iobs):
    print(' '.join([iob.ljust(justify), str(iobs_found[iob]).rjust(9)]))

print('='*justify + ' =========')