# NOTE(review): this chunk opens mid if/elif chain -- the header for the branch
# containing the first statement is above this view.  Indentation below is
# reconstructed; confirm against the full file.
    tagged_sents = tagged_corpus.tagged_posts(**kwargs)
else:
    # IndianCorpusReader with no explicit fileids: default to the Hindi file.
    if isinstance(tagged_corpus, IndianCorpusReader) and not fileids:
        fileids = 'hindi.pos'

    # Restrict the reader to a single fileid, but only if it actually exists
    # in the corpus; otherwise fall through and read everything.
    if fileids and fileids in tagged_corpus.fileids():
        kwargs['fileids'] = [fileids]

        if args.trace:
            print('using tagged sentences from %s' % fileids)

    tagged_sents = tagged_corpus.tagged_sents(**kwargs)

# manual simplification is needed for these corpora
if simplify_wsj_tag and args.simplify_tags and args.corpus in ['conll2000', 'switchboard']:
    tagged_sents = [[(word, simplify_wsj_tag(tag)) for (word, tag) in sent] for sent in tagged_sents]

##################
## tagged sents ##
##################

# can't trust corpus to provide valid list of sents (indian)
tagged_sents = [sent for sent in tagged_sents if sent]
nsents = len(tagged_sents)

# fraction == 1.0 means train and test on the same full set; otherwise split
# at ceil(nsents * fraction), so the training share is rounded up and the
# remainder becomes the test set.
if args.fraction == 1.0:
    train_sents = test_sents = tagged_sents
else:
    cutoff = int(math.ceil(nsents * args.fraction))
    train_sents = tagged_sents[:cutoff]
    test_sents = tagged_sents[cutoff:]
# NOTE(review): indentation below is reconstructed from a whitespace-mangled
# source line; the counters (wc, taglen, tag_counts, word_set) are initialized
# outside this view -- confirm against the full file.
#
# Choose kwargs for tagged_words(): older simplify_tags API vs. the newer
# tagset parameter.  conll2000/switchboard do not accept simplify_tags, so
# their tags are simplified manually inside the loop instead.
if simplify_wsj_tag and args.simplify_tags and args.corpus not in ['conll2000', 'switchboard']:
    kwargs = {'simplify_tags': True}
elif not simplify_wsj_tag and args.tagset:
    kwargs = {'tagset': args.tagset}
else:
    kwargs = {}

# Accumulate total word count, unique words, per-tag frequencies, and the
# length of the longest tag string seen.
for word, tag in tagged_corpus.tagged_words(fileids=args.fileids, **kwargs):
    # skip words with empty/missing tags entirely
    if not tag:
        continue

    # widest tag seen so far (presumably used later for column formatting --
    # TODO confirm against the output section of the full file)
    if len(tag) > taglen:
        taglen = len(tag)

    # manual simplification for the corpora excluded from simplify_tags above
    if args.corpus in ['conll2000', 'switchboard'] and simplify_wsj_tag and args.simplify_tags:
        tag = simplify_wsj_tag(tag)

    wc += 1
    # loading corpora/treebank/tagged with ChunkedCorpusReader produces None tags
    if not isinstance(tag, basestring):  # NOTE: basestring implies Python 2
        tag = str(tag)

    tag_counts[tag] += 1
    word_set.add(word)

############
## output ##
############

print('%d total words\n%d unique words\n%d tags\n' % (wc, len(word_set), len(tag_counts)))

# sort key for the per-tag listing; the elif chain for other --sort values
# continues past the end of this chunk
if args.sort == 'tag':
    sort_key = lambda tc: tc[0]