예제 #1
0
	tagged_sents = tagged_corpus.tagged_posts(**kwargs)
else:
	if isinstance(tagged_corpus, IndianCorpusReader) and not fileids:
		fileids = 'hindi.pos'
	
	if fileids and fileids in tagged_corpus.fileids():
		kwargs['fileids'] = [fileids]
	
		if args.trace:
			print('using tagged sentences from %s' % fileids)
	
	tagged_sents = tagged_corpus.tagged_sents(**kwargs)

# these corpora need their tags simplified by hand, one (word, tag) pair at a time
if simplify_wsj_tag and args.simplify_tags and args.corpus in ['conll2000', 'switchboard']:
	simplified_sents = []
	
	for sent in tagged_sents:
		# keep each word, swap its tag for the simplified form
		simplified_sents.append([(word, simplify_wsj_tag(tag)) for (word, tag) in sent])
	
	tagged_sents = simplified_sents

##################
## tagged sents ##
##################

# the corpus may hand back empty sentences (indian does), so drop them up front
tagged_sents = list(filter(None, tagged_sents))
nsents = len(tagged_sents)

if args.fraction != 1.0:
	# the leading fraction (rounded up) is for training, the remainder for testing
	cutoff = int(math.ceil(nsents * args.fraction))
	train_sents, test_sents = tagged_sents[:cutoff], tagged_sents[cutoff:]
else:
	# a fraction of exactly 1.0 means train and test on the very same sentences
	train_sents = test_sents = tagged_sents
예제 #2
0
# keyword arguments forwarded to the corpus reader; empty unless tag
# simplification or an explicit tagset applies
kwargs = {}

if simplify_wsj_tag and args.simplify_tags and args.corpus not in ['conll2000', 'switchboard']:
	# every corpus except conll2000 and switchboard gets the simplify_tags keyword
	kwargs['simplify_tags'] = True
elif not simplify_wsj_tag and args.tagset:
	# simplify_wsj_tag is falsy here, so pass the requested tagset along instead
	kwargs['tagset'] = args.tagset

# conll2000 and switchboard have their tags simplified by hand; the check does
# not depend on the current word, so it is evaluated once rather than per word
simplify_manually = (
	args.corpus in ['conll2000', 'switchboard']
	and simplify_wsj_tag
	and args.simplify_tags
)

# tally every tagged word: running word count, per-tag frequency, the longest
# tag seen, and the set of distinct words
for word, tag in tagged_corpus.tagged_words(fileids=args.fileids, **kwargs):
	# words with a missing or empty tag are left out of every statistic
	if not tag:
		continue
	
	if simplify_manually:
		tag = simplify_wsj_tag(tag)
	
	wc += 1
	# loading corpora/treebank/tagged with ChunkedCorpusReader produces None tags
	if not isinstance(tag, basestring): tag = str(tag)
	
	# measure the tag only after simplification and string coercion, so that
	# taglen describes the keys actually stored in tag_counts and len() is
	# never called on a non-string tag
	taglen = max(taglen, len(tag))
	
	tag_counts[tag] += 1
	word_set.add(word)

############
## output ##
############

# summary line: words counted, distinct words, distinct tags
totals = (wc, len(word_set), len(tag_counts))
print('%d total words\n%d unique words\n%d tags\n' % totals)

if args.sort == 'tag':
	sort_key = lambda tc: tc[0]
예제 #3
0
# keyword arguments forwarded to the corpus reader; empty unless tag
# simplification or an explicit tagset applies
kwargs = {}

if simplify_wsj_tag and args.simplify_tags and args.corpus not in ['conll2000', 'switchboard']:
	# every corpus except conll2000 and switchboard gets the simplify_tags keyword
	kwargs['simplify_tags'] = True
elif not simplify_wsj_tag and args.tagset:
	# simplify_wsj_tag is falsy here, so pass the requested tagset along instead
	kwargs['tagset'] = args.tagset

# conll2000 and switchboard have their tags simplified by hand; the check does
# not depend on the current word, so it is evaluated once rather than per word
simplify_manually = (
	args.corpus in ['conll2000', 'switchboard']
	and simplify_wsj_tag
	and args.simplify_tags
)

# tally every tagged word: running word count, per-tag frequency, the longest
# tag seen, and the set of distinct words
for word, tag in tagged_corpus.tagged_words(fileids=args.fileids, **kwargs):
	# words with a missing or empty tag are left out of every statistic
	if not tag:
		continue
	
	if simplify_manually:
		tag = simplify_wsj_tag(tag)
	
	wc += 1
	# loading corpora/treebank/tagged with ChunkedCorpusReader produces None tags
	if not isinstance(tag, basestring): tag = str(tag)
	
	# measure the tag only after simplification and string coercion, so that
	# taglen describes the keys actually stored in tag_counts and len() is
	# never called on a non-string tag
	taglen = max(taglen, len(tag))
	
	tag_counts[tag] += 1
	word_set.add(word)

############
## output ##
############

# summary line: words counted, distinct words, distinct tags
totals = (wc, len(word_set), len(tag_counts))
print('%d total words\n%d unique words\n%d tags\n' % totals)

if args.sort == 'tag':
	sort_key = lambda tc: tc[0]