def treebank_chunk_tagger_demo():
    from nltk.corpus.util import LazyCorpusLoader
    from nltk.corpus.reader import PlaintextCorpusReader
    from nltk_contrib.coref.util import TreebankChunkTaggerCorpusReader

    state_union = LazyCorpusLoader(
        'state_union', PlaintextCorpusReader, r'(?!\.svn).*\.txt')
    state_union = TreebankChunkTaggerCorpusReader(state_union)

    print 'Treebank chunker demo...'
    print 'Chunked sentences:'
    for sent in state_union.chunked_sents()[500:505]:
        print sent
        print
    print
    print 'Parsed sentences:'
    for tree in state_union.parsed_sents()[500:505]:
        print tree
        print
    print
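# A minimal way to exercise the demo above, assuming NLTK, nltk_contrib, and the
# 'state_union' corpus data are installed; this guard is not part of the original
# snippet.
if __name__ == '__main__':
    treebank_chunk_tagger_demo()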
import codecs
import pickle

from nltk.corpus.reader.conll import ConllChunkCorpusReader
from nltk.corpus.util import LazyCorpusLoader
from nltk.stem.wordnet import WordNetLemmatizer

from sentence import load_conll
from case.MosesTrueCaser import MosesTrueCaser

# Preprocessing components: WordNet lemmatizer and a Moses truecaser trained
# on English.
lemmatizer = WordNetLemmatizer()
truecaser = MosesTrueCaser(open('models/truecase/truecase-model.en'))

# Load the pickled English NER model; the gazetteer feature set is disabled here.
m = pickle.load(open("models/eng.pickle"))
gazetteer = None  # m.feature_generator.feature_sets[-1]

# Lazily load the CoNLL-2003 corpus with its four entity chunk types.
corpus = LazyCorpusLoader('conll2003', ConllChunkCorpusReader,
                          r'.*\.(test|train).*',
                          ('LOC', 'PER', 'ORG', 'MISC'),
                          encoding='utf-8')

# Convert the development split (eng.testa) into the model's input format,
# evaluate, and write the tagged output to disk.
test = load_conll(corpus.chunked_sents('eng.testa'), lemmatizer, truecaser,
                  gazetteer=gazetteer)
out, acc = m.test(test)
codecs.open('test.txt', 'w', encoding='utf-8').write(out)
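# Possible follow-up (not in the original snippet): assuming m.test() returns the
# CoNLL-formatted output alongside an accuracy score, as the unpacking above
# suggests, the score can be reported after the output file is written.
print 'Accuracy on eng.testa:', acc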