示例#1
0
def treebank_chunk_tagger_demo():
    """Demo the Treebank chunk tagger on the state_union corpus.

    Wraps the plaintext ``state_union`` corpus in a
    ``TreebankChunkTaggerCorpusReader`` and prints a small sample
    (sentences 500-504) of its chunked and parsed sentences.

    Side effects: prints to stdout; may download/load corpus data via NLTK.
    """
    from nltk.corpus.util import LazyCorpusLoader
    from nltk.corpus.reader import PlaintextCorpusReader
    from nltk_contrib.coref.util import TreebankChunkTaggerCorpusReader

    # Lazily load the State of the Union corpus, skipping .svn artifacts.
    state_union = LazyCorpusLoader(
        'state_union', PlaintextCorpusReader, r'(?!\.svn).*\.txt')
    # Decorate the reader so it exposes chunked_sents()/parsed_sents().
    state_union = TreebankChunkTaggerCorpusReader(state_union)

    # NOTE: converted Python 2 print statements to print() calls so the
    # demo runs on Python 3 as well.
    print('Treebank chunker demo...')
    print('Chunked sentences:')
    for sent in state_union.chunked_sents()[500:505]:
        print(sent)
        print()
    print()
    print('Parsed sentences:')
    for tree in state_union.parsed_sents()[500:505]:
        print(tree)
        print()
    print()
示例#2
0
import codecs
import pickle
from nltk.corpus.reader.conll import ConllChunkCorpusReader
from nltk.corpus.util import LazyCorpusLoader
from nltk.stem.wordnet import WordNetLemmatizer
from sentence import load_conll
from case.MosesTrueCaser import MosesTrueCaser

# Evaluate a pickled NER model on the CoNLL-2003 dev set (eng.testa) and
# dump its output to test.txt.

lemmatizer = WordNetLemmatizer()

# MosesTrueCaser presumably reads the whole model in its constructor, so
# the file can be closed immediately afterwards — TODO confirm.
with open('models/truecase/truecase-model.en') as truecase_file:
    truecaser = MosesTrueCaser(truecase_file)

# Pickles are binary: open in 'rb' (required on Python 3, and avoids
# newline mangling on Windows under Python 2). Close the handle when done.
with open('models/eng.pickle', 'rb') as model_file:
    m = pickle.load(model_file)

gazetteer = None #m.feature_generator.feature_sets[-1]

# Raw string for the fileid regex so '\.' is an escaped dot, not an
# (accidentally identical) string escape.
corpus = LazyCorpusLoader('conll2003', ConllChunkCorpusReader, r'.*\.(test|train).*', ('LOC', 'PER', 'ORG', 'MISC'), encoding='utf-8')
test = load_conll(corpus.chunked_sents('eng.testa'), lemmatizer, truecaser, gazetteer=gazetteer)

out, acc = m.test(test)

# Use a context manager so the output file is flushed and closed even if
# write() raises; the original leaked the handle.
with codecs.open('test.txt', 'w', encoding='utf-8') as out_file:
    out_file.write(out)