Example No. 1
 def train_classifier_tagger(self):
     from nltk.corpus import conll2000
     from nltk.tag.sequential import ClassifierBasedPOSTagger
     # Hold out the tail of the corpus so the training data does not overlap the test data.
     test_sents = conll2000.tagged_sents()[9500:]
     train_sents = conll2000.tagged_sents()[:9500]
     print("training classifier-based tagger")
     tagger = ClassifierBasedPOSTagger(train=train_sents)
     # print("evaluating")
     # print(tagger.evaluate(test_sents))
     return tagger
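# A hedged usage sketch (the enclosing class is not shown above, so the same steps
# are repeated here standalone before evaluating and tagging with the result).
from nltk import word_tokenize
from nltk.corpus import conll2000
from nltk.tag.sequential import ClassifierBasedPOSTagger

tagger = ClassifierBasedPOSTagger(train=conll2000.tagged_sents()[:9500])
print(tagger.evaluate(conll2000.tagged_sents()[9500:]))  # use tagger.accuracy(...) on newer NLTK releases
print(tagger.tag(word_tokenize("NLTK makes part-of-speech tagging easy .")))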
from collections import defaultdict
from itertools import chain

from nltk import trigrams
from nltk.corpus import conll2000

# refreshPrint and HMMAccuracy are helper functions defined elsewhere in this project.
def kFoldCV(fold):
    if fold <= 1:
        print("please enter fold >1!")
        return
    else:
        fold_size = int(len(conll2000.tagged_sents()) / fold)
        for i in range(fold):
            refreshPrint(str(fold) + ' fold cross-validation: preparing for ' + str(i + 1) + ' loop\'s tagger model...')
            left = fold_size * i
            right = left + fold_size
            testSents = conll2000.tagged_sents()[left:right]
            trainSents = conll2000.tagged_sents()[:left] + conll2000.tagged_sents()[right:]
            trainTags = set(k[1] for k in chain.from_iterable(trainSents)) | {'<s>', '</s>'}
            trainWords = set(a for a, b in chain.from_iterable(trainSents))
            allTrainWords = list(chain.from_iterable(trainSents))
            ################## transition probabilities
            tModel = defaultdict(lambda: defaultdict(lambda: 0))
            
            for x in trainTags:
                for y in trainTags:
                    for z in trainTags:
                        tModel[(x, y)][z] = 1
            
            for sentence in trainSents:
                sentence = [('', '<s>'), ('', '<s>')] + sentence + [('', '</s>')]
                for x, y, z in trigrams([k[1] for k in sentence]):
                    tModel[(x, y)][z] += 1
            
            for xy in tModel:
                totalCount = float(sum(tModel[xy].values()))
                for z in tModel[xy]:
                    tModel[xy][z] /= totalCount
            
            ######################## emission probabilities
            eModel = defaultdict(lambda: defaultdict(lambda: 0))
            
            for x in trainTags:
                for z in trainWords:
                    eModel[x][z] = 1
            
            for z, x in allTrainWords:
                eModel[x][z] += 1
            
            for x in eModel:
                total_count = float(sum(eModel[x].values()))
                for z in eModel[x]:
                    eModel[x][z] /= total_count
            
            wrongCount = [0]
            refreshPrint(' ' * 110)  # clear the progress line
            print(str(i + 1) + ' loop accuracy: ' + str(HMMAccuracy(testSents, trainWords, trainTags, eModel, tModel, wrongCount)*100) + '%')
def getData(corpus="brown", categories=""):
    if corpus == "brown":
        if categories != "":
            return brown.tagged_sents(tagset='universal',
                                      categories=categories)

        return brown.tagged_sents(tagset='universal')
    elif corpus == "treebank":
        return treebank.tagged_sents(tagset='universal')
    elif corpus == "nps_chat":
        #Dialogue dataset
        data = []
        posts = nps_chat.posts()
        words = nps_chat.tagged_words(tagset='universal')

        index = 0
        for sent in posts:
            data.append(words[index:index + len(sent)])
            index += len(sent)
        return data

    elif corpus == "conll2000":
        return conll2000.tagged_sents(tagset='universal')

    return brown.tagged_sents(tagset='universal')
Example No. 4
 def __init__(self):
     # pickle (the Python 3 replacement for cPickle) needs binary file modes.
     try:
         tagger = pickle.load(open("nerdb_tagger.pkl", "rb"))
     except IOError:
         print("failed to load nerdb_tagger, recreating...")
         train_sents = conll2000.tagged_sents() + brown.tagged_sents()
         tagger = nltk.DefaultTagger("NN")
         tagger = nltk.UnigramTagger(train_sents, backoff=tagger)
         tagger = nltk.BigramTagger(train_sents, backoff=tagger)
         tagger = nltk.TrigramTagger(train_sents, backoff=tagger)
         pickle.dump(tagger, open("nerdb_tagger.pkl", "wb"))
         print("done")
     try:
         chunker = pickle.load(open("nerdb_chunker.pkl", "rb"))
     except IOError:
         print("failed to load nerdb_chunker, recreating...")
         train_sents = conll2000.chunked_sents()
         chunker = ConsecutiveNPChunker(tagger, train_sents)
         pickle.dump(chunker, open("nerdb_chunker.pkl", "wb"))
         print("done")
     self.chunker = chunker
     self.people = [line.strip().split(" ", 1) for line in open("actors_index.txt").readlines()]
     self.people += [line.strip().split(" ", 1) for line in open("actresses_index.txt").readlines()]
     self.movies = [line.strip().split(" ", 1) for line in open("title_index.txt").readlines()]
     self.entity_types = {"PERSON": self.people, "MOVIE": self.movies}
     self.numbers = eval(open("numbers.txt").read())
Example No. 5
def select_sents(x):
    return {
        'brown_universal':
        brown.tagged_sents(tagset='universal'),  # Accuracy: 95.12%
        'brown': brown.tagged_sents(),  # Accuracy: 93.66%
        'conll2000_universal':
        conll2000.tagged_sents(tagset='universal'),  # Accuracy: 95.63%
        'conll2000': conll2000.tagged_sents(),  # Accuracy: 94.94%
        'conll2002': conll2002.tagged_sents(),  # Accuracy: 91.53%
        'alpino': alpino.tagged_sents(),  # Accuracy: 88.79%
        'dependency_treebank':
        dependency_treebank.tagged_sents(),  # Accuracy: 90.79%
        'treebank': treebank.tagged_sents(),  # Accuracy: 91.44%
        'indian': indian.tagged_sents(),  # Accuracy: 64.41%
        'else': []  # fallback entry for an unavailable corpus
    }.get(x, [])  # unknown keys also fall back to an empty list
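A short usage sketch for the dispatcher above (hypothetical caller; it assumes the relevant corpora have already been fetched with nltk.download):

# Hypothetical usage: look up a corpus by key; unknown keys fall back to an empty list.
sents = select_sents('conll2000_universal')
if sents:
    print(len(sents), 'tagged sentences available')
else:
    print('requested corpus is not available')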
Example No. 6
File: NERDb.py  Project: jamt/IMDBot
 def __init__(self):
   try:
     tagger = pickle.load(open('nerdb_tagger.pkl', 'rb'))
   except IOError:
     print('failed to load nerdb_tagger, recreating...')
     train_sents = conll2000.tagged_sents() + brown.tagged_sents()
     tagger = nltk.DefaultTagger('NN')
     tagger = nltk.UnigramTagger(train_sents, backoff=tagger)
     tagger = nltk.BigramTagger(train_sents, backoff=tagger)
     tagger = nltk.TrigramTagger(train_sents, backoff=tagger)
     pickle.dump(tagger, open('nerdb_tagger.pkl', 'wb'))
     print('done')
   try:
     chunker = pickle.load(open('nerdb_chunker.pkl', 'rb'))
   except IOError:
     print('failed to load nerdb_chunker, recreating...')
     train_sents = conll2000.chunked_sents()
     chunker = ConsecutiveNPChunker(tagger, train_sents)
     pickle.dump(chunker, open('nerdb_chunker.pkl', 'wb'))
     print('done')
   self.chunker = chunker
   self.people = [line.strip().split(" ", 1) for line in open('actors_index.txt').readlines()]
   self.people += [line.strip().split(" ", 1) for line in open('actresses_index.txt').readlines()]
   self.movies = [line.strip().split(" ", 1) for line in open('title_index.txt').readlines()]
   self.entity_types = {'PERSON' : self.people, 'MOVIE' : self.movies}
def get_noun_phrases_and_named_entities(file_name, start_index, end_index):

    sentences = conll2000.sents(file_name)
    noun_phrase_sentences = conll2000.chunked_sents(file_name, chunk_types=['NP'])
    pos_tagged_sentences = conll2000.tagged_sents(file_name)

    sentences = sentences[start_index:end_index]
    pos_tagged_sentences = pos_tagged_sentences[start_index:end_index]
    noun_phrase_sentences = noun_phrase_sentences[start_index:end_index]

    # Extracting mentions.
    words = []
    cnt = 0
    for sent in sentences:
        cnt += 1
        for word in sent:
            words.append((word, cnt))

    noun_phrases = []
    for sent in noun_phrase_sentences:
        noun_phrases += nltk.chunk.tree2conlltags(sent)

    named_entities = []
    for tagged_sent in pos_tagged_sentences:
        tree = nltk.chunk.ne_chunk(tagged_sent)
        named_entities += nltk.chunk.tree2conlltags(tree)

    return (words, noun_phrases, named_entities)
Example No. 8
def english():
    from collective.classification.data.downloader import\
        downloadNLTKConll2000Corpus
    downloadNLTKConll2000Corpus()
    from nltk.corpus import conll2000
    conll2000_sents = conll2000.tagged_sents()
    tagger = BrillTrigramTagger()
    tagger.train(conll2000_sents)
    dump(tagger.tagger, "english_tagger.pickle")
Example No. 10
 def traintest_uni_bi_tri_tagger(self):
     from nltk import word_tokenize
     from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
     from nltk.corpus import conll2000, treebank
     test_sents = conll2000.tagged_sents()[8000:]
     train_sents = treebank.tagged_sents()[3000:]
     print('training trigram tagger with backoff')
     backoff = DefaultTagger('NN')
     tagger = self.backoff_tagger(train_sents, [UnigramTagger, BigramTagger, TrigramTagger], backoff=backoff)
     print('evaluating trigram tagger with backoff')
     print(tagger.evaluate(test_sents))
     print('tagging')
     print(tagger.tag(word_tokenize("This is a test. This should be faster than nothing. How can I rent a car in the next twelve hours?")))
Example No. 11
   def evaluate(self):
      '''run tests on treebank, conll2000 and brown data'''

      test = treebank.tagged_sents()[:100]
      treebank_result = (100*self.classifier.evaluate(test))

      test = conll2000.tagged_sents()[:100]
      conll2000_result = (100*self.classifier.evaluate(test))

      test = brown.tagged_sents()[int(len(brown.tagged_sents())*0.8):]
      brown_result = (100*self.classifier.evaluate(test))

      return (treebank_result, conll2000_result, brown_result)
Example No. 12
def main():

    # 1. a)
    bts = brown.tagged_sents(categories=u'news', tagset=u'universal')
    brown_size = int(len(bts) * 0.9)
    brown_training = bts[:brown_size]
    brown_test = bts[brown_size:]
    tagset = list(mapping._UNIVERSAL_TAGS)

    simple_tagger = a1.BigramTagger()
    simple_tagger.train(brown_training)

    #1. b)
    test_tagging(simple_tagger)

    #1. c)
    print(u'Simple bigram tagger')
    print_accuracy(simple_tagger, brown_test)

    #1. d)
    print_confusion_matrix(simple_tagger, brown_test, tagset)

    #2. a)
    default_tagger = a2.DefaultTagger(u'NN')

    unigram_tagger = a2.UnigramTagger(backoff_tagger=default_tagger)
    unigram_tagger.train(brown_training)

    bigram_tagger = a2.BigramTagger(backoff_tagger=unigram_tagger)
    bigram_tagger.train(brown_training)

    print(u'Bigram tagger with backoffs')
    print_accuracy(bigram_tagger, brown_test)

    #2. b)
    other_cat = brown.tagged_sents(categories='romance', tagset='universal')
    print(u'Simple bigram tagger, other genre')
    print_accuracy(simple_tagger, other_cat)
    print(u'Backoff tagger, other genre')
    print_accuracy(bigram_tagger, other_cat)

    conll_sents = conll2000.tagged_sents(tagset=u'universal')

    print(u'Simple bigram tagger, other corpus')
    print_accuracy(simple_tagger, conll_sents)
    print(u'Backoff tagger, other corpus')
    print_accuracy(bigram_tagger, conll_sents)
Example No. 13
 def train(self):
     start = time.time()
     templates = [
         brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,1)),
         brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (2,2)),
         brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,2)),
         brill.SymmetricProximateTokensTemplate(brill.ProximateTagsRule, (1,3)),
         brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,1)),
         brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (2,2)),
         brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,2)),
         brill.SymmetricProximateTokensTemplate(brill.ProximateWordsRule, (1,3)),
         brill.ProximateTokensTemplate(brill.ProximateTagsRule, (-1, -1), (1,1)),
         brill.ProximateTokensTemplate(brill.ProximateWordsRule, (-1, -1), (1,1))
     ]
     self.train_sents = conll2000.tagged_sents('train.txt')
     word_patterns = [
         (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
         (r'.*ould$', 'MD'),
         (r'.*ing$', 'VBG'),
         (r'.*ed$', 'VBD'),
         (r'.*ness$', 'NN'),
         (r'.*ment$', 'NN'),
         (r'.*ful$', 'JJ'),
         (r'.*ious$', 'JJ'),
         (r'.*ble$', 'JJ'),
         (r'.*ic$', 'JJ'),
         (r'.*ive$', 'JJ'),
         (r'.*est$', 'JJ'),
         (r'^a$', 'PREP'),
     ]
     raubt_tagger = self.backoff_tagger(self.train_sents, [nltk.tag.AffixTagger,
                                                           nltk.tag.UnigramTagger, nltk.tag.BigramTagger, nltk.tag.TrigramTagger],
                                                           backoff=nltk.tag.RegexpTagger(word_patterns))
     trainer = brill.FastBrillTaggerTrainer(raubt_tagger, templates,deterministic=True)
     self.tagger = trainer.train(self.train_sents,max_rules=100, min_score=3)
     self.save2Pickle(self.tagger)
     print('Time: ' + str(time.time() - start))
Example No. 14
from nltk.corpus import brown, conll2000, alpino, floresta, gutenberg

from nltk.tag import hmm
from nltk.util import unique_list
from nltk.probability import *
from nltk import ConditionalProbDist
from nltk import ConditionalFreqDist
from collections import Counter

from HMM import *

# Load the Training and Test Sentences
print("Downloading Training Sentences from Corpus")
trainingSentences_brown = brown.tagged_sents(tagset="universal")[:10000]
trainingSentences_conll2000 = conll2000.tagged_sents()[:10000]
trainingSentences_alpino = alpino.tagged_sents()[:10000]
trainingSentences_floresta = floresta.tagged_sents()[:10000]
print "Done!"

print("Downloading Test Sentences from Corpus")
testSentences_brown = brown.tagged_sents(tagset="universal")[10000:10500]
testSentences_conll2000 = conll2000.tagged_sents()[10000:10500]
testSentences_alpino = alpino.tagged_sents()[10000:10500]
testSentences_floresta = floresta.tagged_sents()[10000:10500]
print "Done!"


# Extracts words and tags from Sentences
def extractWords_and_Tags(sentences):
    words = {}
Example No. 15
    nltk.data.find('corpora/conll2002')
except LookupError:
    nltk.download('conll2002')
try:
    nltk.data.find('taggers/averaged_perceptron_tagger')
except LookupError:
    nltk.download('averaged_perceptron_tagger')

from nltk.corpus import wordnet as wn
from nltk.corpus import treebank, conll2000, brown, conll2002
from nltk import DefaultTagger, UnigramTagger, BigramTagger

wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

# The code below trains bigram part of speech tagger from various datasets.
train_sents = treebank.tagged_sents() + brown.tagged_sents() + conll2000.tagged_sents() + conll2002.tagged_sents()
edited_train = []
for sent in train_sents:
    edited_train.append([(word.lower(),tag) for (word,tag) in sent])
t0 = DefaultTagger(None)
et1 = UnigramTagger(edited_train, backoff = t0)
et2 = BigramTagger(edited_train, backoff = et1)
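# As a quick check (a sketch, not part of the original file), the backoff chain can
# tag lowercased tokens directly; tokens are lowercased to match the training data.
sample = [w.lower() for w in nltk.word_tokenize("The cats were chasing the mice")]
print(et2.tag(sample))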

# The function below converts bigram pos to wordnet pos for lemmatization
def penn_to_wn(tag):
    nltk_wn_pos = {'J':wn.ADJ,'V':wn.VERB,'N':wn.NOUN,'R':wn.ADV}
    try:
        return nltk_wn_pos[tag[0]]
    except:
        return None
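# A hedged sketch of how the tagger and penn_to_wn are typically combined for
# lemmatization; the helper name lemmatize_tokens is illustrative, not from the original.
def lemmatize_tokens(sentence):
    # Tag the lowercased tokens, map Penn tags to WordNet POS, and lemmatize;
    # words with no usable mapping fall back to the default (noun) lemmatizer.
    tagged = et2.tag([w.lower() for w in nltk.word_tokenize(sentence)])
    lemmas = []
    for word, tag in tagged:
        wn_pos = penn_to_wn(tag)
        lemmas.append(wordnet_lemmatizer.lemmatize(word, pos=wn_pos) if wn_pos
                      else wordnet_lemmatizer.lemmatize(word))
    return lemmas

print(lemmatize_tokens("The striped bats were hanging on their feet"))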
from nltk import word_tokenize
from nltk.tag import PerceptronTagger
from nltk.corpus import conll2000 as cn
import pickle
import time

train = cn.tagged_sents("train.txt")
test = cn.tagged_sents("test.txt")

pt = PerceptronTagger(load=False)
sts = int(time.time())
pt.train(list(train), nr_iter=10)

fts = int(time.time())
pts = fts - sts
print(pts)  # training time in seconds

f = open('ptagger.pickle', 'wb')
pickle.dump(pt, f)
f.close()
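A small follow-up sketch (assumed, not in the original script) that reloads the pickled tagger and scores it on the CoNLL-2000 test split:

# Reload the pickled tagger and measure accuracy on the held-out test sentences.
with open('ptagger.pickle', 'rb') as fin:
    loaded_tagger = pickle.load(fin)
print(loaded_tagger.evaluate(list(test)))  # use .accuracy(...) on newer NLTK releases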

Example No. 17
import nltk
nltk.download('conll2000')
from nltk.corpus import conll2000
x = conll2000.tagged_sents()
for i in range(5):
    print(x[i])
Example No. 18
		ex: 'To read'
		output: (read: VB)
		which is the correct output.

		So I need to research this a bit more.
'''

from nltk.tag import RegexpTagger, untag, UnigramTagger, BigramTagger, TrigramTagger, DefaultTagger, AffixTagger
from nltk.tag.brill_trainer import BrillTaggerTrainer
from nltk.corpus import brown, treebank, conll2000
from tag_util import backoff_tagger, train_brill_tagger
import pickle

# train_sents = brown.tagged_sents(categories=['news'])[:40000]
# test_sents = brown.tagged_sents(categories=['news']) [40000:50000]
train_sents = conll2000.tagged_sents()
# some regex pattern that will be used for the RegexpTagger
regex_pattern = [(r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), (r'.*ould$', 'MD'),
                 (r'.*ing$', 'VBG'), (r'.*ed$', 'VBD'), (r'.*ness$', 'NN'),
                 (r'.*ment$', 'NN'), (r'.*ful$', 'JJ'), (r'.*ious$', 'JJ'),
                 (r'.*ble$', 'JJ'), (r'.*ic$', 'JJ'), (r'.*ive$', 'JJ'),
                 (r'.*est$', 'JJ'), (r'mad', 'JJ'), (r'^a$', 'PREP')]

initial_tagger = backoff_tagger(
    train_sents, [AffixTagger, UnigramTagger, BigramTagger, TrigramTagger],
    backoff=RegexpTagger(regex_pattern))

# Training the Brill Tagger
brill_tagger = train_brill_tagger(initial_tagger, train_sents)
# print(brill_tagger.evaluate(test_sents))
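A brief usage sketch (not in the original script); it assumes train_brill_tagger from the local tag_util module returns a standard NLTK tagger object:

from nltk import word_tokenize

# Tag a fresh sentence with the Brill-refined tagger.
print(brill_tagger.tag(word_tokenize("The committee will review the proposal next week .")))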
Example No. 19
import nltk
from nltk.corpus import treebank
from nltk.corpus import brown
from nltk.corpus import nps_chat
from nltk.corpus import conll2000
from nltk.corpus import ConllCorpusReader

brown_fiction = list(
    brown.tagged_sents(categories='fiction', tagset='universal'))
brown_reviews = list(
    brown.tagged_sents(categories='reviews', tagset='universal'))
conll = list(conll2000.tagged_sents(tagset='universal'))
tree = list(treebank.tagged_sents(tagset='universal'))

columntypes = ['words', 'pos']
twitter_corpus = ConllCorpusReader("resources/",
                                   "twitter.conll",
                                   columntypes,
                                   tagset='en-tweet')
twitter = list(twitter_corpus.tagged_sents(tagset='universal'))

nps_raw = nps_chat.tagged_posts(tagset='universal')
nps = []
for post in nps_raw:
    post_clean = [sub for sub in post if sub[0]]
    nps.append(post_clean)
Example No. 20
from nltk.corpus import conll2000, brown

f = open('train.txt', 'w')

total = 1000
for sent in conll2000.tagged_sents():
    for word, tag in sent:
        f.write(word + '\t' + tag + '\n')

    total -= 1
    if total == 0:
        break

f.close()
print("generated train.txt")

f = open('test.txt', 'w')
total = 100
for sent in conll2000.tagged_sents()[1001:1105]:
    for word, tag in sent:
        f.write(word + '\t' + tag + '\n')

    total -= 1
    if total == 0:
        break

f.close()
print("generated test.txt")
Example No. 21
    index = int(sys.argv[1])
    tagset = int(sys.argv[2])
    if index == 1 and tagset == 1:
        sents = brown.tagged_sents()
    elif index == 1 and tagset == 2:
        sents = brown.tagged_sents(tagset='universal')
    elif index == 2 and tagset == 1:
        sents = treebank.tagged_sents()
    elif index == 2 and tagset == 2:
        sents = treebank.tagged_sents(tagset='universal')
    elif index == 3 and tagset == 1:
        sents = masc_tagged.tagged_sents()
    elif index == 3 and tagset == 2:
        sents = masc_tagged.tagged_sents(tagset='universal')
    elif index == 4 and tagset == 1:
        sents = conll2000.tagged_sents()
    elif index == 4 and tagset == 2:
        sents = conll2000.tagged_sents(tagset='universal')
    else:
        print "Usage: python HMM.py <corpus_index> <tagset_index>"
        print "Corpus:          Tagset: "
        print "1. brown           1. Default"
        print "2. treebank        2. Universal"
        print "3. masc_tagged"
        print "4. conll2000"
        exit(0)


# Process training set
def process_training_set():
    # Define size of training set
Example No. 22
# 		data.write("\n")
# 	else:
# 		data2.write("\n")
# 	res.write("\n")
	
# 	amount -= 1
# 	amount2 -= 1
# 	if amount2 == 0:
# 		break


#!/usr/bin/env python3
tagged_sentences = []

from nltk.corpus import conll2000 as corpus
tagged_sentences += corpus.tagged_sents(tagset='universal')

import nltk
untagged_sentences = list(corpus.sents())

data = open("testdata.txt", "wt")
res = open("answer.txt", "wt")
control = open("control.txt", "wt")

tru_amount = len(tagged_sentences)
amount = tru_amount
print("[Data] Extracted {} out of {} ({:.2f}%)".format(amount, tru_amount, amount/tru_amount*100))


for sentences, untagged in zip(tagged_sentences, untagged_sentences):
	# if "" in [a[0] for a in sentences]:
Example No. 23
import nltk
import itertools
from nltk import word_tokenize, pos_tag
from nltk.corpus import brown, treebank, conll2000

## Initialize Input Text and Corpus
text = "My friend and I often enjoy working in the coffee house"
#text = "I usually hang out with cute friends and watch national football league every Saturday night"
corpus1 = brown.tagged_sents(tagset='universal')
corpus2 = conll2000.tagged_sents(tagset='universal')
corpus3 = treebank.tagged_sents(tagset='universal')
corpus4 = brown.tagged_sents(categories=['news'], tagset='universal')
corpus5 = brown.tagged_sents(categories=['reviews'], tagset='universal')
corpus6 = brown.tagged_sents(categories=['romance'], tagset='universal')
corpus = list(itertools.chain(corpus1))

## Calculate state transition prob and word prob
word_list = []
for sentence in corpus:
    word_list.append(("_BEGIN_", "_begin_"))
    word_list.extend([(tag, word) for (word, tag) in sentence])
    word_list.append(("_END_", "_end_"))
tags = [tag for (tag, word) in word_list]
tag_list = []
for i in range(len(tags) - 1):
    tag_list.append([tags[i], tags[i + 1]])

# words prob
word_freq = nltk.ConditionalFreqDist(word_list)
B = nltk.ConditionalProbDist(word_freq, nltk.MLEProbDist)
# state transition prob
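The excerpt cuts off before the transition probabilities are built; a plausible continuation (an assumption, mirroring the emission step above) would be:

# Assumed continuation: estimate P(next_tag | tag) the same way the word probabilities were built.
tag_freq = nltk.ConditionalFreqDist((prev, nxt) for prev, nxt in tag_list)
A = nltk.ConditionalProbDist(tag_freq, nltk.MLEProbDist)
print(A["_BEGIN_"].prob("NOUN"))  # e.g. probability that a sentence starts with a NOUN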
Example No. 24

from nltk.corpus import conll2000

test_sents = conll2000.chunked_sents('test.txt', chunk_types='NP')
train_sents = conll2000.chunked_sents('train.txt', chunk_types='NP')
# Format of the data used for training
train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
              for sent in train_sents]
print(train_data[0])
# Evaluate the performance of the unigram chunker
unigram_chunker = UnigramChunker(train_sents)
print(unigram_chunker.evaluate(test_sents))

# Test the parse() function directly
tmp_sents = conll2000.tagged_sents('test.txt')
print(tmp_sents[0])
print(unigram_chunker.parse(tmp_sents[0]))

# The unigram tagger's output for each POS tag
postags = sorted(
    set(pos for sent in train_sents for (word, pos) in sent.leaves()))
print(unigram_chunker.tagger.tag(postags))


# Try building a bigram chunker yourself
class BigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.BigramTagger(train_data)
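    # NOTE: the original excerpt is cut off here. A parse() method in the style of
    # the NLTK book's chunker (an assumed reconstruction, not the original code):
    def parse(self, sentence):
        # Tag the POS sequence with IOB chunk tags, then rebuild a chunk tree
        # from the resulting (word, pos, chunktag) triples.
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag)
                     for ((word, pos), chunktag) in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)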
corp_words_tagged = [
    brown.tagged_words(tagset=CONST_tagset),
    nps_chat.tagged_words(tagset=CONST_tagset),
    conll2000.tagged_words(tagset=CONST_tagset),
    treebank.tagged_words(tagset=CONST_tagset)
]
corp_words_untagged = [
    brown.words(),
    nps_chat.words(),
    conll2000.words(),
    treebank.words()
]
corp_sents_tagged = [
    brown.tagged_sents(tagset=CONST_tagset),
    nps_chat.tagged_posts(tagset=CONST_tagset),
    conll2000.tagged_sents(tagset=CONST_tagset),
    treebank.tagged_sents(tagset=CONST_tagset)
]
corp_sents_untagged = [
    brown.sents(),
    nps_chat.posts(),
    conll2000.sents(),
    treebank.sents()
]

# language tool spell checker
lt_check = language_check.LanguageTool('en-US')

# pyenchant spell checker
# pe_check = enchant.Dict('en_US')
Example No. 26
#brown_romance_cutoff = len(brown_romance) * 2 / 3
#brown_fiction = brown.tagged_sents(categories=['fiction'],simplify_tags=True)
#brown_fiction_cutoff = len(brown_fiction) * 2 / 3
#brown_belles_lettres = brown.tagged_sents(categories=['belles_lettres'],simplify_tags=True)
#brown_belles_lettres_cutoff = len(brown_belles_lettres) * 2 / 3



#brown_train = list(itertools.chain(brown_reviews[:brown_reviews_cutoff],
#	brown_lore[:brown_lore_cutoff], brown_romance[:brown_romance_cutoff],brown_fiction[:brown_fiction_cutoff],
#        brown_belles_lettres[brown_belles_lettres_cutoff:]))
#brown_test = list(itertools.chain(brown_reviews[brown_reviews_cutoff:],
#	brown_lore[brown_lore_cutoff:], brown_romance[brown_romance_cutoff:],brown_fiction[:brown_fiction_cutoff],
#        brown_belles_lettres[brown_belles_lettres_cutoff:]))

conll_train = conll2000.tagged_sents('train.txt')
conll_test = conll2000.tagged_sents('test.txt')

treebank_cutoff = len(treebank.tagged_sents()) * 2 // 3
treebank_train = treebank.tagged_sents()[:treebank_cutoff]
treebank_test = treebank.tagged_sents()[treebank_cutoff:]

train_sents = conll_train +  treebank_train
test_sents  = conll_test  + treebank_test
#train_sents = brown_train
#test_sents = treebank_test

#print test_sents
raubt_tagger = backoff_tagger(train_sents, [nltk.tag.AffixTagger,
    nltk.tag.UnigramTagger, nltk.tag.BigramTagger, nltk.tag.TrigramTagger],
    backoff=nltk.tag.DefaultTagger('NN'))