def get_data():
    """
    Split the Brown and NPS Chat corpora into 4 training sets and 4 test sets.

    Key names encode the corpus and the train percentage, e.g.
    "train_brown90"/"test_brown10" are the 90/10 split of Brown.

    :return: dict mapping split names to lists of tagged sentences/posts
    """
    # Compute each split once and unpack it, instead of calling
    # train_test_split twice per configuration as the original did.
    brown_sents = brown.tagged_sents()
    nps_posts = nps_chat.tagged_posts()

    train_b50, test_b50 = train_test_split(brown_sents, 0.5)
    train_b90, test_b10 = train_test_split(brown_sents, 0.9)
    # BUG FIX: the NPS 50/50 entries previously used a 0.9 ratio.
    train_n50, test_n50 = train_test_split(nps_posts, 0.5)
    train_n90, test_n10 = train_test_split(nps_posts, 0.9)

    return {
        "train_brown50": train_b50,
        "test_brown50": test_b50,
        "train_brown90": train_b90,
        "test_brown10": test_b10,
        "train_nps50": train_n50,
        "test_nps50": test_n50,
        "train_nps90": train_n90,
        "test_nps10": test_n10,
    }
示例#2
0
def ex2():
    """
    Compare a DefaultTagger baseline with a Unigram/Bigram/Trigram
    backoff chain on two train/test splits each of the Brown (news)
    and NPS Chat corpora, printing accuracy for every combination.
    """

    def report_chain(label, train, test, fallback):
        # Unigram -> Bigram -> Trigram backoff chain; the original
        # duplicated this block four times verbatim.
        uni = nltk.UnigramTagger(train, backoff=fallback)
        print(uni.evaluate(test))
        bi = nltk.BigramTagger(train, backoff=uni)
        print(bi.evaluate(test))
        tri = nltk.TrigramTagger(train, backoff=bi)
        print('Accuracy {}: '.format(label), tri.evaluate(test))

    tagged_brown = brown.tagged_sents(categories='news')
    results_brown = splitting(tagged_brown)
    train_brown1 = results_brown[0]
    train_brown2 = results_brown[1]
    test_brown1 = results_brown[2]
    test_brown2 = results_brown[3]

    tagged_chat = nps_chat.tagged_posts()
    results_chat = splitting(tagged_chat)
    train_chat1 = results_chat[0]
    train_chat2 = results_chat[1]
    test_chat1 = results_chat[2]
    test_chat2 = results_chat[3]

    default_tagger = nltk.DefaultTagger('NN')
    # NOTE(review): the original also called default_tagger.tag() on each
    # test set and discarded the result; those no-op calls were removed.
    print('Test for brown corpus 1 : {}'.format(
        default_tagger.evaluate(test_brown1)))
    print('Test for brown corpus 2 : {}'.format(
        default_tagger.evaluate(test_brown2)))
    print('Test for chat corpus 1 : {}'.format(
        default_tagger.evaluate(test_chat1)))
    print('Test for chat corpus 2 : {}'.format(
        default_tagger.evaluate(test_chat2)))

    report_chain('test brown 1', train_brown1, test_brown1, default_tagger)
    report_chain('test brown 2', train_brown2, test_brown2, default_tagger)
    report_chain('test chat 1', train_chat1, test_chat1, default_tagger)
    report_chain('test chat 2', train_chat2, test_chat2, default_tagger)
示例#3
0
 'English: Brown Corpus (simplified)':
     lambda: brown.tagged_sents(simplify_tags=True),
 'English: Brown Corpus (Press, simplified)':
     lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'], simplify_tags=True),
 'English: Brown Corpus (Religion, simplified)':
     lambda: brown.tagged_sents(categories='religion', simplify_tags=True),
 'English: Brown Corpus (Learned, simplified)':
     lambda: brown.tagged_sents(categories='learned', simplify_tags=True),
 'English: Brown Corpus (Science Fiction, simplified)':
     lambda: brown.tagged_sents(categories='science_fiction', simplify_tags=True),
 'English: Brown Corpus (Romance, simplified)':
     lambda: brown.tagged_sents(categories='romance', simplify_tags=True),
 'English: Brown Corpus (Humor, simplified)':
     lambda: brown.tagged_sents(categories='humor', simplify_tags=True),
 'English: NPS Chat Corpus':
     lambda: nps_chat.tagged_posts(),
 'English: NPS Chat Corpus (simplified)':
     lambda: nps_chat.tagged_posts(simplify_tags=True),
 'English: Wall Street Journal Corpus':
     lambda: treebank.tagged_sents(),
 'English: Wall Street Journal Corpus (simplified)':
     lambda: treebank.tagged_sents(simplify_tags=True),
 'Chinese: Sinica Corpus':
     lambda: sinica_treebank.tagged_sents(),
 'Chinese: Sinica Corpus (simplified)':
     lambda: sinica_treebank.tagged_sents(simplify_tags=True),
 'Dutch: Alpino Corpus':
     lambda: alpino.tagged_sents(),
 'Dutch: Alpino Corpus (simplified)':
     lambda: alpino.tagged_sents(simplify_tags=True),
 'Hindi: Indian Languages Corpus':
示例#4
0
def create_tagger():
    """Build a bigram tagger trained on the NPS Chat corpus, backing off
    to a unigram tagger and finally to a DefaultTagger guessing 'NN'."""
    training_posts = nps_chat.tagged_posts()
    fallback = nltk.DefaultTagger('NN')
    unigram = nltk.UnigramTagger(training_posts, backoff=fallback)
    bigram = nltk.BigramTagger(training_posts, backoff=unigram)
    return bigram
示例#5
0
import nltk
from nltk import FreqDist
from nltk.probability import ConditionalFreqDist
from nltk.corpus import brown as brown
from nltk.corpus import nps_chat as chat
from nltk import RegexpTagger
from nltk import UnigramTagger
from nltk import BigramTagger
from nltk import TrigramTagger

# Corpus sizes and tagged views used by splitSen() below.
sizeB = len(brown.tagged_sents())  # number of tagged sentences in Brown
sizeC = len(chat.tagged_posts())  # number of tagged posts in NPS Chat

brownTS = brown.tagged_sents()  # Brown as a list of tagged sentences
brownTW = brown.tagged_words(
)  # Brown as a flat list of (word, tag) pairs

chatTP = chat.tagged_posts(
)  # NPS Chat as a list of tagged posts
chatTW = chat.tagged_words(
)  # NPS Chat as a flat list of (word, tag) pairs


def splitSen(c, p):  # function to partition corpus
    """
    Split a corpus into a head/tail partition.

    :param c: corpus name, "brown" or "chat"
    :param p: fraction (0..1) of items placed in the first part
    :return: (head, tail) pair of tagged sentences/posts
    :raises ValueError: for an unknown corpus name
    """
    if c == "brown":
        t1 = brownTS[:int(sizeB * p)]
        t2 = brownTS[int(sizeB * p):]
        return t1, t2
    if c == "chat":
        t1 = chatTP[:int(sizeC * p)]
        t2 = chatTP[int(sizeC * p):]
        # BUG FIX: this branch previously fell through and implicitly
        # returned None.
        return t1, t2
    raise ValueError("unknown corpus name: %r" % (c,))
示例#6
0
 'English: Brown Corpus (Press, simplified)':
 lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'],
                            simplify_tags=True),
 'English: Brown Corpus (Religion, simplified)':
 lambda: brown.tagged_sents(categories='religion', simplify_tags=True),
 'English: Brown Corpus (Learned, simplified)':
 lambda: brown.tagged_sents(categories='learned', simplify_tags=True),
 'English: Brown Corpus (Science Fiction, simplified)':
 lambda: brown.tagged_sents(categories='science_fiction',
                            simplify_tags=True),
 'English: Brown Corpus (Romance, simplified)':
 lambda: brown.tagged_sents(categories='romance', simplify_tags=True),
 'English: Brown Corpus (Humor, simplified)':
 lambda: brown.tagged_sents(categories='humor', simplify_tags=True),
 'English: NPS Chat Corpus':
 lambda: nps_chat.tagged_posts(),
 'English: NPS Chat Corpus (simplified)':
 lambda: nps_chat.tagged_posts(simplify_tags=True),
 'English: Wall Street Journal Corpus':
 lambda: treebank.tagged_sents(),
 'English: Wall Street Journal Corpus (simplified)':
 lambda: treebank.tagged_sents(simplify_tags=True),
 'Chinese: Sinica Corpus':
 lambda: sinica_treebank.tagged_sents(),
 'Chinese: Sinica Corpus (simplified)':
 lambda: sinica_treebank.tagged_sents(simplify_tags=True),
 'Dutch: Alpino Corpus':
 lambda: alpino.tagged_sents(),
 'Dutch: Alpino Corpus (simplified)':
 lambda: alpino.tagged_sents(simplify_tags=True),
 'Hindi: Indian Languages Corpus':
 lambda: brown.tagged_sents(tagset='simple'),
 'English: Brown Corpus (Press, simplified)':
 lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'],
                            tagset='simple'),
 'English: Brown Corpus (Religion, simplified)':
 lambda: brown.tagged_sents(categories='religion', tagset='simple'),
 'English: Brown Corpus (Learned, simplified)':
 lambda: brown.tagged_sents(categories='learned', tagset='simple'),
 'English: Brown Corpus (Science Fiction, simplified)':
 lambda: brown.tagged_sents(categories='science_fiction', tagset='simple'),
 'English: Brown Corpus (Romance, simplified)':
 lambda: brown.tagged_sents(categories='romance', tagset='simple'),
 'English: Brown Corpus (Humor, simplified)':
 lambda: brown.tagged_sents(categories='humor', tagset='simple'),
 'English: NPS Chat Corpus':
 lambda: nps_chat.tagged_posts(),
 'English: NPS Chat Corpus (simplified)':
 lambda: nps_chat.tagged_posts(tagset='simple'),
 'English: Wall Street Journal Corpus':
 lambda: treebank.tagged_sents(),
 'English: Wall Street Journal Corpus (simplified)':
 lambda: treebank.tagged_sents(tagset='simple'),
 'Chinese: Sinica Corpus':
 lambda: sinica_treebank.tagged_sents(),
 'Chinese: Sinica Corpus (simplified)':
 lambda: sinica_treebank.tagged_sents(tagset='simple'),
 'Dutch: Alpino Corpus':
 lambda: alpino.tagged_sents(),
 'Dutch: Alpino Corpus (simplified)':
 lambda: alpino.tagged_sents(tagset='simple'),
 'Hindi: Indian Languages Corpus':
示例#8
0
 'English: Brown Corpus (Press, simplified)':
 lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'],
                            tagset='universal'),
 'English: Brown Corpus (Religion, simplified)':
 lambda: brown.tagged_sents(categories='religion', tagset='universal'),
 'English: Brown Corpus (Learned, simplified)':
 lambda: brown.tagged_sents(categories='learned', tagset='universal'),
 'English: Brown Corpus (Science Fiction, simplified)':
 lambda: brown.tagged_sents(categories='science_fiction',
                            tagset='universal'),
 'English: Brown Corpus (Romance, simplified)':
 lambda: brown.tagged_sents(categories='romance', tagset='universal'),
 'English: Brown Corpus (Humor, simplified)':
 lambda: brown.tagged_sents(categories='humor', tagset='universal'),
 'English: NPS Chat Corpus':
 lambda: nps_chat.tagged_posts(),
 'English: NPS Chat Corpus (simplified)':
 lambda: nps_chat.tagged_posts(tagset='universal'),
 'English: Wall Street Journal Corpus':
 lambda: treebank.tagged_sents(),
 'English: Wall Street Journal Corpus (simplified)':
 lambda: treebank.tagged_sents(tagset='universal'),
 'Chinese: Sinica Corpus':
 lambda: sinica_treebank.tagged_sents(),
 'Chinese: Sinica Corpus (simplified)':
 lambda: sinica_treebank.tagged_sents(tagset='universal'),
 'Dutch: Alpino Corpus':
 lambda: alpino.tagged_sents(),
 'Dutch: Alpino Corpus (simplified)':
 lambda: alpino.tagged_sents(tagset='universal'),
 'Hindi: Indian Languages Corpus':
示例#9
0
 "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(
     categories="religion", tagset="universal"
 ),
 "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(
     categories="learned", tagset="universal"
 ),
 "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
     categories="science_fiction", tagset="universal"
 ),
 "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(
     categories="romance", tagset="universal"
 ),
 "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(
     categories="humor", tagset="universal"
 ),
 "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
 "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(
     tagset="universal"
 ),
 "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
 "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(
     tagset="universal"
 ),
 "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
 "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(
     tagset="universal"
 ),
 "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
 "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(
     tagset="universal"
 ),
示例#10
0
# LookupTagger setup from NLTK Chapter 5
# For Brown Corpus

# Word frequencies over (almost) all of Brown; the slice drops one word.
fdist_brown = nltk.FreqDist(
    brown.words()[:int((len(brown.words()) -
                        1))])  # slicing to vary the size of the dataset
# For each word, the distribution of tags it occurs with.
cfdist_brown = nltk.ConditionalFreqDist(brown.tagged_words())
# Lookup table: the 200 most frequent words, each mapped to the tag it
# most often carries.
top_words_brown = fdist_brown.most_common(200)
most_likely_tags_brown = dict(
    (word, cfdist_brown[word].max()) for (word, _) in top_words_brown)
# UnigramTagger driven purely by the lookup table (model=...).
default_tagger_brown = UnigramTagger(model=most_likely_tags_brown)

# [train%, test%] configurations evaluated below.
splits = [[90, 10], [50, 50]]
correct_brown = brown.tagged_sents()[:int((
    len(brown.tagged_sents()) - 1))]  # slicing to vary the size of the dataset
correct_chat = chat.tagged_posts()[:int((len(chat.tagged_posts()) - 1))]

# RegexpTagger rules, tried in order; the final catch-all tags
# everything else as 'NN'.
patterns = [
    (r'.*ing$', 'VBG'),  # gerunds
    (r'.*ed$', 'VBD'),  # simple past
    (r'.*es$', 'VBZ'),  # 3rd singular present
    (r'.*ould$', 'MD'),  # modals
    (r'.*\'s$', 'NN$'),  # possessive nouns
    (r'.*s$', 'NNS'),  # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')  # nouns (default)
]

for split in splits:
    test_brown, train_brown = train_test_split(correct_brown,
                                               test_size=split[1] / 100,
示例#11
0
# Menu of selectable corpora: display label -> zero-argument loader.
# Loaders are lambdas so no corpus is read until it is actually chosen.
_DEFAULT = "English: Brown Corpus (Humor, simplified)"
# NOTE(review): tagset="simple" looks like a pre-NLTK-3 simplified-tagset
# name; current NLTK uses tagset="universal" — confirm against the NLTK
# version this file targets.
_CORPORA = {
    "Catalan: CESS-CAT Corpus (simplified)": lambda: cess_cat.tagged_sents(tagset="simple"),
    "English: Brown Corpus": lambda: brown.tagged_sents(),
    "English: Brown Corpus (simplified)": lambda: brown.tagged_sents(tagset="simple"),
    "English: Brown Corpus (Press, simplified)": lambda: brown.tagged_sents(
        categories=["news", "editorial", "reviews"], tagset="simple"
    ),
    "English: Brown Corpus (Religion, simplified)": lambda: brown.tagged_sents(categories="religion", tagset="simple"),
    "English: Brown Corpus (Learned, simplified)": lambda: brown.tagged_sents(categories="learned", tagset="simple"),
    "English: Brown Corpus (Science Fiction, simplified)": lambda: brown.tagged_sents(
        categories="science_fiction", tagset="simple"
    ),
    "English: Brown Corpus (Romance, simplified)": lambda: brown.tagged_sents(categories="romance", tagset="simple"),
    "English: Brown Corpus (Humor, simplified)": lambda: brown.tagged_sents(categories="humor", tagset="simple"),
    "English: NPS Chat Corpus": lambda: nps_chat.tagged_posts(),
    "English: NPS Chat Corpus (simplified)": lambda: nps_chat.tagged_posts(tagset="simple"),
    "English: Wall Street Journal Corpus": lambda: treebank.tagged_sents(),
    "English: Wall Street Journal Corpus (simplified)": lambda: treebank.tagged_sents(tagset="simple"),
    "Chinese: Sinica Corpus": lambda: sinica_treebank.tagged_sents(),
    "Chinese: Sinica Corpus (simplified)": lambda: sinica_treebank.tagged_sents(tagset="simple"),
    "Dutch: Alpino Corpus": lambda: alpino.tagged_sents(),
    "Dutch: Alpino Corpus (simplified)": lambda: alpino.tagged_sents(tagset="simple"),
    "Hindi: Indian Languages Corpus": lambda: indian.tagged_sents(files="hindi.pos"),
    "Hindi: Indian Languages Corpus (simplified)": lambda: indian.tagged_sents(files="hindi.pos", tagset="simple"),
    "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.tagged_sents(),
    "Portuguese: Floresta Corpus (Portugal, simplified)": lambda: floresta.tagged_sents(tagset="simple"),
    "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.tagged_sents(),
    "Portuguese: MAC-MORPHO Corpus (Brazil, simplified)": lambda: mac_morpho.tagged_sents(tagset="simple"),
    "Spanish: CESS-ESP Corpus (simplified)": lambda: cess_esp.tagged_sents(tagset="simple"),
}
示例#12
0
import nltk
import sklearn
from nltk.corpus import brown
from nltk.corpus import nps_chat as chat
from nltk.tag import DefaultTagger, RegexpTagger, UnigramTagger, BigramTagger
from sklearn.model_selection import train_test_split

#a
# Evaluate a DefaultTagger baseline on 90/10 and 50/50 splits of the
# Brown and NPS Chat corpora.
splits = [[90, 10], [50, 50]]
correct_brown = brown.tagged_sents()
correct_chat = chat.tagged_posts()
default_tagger = DefaultTagger("NN")

for split in splits:  # TODO: turn into a function for reuse in b)
    # BUG FIX: sklearn's train_test_split returns (train, test) in that
    # order; the original unpacked them as (test, train), so the "test"
    # evaluation actually ran on the training portion.
    train_brown, test_brown = train_test_split(correct_brown,
                                               test_size=split[1] / 100,
                                               shuffle=False)
    train_chat, test_chat = train_test_split(correct_chat,
                                             test_size=split[1] / 100,
                                             shuffle=False)

    # DefaultTagger requires no training; the original's .tag() calls on
    # the training data discarded their results and have been removed.
    print(
        f"The DefaultTagger accuracy for the Brown Corpus is {default_tagger.evaluate(test_brown)} using a {split[0]}/{split[1]} split."
    )
    print(
        f"The DefaultTagger accuracy for the NPS Chat Corpus is {default_tagger.evaluate(test_chat)} using a {split[0]}/{split[1]} split.\n"
    )

    #50/50 is better because the tagger doesn't "learn", so when the test data is increased (from 10%)
from nltk.corpus import brown, nps_chat
import nltk

# Initialize all training and test data
tokens_brown = brown.sents()  # untagged Brown sentences
tokens_nps_chat = nps_chat.posts()  # untagged NPS Chat posts
tagged_sents_brown = brown.tagged_sents()
tagged_posts_nps_chat = nps_chat.tagged_posts()

# Head/tail split points: *_09 is the 90/10 configuration, *_05 the
# 50/50 one (the test portion is the remaining tail in each case).
size_brown_09 = int(len(tagged_sents_brown) * 0.9)
size_brown_05 = int(len(tagged_sents_brown) * 0.5)
size_nps_chat_09 = int(len(tagged_posts_nps_chat) * 0.9)
size_nps_chat_05 = int(len(tagged_posts_nps_chat) * 0.5)
train_sents_brown_09 = tagged_sents_brown[:size_brown_09]
test_sents_brown_09 = tagged_sents_brown[size_brown_09:]
train_sents_brown_05 = tagged_sents_brown[:size_brown_05]
test_sents_brown_05 = tagged_sents_brown[size_brown_05:]

train_posts_nps_chat_09 = tagged_posts_nps_chat[:size_nps_chat_09]
test_posts_nps_chat_09 = tagged_posts_nps_chat[size_nps_chat_09:]
train_posts_nps_chat_05 = tagged_posts_nps_chat[:size_nps_chat_05]
test_posts_nps_chat_05 = tagged_posts_nps_chat[size_nps_chat_05:]

# Task a)
print("Task a)")
# Collect every tag occurrence in each corpus...
tags_brown = [tag for word, tag in brown.tagged_words()]
tags_nps_chat = [tag for word, tag in nps_chat.tagged_words()]

# ...and find the most common tag in each (results noted inline).
max_brown = nltk.FreqDist(tags_brown).max()  # NN
max_nps_chat = nltk.FreqDist(tags_nps_chat).max()  # UH
示例#14
0
10-19-40s_686posts.xml
10-19-adults_706posts.xml
10-24-40s_706posts.xml
10-26-teens_706posts.xml
11-06-adults_706posts.xml
11-08-20s_705posts.xml
11-08-40s_706posts.xml
11-08-adults_705posts.xml
11-08-teens_706posts.xml
11-09-20s_706posts.xml
11-09-40s_706posts.xml
11-09-adults_706posts.xml
11-09-teens_706posts.xml
'''


# putting all tagged posts from the nps_chat corpus into one list
nps_chat_tagged = list()

# FIX: converted Python 2 print statements to print() calls — the
# originals are syntax errors under Python 3, which the rest of this
# file (f-strings) targets.
for fileid in nps_chat.fileids():
    print(fileid)
    # extend() replaces the original per-post append loop; same result.
    nps_chat_tagged.extend(nps_chat.tagged_posts(fileid))
    print(str(len(nps_chat_tagged)))


print(nps_chat_tagged[0])
# tags can be retrieved in the same way as the Brown corpus

    
示例#15
0
 'English: Brown Corpus (simplified)':
     lambda: brown.tagged_sents(tagset='simple'),
 'English: Brown Corpus (Press, simplified)':
     lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'], tagset='simple'),
 'English: Brown Corpus (Religion, simplified)':
     lambda: brown.tagged_sents(categories='religion', tagset='simple'),
 'English: Brown Corpus (Learned, simplified)':
     lambda: brown.tagged_sents(categories='learned', tagset='simple'),
 'English: Brown Corpus (Science Fiction, simplified)':
     lambda: brown.tagged_sents(categories='science_fiction', tagset='simple'),
 'English: Brown Corpus (Romance, simplified)':
     lambda: brown.tagged_sents(categories='romance', tagset='simple'),
 'English: Brown Corpus (Humor, simplified)':
     lambda: brown.tagged_sents(categories='humor', tagset='simple'),
 'English: NPS Chat Corpus':
     lambda: nps_chat.tagged_posts(),
 'English: NPS Chat Corpus (simplified)':
     lambda: nps_chat.tagged_posts(tagset='simple'),
 'English: Wall Street Journal Corpus':
     lambda: treebank.tagged_sents(),
 'English: Wall Street Journal Corpus (simplified)':
     lambda: treebank.tagged_sents(tagset='simple'),
 'Chinese: Sinica Corpus':
     lambda: sinica_treebank.tagged_sents(),
 'Chinese: Sinica Corpus (simplified)':
     lambda: sinica_treebank.tagged_sents(tagset='simple'),
 'Dutch: Alpino Corpus':
     lambda: alpino.tagged_sents(),
 'Dutch: Alpino Corpus (simplified)':
     lambda: alpino.tagged_sents(tagset='simple'),
 'Hindi: Indian Languages Corpus':
import nltk
from nltk.corpus import treebank
from nltk.corpus import brown
from nltk.corpus import nps_chat
from nltk.corpus import conll2000
from nltk.corpus import ConllCorpusReader

# Materialize tagged corpora (universal tagset) as plain lists.
brown_fiction = list(
    brown.tagged_sents(categories='fiction', tagset='universal'))
brown_reviews = list(
    brown.tagged_sents(categories='reviews', tagset='universal'))
conll = list(conll2000.tagged_sents(tagset='universal'))
tree = list(treebank.tagged_sents(tagset='universal'))

# Twitter corpus read from a CoNLL-style file with word + POS columns.
columntypes = ['words', 'pos']
twitter_corpus = ConllCorpusReader("resources/",
                                   "twitter.conll",
                                   columntypes,
                                   tagset='en-tweet')
twitter = list(twitter_corpus.tagged_sents(tagset='universal'))

nps_raw = nps_chat.tagged_posts(tagset='universal')
# Drop tokens with an empty word from each post (comprehension instead
# of the original manual append loop; identical result).
nps = [[token for token in post if token[0]] for post in nps_raw]
 'English: Brown Corpus (simplified)':
     lambda: brown.tagged_sents(tagset='universal'),
 'English: Brown Corpus (Press, simplified)':
     lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'], tagset='universal'),
 'English: Brown Corpus (Religion, simplified)':
     lambda: brown.tagged_sents(categories='religion', tagset='universal'),
 'English: Brown Corpus (Learned, simplified)':
     lambda: brown.tagged_sents(categories='learned', tagset='universal'),
 'English: Brown Corpus (Science Fiction, simplified)':
     lambda: brown.tagged_sents(categories='science_fiction', tagset='universal'),
 'English: Brown Corpus (Romance, simplified)':
     lambda: brown.tagged_sents(categories='romance', tagset='universal'),
 'English: Brown Corpus (Humor, simplified)':
     lambda: brown.tagged_sents(categories='humor', tagset='universal'),
 'English: NPS Chat Corpus':
     lambda: nps_chat.tagged_posts(),
 'English: NPS Chat Corpus (simplified)':
     lambda: nps_chat.tagged_posts(tagset='universal'),
 'English: Wall Street Journal Corpus':
     lambda: treebank.tagged_sents(),
 'English: Wall Street Journal Corpus (simplified)':
     lambda: treebank.tagged_sents(tagset='universal'),
 'Chinese: Sinica Corpus':
     lambda: sinica_treebank.tagged_sents(),
 'Chinese: Sinica Corpus (simplified)':
     lambda: sinica_treebank.tagged_sents(tagset='universal'),
 'Dutch: Alpino Corpus':
     lambda: alpino.tagged_sents(),
 'Dutch: Alpino Corpus (simplified)':
     lambda: alpino.tagged_sents(tagset='universal'),
 'Hindi: Indian Languages Corpus':
示例#18
0
import nltk
from nltk.corpus import brown
from nltk import UnigramTagger
from nltk.corpus import nps_chat
from nltk import FreqDist, ConditionalFreqDist

# Materialize the corpora once (list(...) instead of the original
# identity comprehensions) and compute 90% / 50% split points.
brown_corpus_sents = list(brown.tagged_sents())
brown_spl_90 = int(90 * len(brown_corpus_sents) / 100)
brown_spl_50 = int(50 * len(brown_corpus_sents) / 100)
nps_chat_corpus_posts = list(nps_chat.tagged_posts())
nps_spl_90 = int(90 * len(nps_chat_corpus_posts) / 100)
nps_spl_50 = int(50 * len(nps_chat_corpus_posts) / 100)

# Head/tail splits; the test portion is always the remaining tail.
train_brown_50 = brown_corpus_sents[:brown_spl_50]
test_brown_50 = brown_corpus_sents[brown_spl_50:]
train_nps_50 = nps_chat_corpus_posts[:nps_spl_50]
test_nps_50 = nps_chat_corpus_posts[nps_spl_50:]
train_brown_90 = brown_corpus_sents[:brown_spl_90]
test_brown_10 = brown_corpus_sents[brown_spl_90:]
train_nps_90 = nps_chat_corpus_posts[:nps_spl_90]
test_nps_10 = nps_chat_corpus_posts[nps_spl_90:]


def get_lookup_tagger_accuracy(test_set, lookup_tagger_basis, corpus):
    words = [word for sent in lookup_tagger_basis for word in sent]
    fd = FreqDist(words)
    cfd = ConditionalFreqDist(corpus.tagged_words())
    most_freq_words = fd.most_common(200)
    likely_tags = dict(
        (word[0], cfd[word[0]].max()) for (word, _) in most_freq_words)
    baseline_tagger = UnigramTagger(model=likely_tags)
示例#19
0
 "English: Brown Corpus (Press, simplified)":
 lambda: brown.tagged_sents(categories=["news", "editorial", "reviews"],
                            tagset="universal"),
 "English: Brown Corpus (Religion, simplified)":
 lambda: brown.tagged_sents(categories="religion", tagset="universal"),
 "English: Brown Corpus (Learned, simplified)":
 lambda: brown.tagged_sents(categories="learned", tagset="universal"),
 "English: Brown Corpus (Science Fiction, simplified)":
 lambda: brown.tagged_sents(categories="science_fiction",
                            tagset="universal"),
 "English: Brown Corpus (Romance, simplified)":
 lambda: brown.tagged_sents(categories="romance", tagset="universal"),
 "English: Brown Corpus (Humor, simplified)":
 lambda: brown.tagged_sents(categories="humor", tagset="universal"),
 "English: NPS Chat Corpus":
 lambda: nps_chat.tagged_posts(),
 "English: NPS Chat Corpus (simplified)":
 lambda: nps_chat.tagged_posts(tagset="universal"),
 "English: Wall Street Journal Corpus":
 lambda: treebank.tagged_sents(),
 "English: Wall Street Journal Corpus (simplified)":
 lambda: treebank.tagged_sents(tagset="universal"),
 "Chinese: Sinica Corpus":
 lambda: sinica_treebank.tagged_sents(),
 "Chinese: Sinica Corpus (simplified)":
 lambda: sinica_treebank.tagged_sents(tagset="universal"),
 "Dutch: Alpino Corpus":
 lambda: alpino.tagged_sents(),
 "Dutch: Alpino Corpus (simplified)":
 lambda: alpino.tagged_sents(tagset="universal"),
 "Hindi: Indian Languages Corpus":
]
# Parallel corpus lists; index i refers to the same corpus in every list
# (0: Brown, 1: NPS Chat, 2: CoNLL-2000, 3: Penn Treebank). Tagged views
# use CONST_tagset, which is defined elsewhere in this file.
corp_words_tagged = [
    brown.tagged_words(tagset=CONST_tagset),
    nps_chat.tagged_words(tagset=CONST_tagset),
    conll2000.tagged_words(tagset=CONST_tagset),
    treebank.tagged_words(tagset=CONST_tagset)
]
corp_words_untagged = [
    brown.words(),
    nps_chat.words(),
    conll2000.words(),
    treebank.words()
]
# NPS Chat exposes posts rather than sentences, hence tagged_posts()/
# posts() in the sentence-level lists.
corp_sents_tagged = [
    brown.tagged_sents(tagset=CONST_tagset),
    nps_chat.tagged_posts(tagset=CONST_tagset),
    conll2000.tagged_sents(tagset=CONST_tagset),
    treebank.tagged_sents(tagset=CONST_tagset)
]
corp_sents_untagged = [
    brown.sents(),
    nps_chat.posts(),
    conll2000.sents(),
    treebank.sents()
]

# language tool spell checker (grammar/spell checking via LanguageTool)
lt_check = language_check.LanguageTool('en-US')

# pyenchant spell checker (currently disabled)
# pe_check = enchant.Dict('en_US')