def regex_tag():
    """Tag two sample sentences with a RegexpTagger and print its score.

    Builds a ``RegexpTagger`` from a fixed list of ``(regex, tag)`` pairs,
    prints the tagged tokens of both sample sentences, and finally prints
    the value returned by ``evaluate`` on ``brown_tagged_sents``.

    Relies on ``RegexpTagger``, ``word_tokenize`` and ``brown_tagged_sents``
    being provided by the enclosing module. Returns ``None``.
    """
    raw = 'I am applying for AIT because I can be with my parents here and I am already granted a scholarship'
    raw_incorrect = 'I love AIT because AIT is interesting and professors here give a lot of challenging assignment'

    # The list is ordered from specific suffixes down to the catch-all, so
    # the final '.*' entry only applies when nothing earlier matched.
    patterns = [
        (r'.*ing$', 'VBG'),                # gerunds
        (r'.*ed$', 'VBD'),                 # simple past
        (r'.*es$', 'VBZ'),                 # 3rd singular present
        (r'.*ould$', 'MD'),                # modals
        (r'.*\'s$', 'NN$'),                # possessive nouns
        (r'.*s$', 'NNS'),                  # plural nouns
        # The decimal point is escaped so it matches a literal '.' only;
        # unescaped, tokens such as '1a5' were tagged as cardinal numbers.
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN'),                     # nouns (default)
    ]
    regexp_tagger = RegexpTagger(patterns)

    tagged = regexp_tagger.tag(word_tokenize(raw))
    tagged_incorrect = regexp_tagger.tag(word_tokenize(raw_incorrect))
    print(tagged)
    print(tagged_incorrect)

    score = regexp_tagger.evaluate(brown_tagged_sents)
    print(score)
(r'.*ould$', 'MD'), #modal (r'.*\'s$', 'NN$'), # possessive nouns (r'.*s$', 'NNS'), #plural nouns (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers (r'.*', 'NN') #nouns (default) ] regexp_tagger = RegexpTagger(patterns) uniB = UnigramTagger(brownT90, backoff=defaultTB90) biB = BigramTagger(brownT90, backoff=uniB) triB = TrigramTagger(brownT90, backoff=biB) uniC = UnigramTagger(chatT50, backoff=defaultTChat50) biC = BigramTagger(chatT50, backoff=uniC) triC = TrigramTagger(chatT50, backoff=uniC) print("Regextag50/50: ", regexp_tagger.evaluate(brownT50)) print("Default: ", defaultTB90.evaluate(brownT50)) print("Bigram Brown 50/50: ", BigramTagger(brownT50, backoff=defaultTB50).evaluate(brownT50)) print("Default: ", defaultTB50.evaluate(brownT50)) print("Bigram Brown 90/10: ", BigramTagger(brownT90, backoff=defaultTB90).evaluate(brownT90)) print("Default: ", defaultTB90.evaluate(brownT90)) print("Unigram chat 50/50: ", UnigramTagger(chatT50, backoff=defaultTChat50).evaluate(chatT50)) print("Default: ", defaultTChat50.evaluate(chatT50)) print("Unigram chat 90/10: ",
# Apply the *RegexpTagger* to a single sentence of the Brown corpus: the
# sentence at index 3 (i.e. the fourth sentence).

# In[5]:

regexp_tagger.tag(brown.sents()[3])

# Evaluate the tagger using the category _news_ of the Brown corpus. The
# `evaluate()` method returns the accuracy (i.e. the rate of correct tag
# assignments) of the tagger on this test corpus.

# In[6]:

brown_tagged_sents = brown.tagged_sents(categories='news')
print(regexp_tagger.evaluate(brown_tagged_sents))

# ## Unigram Tagger

# In[7]:

from nltk import UnigramTagger, DefaultTagger, BigramTagger
from nltk import FreqDist, ConditionalFreqDist

# A UnigramTagger requires a tagged corpus. From the tagged corpus it learns
# a mapping from word to POS tag by determining, for each word, the most
# frequent tag in the corpus. The trained tagger then assigns to each word
# the most frequent POS tag as determined in the training corpus.
#
# In this notebook the POS-tagged Brown Corpus is applied. The tagset used in
# this corpus is quite sophisticated. It can be obtained by the following command:
dt.evaluate(test_data) patterns = [ (r'.*ing$', 'VBG'), # gerunds (r'.*ed$', 'VBD'), # simple past (r'.*es$', 'VBZ'), # 3rd singular present (r'.*ould$', 'MD'), # modals (r'.*\'s$', 'NN$'), # possessive nouns (r'.*s$', 'NNS'), # plural nouns (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'.*', 'NN') # nouns (default) ... ] rt = RegexpTagger(patterns) rt.evaluate(test_data) ut = UnigramTagger(train_data) bt = BigramTagger(train_data) tt = TrigramTagger(train_data) ut.evaluate(test_data) def combined_tagger(train_data, taggers, backoff=None): for tagger in taggers: backoff = tagger(train_data, backoff=backoff) return backoff ct = combined_tagger(train_data=train_data,
(r'.*s$', 'NNS'), # plural nouns (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'.*', 'NN') # nouns (default) ] additions = [ (r'[.?;!:]', '.'), ('\\($', '('), (r'.*ly$', 'ADV'), ('n[o\']t$', '*'), (r'^,$', ',') ] ret = RegexpTagger(patterns) print ret.evaluate(brown.tagged_sents(categories='news')) for pattern in additions: patterns.insert(-1, pattern) print "added pattern {}".format(pattern) ret = RegexpTagger(patterns) print ret.evaluate(brown.tagged_sents(categories='news')) # 0.203263917895 # added pattern ('[.?;!:]', '.') # 0.247538635957 # added pattern ('\\($', '(') # 0.24901048193 # added pattern ('.*ly$', 'ADV') # 0.248314338564 # added pattern ("n[o']t$", '*')