def doc_tagging(textfile, train_test_valid):
    """Regex-POS-tag the contents of *textfile*, replace selected word classes
    with placeholder tokens, write the result to ``brown_<split>.txt``, and
    build a Vocabulary over the placeholder text.

    Parameters
    ----------
    textfile : str
        Path of the plain-text file to tokenize and tag.
    train_test_valid : str
        Split name ("train"/"test"/"valid"); used in the output filename and
        passed to ``Vocabulary``.
    """
    # `with` closes the handle even if tokenization below raises.
    with open(textfile) as fh:
        file_content = fh.read()
    text = nltk.word_tokenize(file_content)

    # Pattern order matters: the catch-all NN rule must stay last.
    patterns = [
        (r'^(19|20)\d\d$', 'YY'),                # years
        (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),        # cardinal numbers ('.' escaped: the
                                                 # old pattern matched e.g. "1a5")
        (r'(The|the|A|a|An|an)$', 'AT'),         # articles
        (r'.*able$', 'JJ'),                      # adjectives
        (r'.*ly$', 'RB'),                        # adverbs
        (r'(He|he|She|she|It|it|I|me|Me|You|you)$', 'PRP'),  # pronouns
        (r'(His|his|Her|her|Its|its)$', 'PRP$'),             # possessive
        (r'(my|Your|your|Yours|yours)$', 'PRP$'),            # possessive
        # WARNING: keep the default rule at the end
        (r'.*', 'NN')                            # nouns (default)
    ]
    regexp_tagger = nltk.RegexpTagger(patterns)
    tags = regexp_tagger.tag(text)

    # Replace words of these tag classes with placeholder tokens; everything
    # else is kept verbatim.
    placeholder = {
        'JJ': '<ADJECTIVE>',
        'CD': '<NUMBER>',
        'YY': '<YEAR>',
        'PRP': '<PRONOUN>',
    }
    new_tokens = []
    for word, tag in tags:
        new_tokens.append(placeholder.get(tag, word))
        # Original behavior: no separating space after these residue tokens.
        if word not in ('<', '/s', 's', '@', '-'):
            new_tokens.append(" ")

    # Write the placeholder text to brown_<split>.txt; `with` guarantees close.
    filename = "brown_" + train_test_valid + ".txt"
    with open(filename, "wt") as out:
        out.writelines(new_tokens)

    voc = Vocabulary(train_test_valid)
    # NOTE(review): `new_tokens` holds individual tokens and " " separators,
    # not sentences — confirm Vocabulary.add_sentence is meant to be fed
    # token-by-token like this.
    for sent in new_tokens:
        voc.add_sentence(sent)
    tagged_vocab = [voc.to_word(idx) for idx in range(voc.num_words)]
    print("Vocab Count for ", train_test_valid, ": ", len(tagged_vocab))
# coding=UTF-8
import nltk
from nltk.corpus import brown
from TextSummarization.Summarizer import SummaryTool
from nltk.tokenize import word_tokenize

# This is our fast Part of Speech tagger
#############################################################################
# Backoff chain: regex rules -> unigram -> bigram, trained on three Brown
# categories. The regex rules only fire for words the n-gram taggers have
# never seen.
brown_train = brown.tagged_sents(categories=['news', 'editorial', 'reviews'])
regexp_tagger = nltk.RegexpTagger([
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers ('.' escaped: the
                                       # old pattern matched e.g. "1a5")
    (r'(-|:|;)$', ':'),                # punctuation
    (r'\'*$', 'MD'),                   # quote runs -> modal (legacy rule)
    (r'(The|the|A|a|An|an)$', 'AT'),   # articles
    (r'.*able$', 'JJ'),                # adjectives
    (r'^[A-Z].*$', 'NNP'),             # capitalized -> proper noun
    (r'.*ness$', 'NN'),                # nouns formed from adjectives
    (r'.*ly$', 'RB'),                  # adverbs
    (r'.*s$', 'NNS'),                  # plural nouns
    (r'.*ing$', 'VBG'),                # gerunds
    (r'.*ed$', 'VBD'),                 # past tense
    (r'.*', 'NN'),                     # default: noun
])
unigram_tagger = nltk.UnigramTagger(brown_train, backoff=regexp_tagger)
bigram_tagger = nltk.BigramTagger(brown_train, backoff=unigram_tagger)

#############################################################################
# This is our semi-CFG; Extend it according to your own needs
#############################################################################
# Maps "TAG1+TAG2" of adjacent words to the tag of the merged phrase.
cfg = {
    "NNP+NNP": "NNP",
    "NN+NN": "NNI",
    "NNI+NN": "NNI",
    "JJ+JJ": "JJ",
    "JJ+NN": "NNI",
}
# Demo: a regex fallback tagger and a most-frequent-100-words unigram
# baseline, both evaluated on the Brown 'news' category.
brown_tagged_sents = brown.tagged_sents(categories = 'news')
brown_sents = brown.sents(categories = 'news')
"""add patterns for tagging"""
patterns = [
    (r'.*ing$', 'VBG'),               # gerunds
    (r'.*ed$', 'VBD'),                # simple past
    (r'.*es$', 'VBZ'),                # 3rd singular present
    (r'.*ould$', 'MD'),               # modals
    (r'.*\'s$', 'NN$'),               # possessive nouns
    (r'.*s$', 'NNS'),                 # plural nouns
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
                                      # NOTE(review): '.' is unescaped here, so
                                      # e.g. "1a5" also matches — likely meant \.
    (r'.*', 'NN')                     # nouns (default)
]
regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagger.tag(brown_sents[3])
# NOTE(review): Tagger.evaluate() is deprecated in newer NLTK releases in
# favor of accuracy(); confirm the installed version still provides it.
regexp_tagger.evaluate(brown_tagged_sents)
# Look up the single most likely tag for each of the 100 most frequent words.
fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
most_freq_words = fd.most_common(100)
likely_tags = dict((word, cfd[word].max()) for (word, _) in most_freq_words)
# Unigram lookup tagger over those 100 words, defaulting everything else to NN.
baseline_tagger = nltk.UnigramTagger(model=likely_tags, backoff = nltk.DefaultTagger('NN'))
baseline_tagger.evaluate(brown_tagged_sents)
sent = brown.sents(categories='news')[3]
baseline_tagger.tag(sent)
def performance(cfd, wordlist):
    # (definition continues beyond this chunk — body shown here is incomplete)
    lt = dict((word, cfd[word].max()) for word in wordlist)
esp = nltk.corpus.cess_esp.tagged_words() #size = int(len(listaOraciones) * 0.9) train_sents = listaOraciones[10] nltk.corpus.cess_esp.tagged_words() patterns = [ (r'.*o$', 'NCMS'), # Sustantivo Masculino (r'.*a$', 'NCFS'), # Sustantivo Femenino (r'.*as$', 'NCFP'), (r'.*os$', 'NCMP') ] regexp_tagger = nltk.RegexpTagger(patterns) cess_tagged_sents = nltk.corpus.cess_esp.tagged_sents() oracion = listaOraciones[10] oracionTokenizada = nltk.Text(nltk.word_tokenize(oracion)) var = regexp_tagger.tag(oracionTokenizada) """ Training nltk.UnigramTagger usando oraciones desde cess_esp """ unigram_tagger = nltk.UnigramTagger(cess_tagged_sents, backoff=nltk.RegexpTagger(patterns)) example = unigram_tagger.tag(oracionTokenizada) print(example) """print(unigram_tagger.evaluate(train_sents))"""
####
#### FUNCTION TO SAVE WORDS AND ITS TAGS IN A DICTIONARY ("results" variable)
def write_words_tagged(results):
    """Write (word, tag) pairs to 'output-test.txt', one 'word - tag' line each.

    Parameters
    ----------
    results : iterable of (str, str)
        Pairs of (word, tag) to persist.
    """
    # `with` closes the file even if a malformed pair raises mid-loop.
    with open('output-test.txt', 'w') as file_output:
        for word, tag in results:
            file_output.write(word + ' - ' + tag + '\n')


if __name__ == '__main__':
    # Read and tokenize the test text; the handle is closed before tagging.
    with open('texto_test.txt', encoding='utf8') as f:
        words = nltk.word_tokenize(f.read())
    words = [word.lower() for word in words]
    #print(len(words))  #total words: 662
    #fd = nltk.FreqDist(word.lower() for word in words)
    #fdf= fd.most_common(100)
    # NOTE(review): this shadows the builtin `dict`; kept as-is because
    # helpers defined elsewhere in the file may read the global by this name —
    # rename everywhere at once when touching those helpers.
    dict = load_dict()
    p = load_regex()
    rt = nltk.RegexpTagger(p)
    taggedText = rt.tag(words)
    results = save_words_tagged(taggedText)
    #print(save_words_tagged(taggedText))
    write_words_tagged(results)
def __init__(self):
    """Build the POS tagger and the chunk parser from the module-level
    `patterns` and `grammar` definitions and attach them to this instance."""
    pos_tagger = nltk.RegexpTagger(patterns)
    phrase_chunker = nltk.RegexpParser(grammar)
    self.tagger = pos_tagger
    self.chunker = phrase_chunker