import pickle

import nltk
from nltk.tag import UnigramTagger


def pos_tag(pos_type, tokenized_sent):
    if pos_type == 'unigram':
        # Unigram tagger trained on pre-pickled Brown training sentences.
        with open('res/brown_train.pkl', 'rb') as f:
            brown_train = pickle.load(f)
        unigram_tagger = UnigramTagger(brown_train)
        return unigram_tagger.tag(tokenized_sent)
    elif pos_type == 'max_pos':
        # NLTK's recommended default tagger.
        return nltk.pos_tag(tokenized_sent)
import nltk
from nltk.corpus import brown
from nltk.tag import UnigramTagger


def tag_essay(path_to_file, simplify="F"):
    with open(path_to_file, "r") as fin:
        tokenized_essay = fin.read().split()
        # tokenized_essay = nltk.word_tokenize(fin.read())
        tokenized_essay = [
            word.lower().replace(",", "").replace(".", "").replace("?", "")
                .replace("!", "").replace(";", "").replace(":", "").replace('"', "")
            for word in tokenized_essay
        ]

    tagger = UnigramTagger(brown.tagged_sents())
    with open(path_to_file + ".pos", "w") as fout:
        tagged_words = tagger.tag(tokenized_essay)
        if simplify == "T":
            # simplify_tagset() is assumed to be defined elsewhere in this module.
            tagged_words = simplify_tagset(tagged_words)
        nword = 0
        typos = 0
        for word, tag in tagged_words:
            if word != "":
                nword += 1
                if tag is None:  # unknown to the Brown-trained tagger: counted as a typo
                    typos += 1
                fout.write(word + ' , ' + str(tag) + "\n")

    with open(path_to_file + ".typos", "w") as fout:
        fout.write("{} , {}".format(typos, typos / nword))
    print("finish {}".format(path_to_file))
def tag_words(self, words, sents):
    # Train on the Penn Treebank sample and evaluate on a held-out slice,
    # following the commented-out lines of the original.
    train_sents = treebank.tagged_sents()[:3000]
    tagger = UnigramTagger(train_sents)
    test_sents = treebank.tagged_sents()[3000:]
    # print(tagger.tag(words))
    print(tagger.tag(sents[0]))
    print("accuracy: " + str(tagger.evaluate(test_sents)))
class FeaturesetExtractor():
    def __init__(self):
        # The *_WORD constants are assumed to be paths to one-word-per-line lexicon files.
        self.neg_words = [line.rstrip('\n') for line in open(NEG_WORD)]
        self.pos_words = [line.rstrip('\n') for line in open(POS_WORD)]
        self.anger_words = [line.rstrip('\n') for line in open(ANGER_WORD)]
        self.fear_words = [line.rstrip('\n') for line in open(FEAR_WORD)]
        # Assumed HAPPY_WORD here; the original re-read NEG_WORD, which looks like a copy-paste slip.
        self.happy_words = [line.rstrip('\n') for line in open(HAPPY_WORD)]
        self.sad_words = [line.rstrip('\n') for line in open(SAD_WORD)]
        self.tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])

    def get_featureset(self, data_element):
        mapFeatureset = {}
        size = len(data_element.clean_text)
        word = data_element.clean_text
        list_word = word.split(" ")
        raw = data_element.raw_text
        list_word_raw = raw.split(" ")

        tot_pos_words = len(set(list_word) & set(self.pos_words))
        tot_neg_words = len(set(list_word) & set(self.neg_words))
        list_anger = tuple(set(list_word) & set(self.anger_words))
        list_fear = tuple(set(list_word) & set(self.fear_words))
        list_happy = tuple(set(list_word) & set(self.happy_words))
        list_sad = tuple(set(list_word) & set(self.sad_words))
        exclamation_count = raw.count("!")
        question_count = raw.count("?")
        uppercase_count = sum(1 for c in raw if c.isupper())

        mapFeatureset["bias"] = 1
        mapFeatureset["word"] = tuple(list_word)
        mapFeatureset["neg_words"] = tot_neg_words
        mapFeatureset["pos_words"] = tot_pos_words
        mapFeatureset["exclamation_count"] = exclamation_count
        mapFeatureset["question_count"] = question_count
        mapFeatureset["list_happy"] = list_happy
        mapFeatureset["list_sad"] = list_sad
        mapFeatureset["list_fear"] = list_fear
        mapFeatureset["list_anger"] = list_anger

        pos_tag_temp = self.tagger.tag(word.split(" "))
        list_pos_tag = [element[1] for element in pos_tag_temp]
        mapFeatureset["pos_tag"] = tuple(list_pos_tag)
        return mapFeatureset
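# A hypothetical smoke test for FeaturesetExtractor (names and sample text are my own; it assumes
# the *_WORD constants above point to existing lexicon files). A data_element only needs
# `clean_text` and `raw_text` string attributes.
from collections import namedtuple

DataElement = namedtuple('DataElement', ['clean_text', 'raw_text'])

extractor = FeaturesetExtractor()
element = DataElement(clean_text="so happy about this", raw_text="SO happy about this!!")
features = extractor.get_featureset(element)
print(features["pos_words"], features["exclamation_count"], features["pos_tag"])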
def tag_penn(words):
    """
    Tags a list of words with a unigram tagger trained on the Penn Treebank
    tagged-sentence corpus.

    Parameters
    ----------
    words: A list of strings.

    Returns
    -------
    A list of tuples of (str, str)
    """
    pt_tagger = UnigramTagger(treebank.tagged_sents())
    tags = pt_tagger.tag(words)
    return tags
def tag_linked(words, default_tag='INFO'):
    """
    Tags a list of words with a unigram tagger trained on the Penn Treebank
    tagged-sentence corpus. Uses a DefaultTagger backoff to assign
    ``default_tag`` to any word missed by the Treebank tagger.

    Parameters
    ----------
    words: A list of strings.
    default_tag: Tag used for words unknown to the Treebank tagger.

    Returns
    -------
    A list of tuples of (str, str)
    """
    default_tagger = DefaultTagger(default_tag)
    # Equivalent to the original's manual wiring of pt_tagger._taggers = [pt_tagger, default_tagger].
    pt_tagger = UnigramTagger(treebank.tagged_sents(), backoff=default_tagger)
    tags = pt_tagger.tag(words)
    return tags
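# A quick hedged check of tag_linked (example tokens are my own): words the Treebank sample
# has never seen should come back with the default 'INFO' tag instead of None.
print(tag_linked(['The', 'blockchain', 'emoji', 'went', 'viral']))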
def get_words_simple(text_string):
    """
    Gets a list of tagged words from an input string using whitespace-based
    tokenisation and a unigram PoS tagger
    """
    # get trained Unigram tagger
    print('Loading unigram tagger...')
    train_sents = treebank.tagged_sents()
    unigram_tagger = UnigramTagger(train_sents)

    # stripping punctuation
    # string.translate() takes a dictionary as input.
    # The dictionary mapping ordinal chars to None is created in place:
    text_string = text_string.translate(
        {ord(c): None for c in CHARS_TO_DELETE})
    words = text_string.split()  # crude tokenisation, keeps contractions

    english_stops = stopwords.words('english')
    stops_set = set(english_stops + ADDITIONAL_STOPS)
    cleaned_words = []
    for w in words:
        if w not in stops_set and w not in string.punctuation:
            cleaned_words.append(w)
    return unigram_tagger.tag(cleaned_words)
class PyTenseShift(object):
    """Initialization of PyTenseShift objects.

    The important part when you use PlPyTenseShift is that we allow you to
    implement your own tagger to optimize your results in translating from
    present to past tense. To do so, implement the tagger interface and
    change the second line of this code.
    """

    def __init__(self, corpus, isPl):
        if isPl:
            self.tagger = FirstTagger(corpus)
        else:
            dtag = DefaultTagger("NN")
            self.__utag = UnigramTagger(corpus.tagged_sents(), backoff=dtag)

    def _tokenize(self, tense, isPl):
        """Tokenize the input sentence into words.

        This kind of representation is easier to evaluate.
        """
        if isPl:
            return self.tagger.tag(tense)
        else:
            return self.__utag.tag(tokenize(tense))

    def getPastTense(self, tense):
        """Translates a sentence given in present tense into past tense.

        Args:
            tense (str): Sentence to translate

        Returns:
            str. Sentence in past tense
        """
        raise NotImplementedError("abstract method")
import nltk
from nltk.corpus import brown
from nltk.tag import UnigramTagger

tagger = UnigramTagger(brown.tagged_sents(categories='news')[:700])
sentence = ['John', 'and', 'Smith', 'went', 'to', 'NY', 'and', 'Germany']
for word, tag in tagger.tag(sentence):
    print(word, '->', tag)
class FirstTagger(TaggerInterface):
    def __init__(self, corpus):
        dtag = DefaultTagger("NN")
        self.__utag = UnigramTagger(corpus.tagged_sents(), backoff=dtag)

    # Mappings from single-character positions of the corpus tag to the unified (Polish) format.
    _CLASS = {'V': 'czasownik', 'S': 'rzeczownik', 'A': 'przymiotnik', 'N': 'liczebnik',
              'Z': 'zaimek', 'D': 'przysłówek', 'P': 'przyimek', 'C': 'spójnik',
              'I': 'wykrzyknik', 'T': 'partykuła'}
    _NUMBER = {'S': 'pojedyńcza', 'P': 'mnoga'}
    _CASE = {'N': 'mianownik', 'G': 'dopełniacz', 'D': 'celownik', 'A': 'biernik',
             'I': 'narzędnik', 'L': 'miejscownik', 'V': 'wołacz'}
    _GENDER = {'M': 'm', 'P': 'm', 'A': 'm', 'I': 'm', 'F': 'ż', 'N': 'n',
               'O': 'm', 'R': 'ż', 'T': 'ż'}
    _PERSON = {'1': 'pierwsza', '2': 'druga', '3': 'trzecia', 'I': 'bezokolicznik',
               'B': 'bezosobnik', 'U': 'imiesłów', 'W': 'imiesłów'}
    _TENSE = {'T': 'teraźniejszy', 'P': 'przeszły', 'F': 'przyszły'}
    _MOOD = {'O': 'oznajmujący', 'P': 'przypuszczający', 'R': 'rozkazujący'}
    _ASPECT = {'D': 'dokonane', 'N': 'niedokonane'}

    def tag(self, tense):
        """Translates tags generated by the tagger into the unified format.

        Args:
            tense: sentence to tag

        Returns:
            list of tuples (word, dict describing its form in the unified format)
        """
        words = self.__utag.tag(tokenize(tense))
        for i, (word, form) in enumerate(words):
            word_info = {'klasa': self._CLASS.get(form[0], 'nieznany')}
            if len(form) >= 2 and form[1] in self._NUMBER:
                word_info['liczba'] = self._NUMBER[form[1]]
            if len(form) >= 3 and form[2] in self._CASE:
                word_info['przypadek'] = self._CASE[form[2]]
            if len(form) >= 4 and form[3] in self._GENDER:
                word_info['rodzaj'] = self._GENDER[form[3]]
            if len(form) >= 6 and form[5] in self._PERSON:
                word_info['osoba'] = self._PERSON[form[5]]
            if len(form) >= 7 and form[6] in self._TENSE:
                word_info['czas'] = self._TENSE[form[6]]
            if len(form) >= 8 and form[7] in self._MOOD:
                word_info['tryb'] = self._MOOD[form[7]]
            if len(form) >= 9 and form[8] in self._ASPECT:
                word_info['aspekt'] = self._ASPECT[form[8]]
            words[i] = (word, word_info)
        return words
# tagger.batch_tag([['Hello', 'world', '.'], ['How', 'are', 'you', '?']])

# --------------------------------------------------------------------------------
# Training taggers
# --------------------------------------------------------------------------------

# So far so good. Next we have to train taggers.

# Unigram, training on the Treebank corpus.
from nltk.tag import UnigramTagger
from nltk.corpus import treebank

train_sents = treebank.tagged_sents()[:3000]
unigram_tagger = UnigramTagger(train_sents)

# Try it on our word_list.
unigram_tagger.tag(word_list)

# Backoff taggers - a hierarchy of taggers: the first tags all it can, then the next
# takes a stab at everything still tagged None, then the next, etc.

# Unigram with Default as backoff:
train_sents = treebank.tagged_sents()
unigram_tagger = UnigramTagger(train_sents, backoff=default_tagger)

# Add in contextual taggers:
# - bigram - current word plus previous token.
# - trigram - current word plus previous two tokens.
from nltk.tag import BigramTagger, TrigramTagger

bitagger = BigramTagger(train_sents)
tritagger = TrigramTagger(train_sents)
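# The backoff hierarchy described above is only partially wired up (unigram -> default).
# A minimal sketch, my own addition, of the full chain on the same Treebank split: the trigram
# tagger backs off to the bigram, which backs off to the unigram, which backs off to a default 'NN' tagger.
from nltk.tag import DefaultTagger

train_sents = treebank.tagged_sents()[:3000]
test_sents = treebank.tagged_sents()[3000:]

default_tagger = DefaultTagger('NN')
unigram_backoff = UnigramTagger(train_sents, backoff=default_tagger)
bigram_backoff = BigramTagger(train_sents, backoff=unigram_backoff)
trigram_backoff = TrigramTagger(train_sents, backoff=bigram_backoff)

# The chained tagger typically scores higher than any single tagger alone.
print(trigram_backoff.evaluate(test_sents))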
def TaggerOnline(tokens):
    etiq1 = DefaultTagger('N')
    sentencas_treinadoras = mac_morpho.tagged_sents()[::]
    etiq2 = UnigramTagger(sentencas_treinadoras, backoff=etiq1)
    tagsTokens = etiq2.tag(tokens)
    return tagsTokens
rt.tag(nltk.word_tokenize(sentence))

#%%
## N-gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# testing performance of unigram tagger
print('unigram tagger: ')
print(ut.evaluate(test_data))
print(ut.tag(nltk.word_tokenize(sentence)))

# testing performance of bigram tagger
print('\nbigram tagger:')
print(bt.evaluate(test_data))
print(bt.tag(nltk.word_tokenize(sentence)))

# testing performance of trigram tagger
print('\ntrigram tagger:')
print(tt.evaluate(test_data))
print(tt.tag(nltk.word_tokenize(sentence)))

#%%
# combined tagger: chain a list of tagger classes, each backing off to the previously built tagger
def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff
import nltk
from nltk.corpus import treebank
from nltk.tag import UnigramTagger

unitag = UnigramTagger(model={'Vinken': 'NN'})
print(unitag.tag(treebank.sents()[0]))
# Keep only columns whose word is tagged as a noun (NN).
for word in cleaned_bow.columns:
    pos = pos_tag([word])[0][1]   # tag the word itself, not its individual characters
    if pos != 'NN':
        nouns_bow = nouns_bow.drop(word, axis=1)

topic_words(NMF_vars(10, nouns_bow)[0], nouns_bow)
topic_words(
    NMF_vars(5, nouns_bow.drop('girl', axis=1))[0],
    nouns_bow.drop('girl', axis=1))

# try a different tagger
nouns_bow_2 = cleaned_bow
tagger = UnigramTagger(brown.tagged_sents())
for word in cleaned_bow.columns:
    pos = tagger.tag([word])[0][1]
    if pos != 'NN':
        nouns_bow_2 = nouns_bow_2.drop(word, axis=1)

for num in range(2, 6):
    topic_words(NMF_vars(num, nouns_bow_2)[0], nouns_bow_2)

topic_words(NMF_vars(10, nouns_bow_2)[0], nouns_bow_2)

# remove the word 'total'
topic_words(
    NMF_vars(10, nouns_bow_2.drop('total', axis=1))[0],
    nouns_bow_2.drop('total', axis=1))

for num in range(12, 16):
    topic_words(NMF_vars(num, nouns_bow_2)[0], nouns_bow_2)
import nltk
from collections import Counter
from nltk import pos_tag
from nltk.corpus import brown, treebank
from nltk.tag import UnigramTagger


def main():
    # Brown corpus tag -> description lookup.
    brownDict = {
        '.': 'sentence closer (. ; ? *)', '(': 'left paren', ')': 'right paren',
        '*': 'not, nt', '--': 'dash', ',': 'comma', ':': 'colon',
        'ABL': 'pre-qualifier (quite, rather)', 'ABN': 'pre-quantifier (half, all)',
        'ABX': 'pre-quantifier (both)', 'AP': 'post-determiner (many, several, next)',
        'AT': 'article (a, the, no)', 'BE': 'be', 'BED': 'were', 'BEDZ': 'was',
        'BEG': 'being', 'BEM': 'am', 'BEN': 'been', 'BER': 'are, art', 'BEZ': 'is',
        'CC': 'coordinating conjunction (and, or)', 'CD': 'cardinal numeral (one, two, 2, etc.)',
        'CS': 'subordinating conjunction (if, although)', 'DO': 'do', 'DOD': 'did', 'DOZ': 'does',
        'DT': 'singular determiner/quantifier (this, that)',
        'DTI': 'singular or plural determiner/quantifier (some, any)',
        'DTS': 'plural determiner (these, those)', 'DTX': 'determiner/double conjunction (either)',
        'EX': 'existential there', 'FW': 'foreign word (hyphenated before regular tag)',
        'HV': 'have', 'HVD': 'had (past tense)', 'HVG': 'having', 'HVN': 'had (past participle)',
        'IN': 'preposition', 'JJ': 'adjective', 'JJR': 'comparative adjective',
        'JJS': 'semantically superlative adjective (chief, top)',
        'JJT': 'morphologically superlative adjective (biggest)',
        'MD': 'modal auxiliary (can, should, will)', 'NC': 'cited word (hyphenated after regular tag)',
        'NN': 'singular or mass noun', 'NN$': 'possessive singular noun', 'NNS': 'plural noun',
        'NNS$': 'possessive plural noun', 'NP': 'proper noun or part of name phrase',
        'NP$': 'possessive proper noun', 'NPS': 'plural proper noun',
        'NPS$': 'possessive plural proper noun', 'NR': 'adverbial noun (home, today, west)',
        'OD': 'ordinal numeral (first, 2nd)', 'PN': 'nominal pronoun (everybody, nothing)',
        'PN$': 'possessive nominal pronoun', 'PP$': 'possessive personal pronoun (my, our)',
        'PP$$': 'second (nominal) possessive pronoun (mine, ours)',
        'PPL': 'singular reflexive/intensive personal pronoun (myself)',
        'PPLS': 'plural reflexive/intensive personal pronoun (ourselves)',
        'PPO': 'objective personal pronoun (me, him, it, them)',
        'PPS': '3rd. singular nominative pronoun (he, she, it, one)',
        'PPSS': 'other nominative personal pronoun (I, we, they, you)',
        'PRP': 'Personal pronoun', 'PRP$': 'Possessive pronoun',
        'QL': 'qualifier (very, fairly)', 'QLP': 'post-qualifier (enough, indeed)',
        'RB': 'adverb', 'RBR': 'comparative adverb', 'RBT': 'superlative adverb',
        'RN': 'nominal adverb (here, then, indoors)', 'RP': 'adverb/particle (about, off, up)',
        'TO': 'infinitive marker to', 'UH': 'interjection, exclamation',
        'VB': 'verb, base form', 'VBD': 'verb, past tense',
        'VBG': 'verb, present participle/gerund', 'VBN': 'verb, past participle',
        'VBP': 'verb, non 3rd person, singular, present', 'VBZ': 'verb, 3rd. singular present',
        'WDT': 'wh- determiner (what, which)', 'WP$': 'possessive wh- pronoun (whose)',
        'WPO': 'objective wh- pronoun (whom, which, that)',
        'WPS': 'nominative wh- pronoun (who, which, that)', 'WQL': 'wh- qualifier (how)',
        'WRB': 'wh- adverb (how, where, when)'}

    sent = ['Marley', 'was', 'dead', ':', 'to', 'begin', 'with', '.',
            'There', 'is', 'no', 'doubt', 'whatever', 'about', 'that', '.']

    # Part 1
    print("Brown tagger:")
    brownTagger = UnigramTagger(brown.tagged_sents())
    for word, tag in brownTagger.tag(sent):
        print(word, '->', tag)

    print("\nPENN Treebank Tagger:")
    pennTagger = UnigramTagger(treebank.tagged_sents())
    for word, tag in pennTagger.tag(sent):
        print(word, '->', tag)

    print("\nNLTK tagger:")
    nltkTagger = pos_tag(sent)
    for word, tag in nltkTagger:
        print(word, '->', tag)

    # Part 2
    br_tw = brown.tagged_words(categories='mystery')
    br_ts = brown.tagged_sents(categories='mystery')
    print("\nAnswer to 2A: \nWords: {} \nSentences: {}".format(len(br_tw), len(br_ts)))
    print("\nAnswer to 2B: \n100th word: {}, type is: {} \n101th word: {}, type is: {}".format(
        br_tw[99][0], brownDict.get(br_tw[99][1]), br_tw[100][0], brownDict.get(br_tw[100][1])))

    tagList = []
    wordList = []
    sentDict = {}
    for sent in br_ts:
        for wordtag in sent:
            wordList.append(wordtag[0])
            tagList.append(brownDict.get(wordtag[1]))
            sentDict.setdefault(wordtag[1], [])
            sentDict[wordtag[1]].append(wordtag[0])
    print("\nAnswer to 2C: There are {} different tags being used."
          "\n2D: 10 most common words are: \n{} \n2E: 10 most common tags are: \n {}".format(
              len(Counter(tagList)), Counter(wordList).most_common(10), Counter(tagList).most_common(10)))
    print("\nAnswer to 2F: Most common adverb (RB)= {} \n2G: Most common adjective (JJ)= {}".format(
        Counter(sentDict["RB"]).most_common(1), Counter(sentDict["JJ"]).most_common(1)))

    concDict = {}
    tagTypes = []
    i = 0
    tagTypesMeaning = []
    for word, tag in br_tw:
        concDict[i] = [i, str(tag), str(word)]
        i += 1
        if word == 'so':
            tagTypes.append(tag)
            tagTypesMeaning.append(brownDict.get(tag))
    tagTypesFreq = nltk.FreqDist(tagTypesMeaning)
    print("\nAnswer to 2H and 2I:\n{}".format(tagTypesFreq.most_common()))

    # 2K from here, using CS, QL and RB:
    csplusList = []
    csminList = []
    qlplusList = []
    qlminList = []
    rbplusList = []
    rbminList = []
    csneighbourrightList = []
    csneighbourleftList = []
    qlneighbourrightList = []
    qlneighbourleftList = []
    rbneighbourrightList = []
    rbneighbourleftList = []
    for value in concDict.values():
        if value[2] == 'so' and value[1] == 'CS':
            csplusList.append(concDict.get(value[0] + 1))
            csminList.append(concDict.get(value[0] - 1))
        elif value[2] == 'so' and value[1] == 'QL':
            qlplusList.append(concDict.get(value[0] + 1))
            qlminList.append(concDict.get(value[0] - 1))
        elif value[2] == 'so' and value[1] == 'RB':
            rbplusList.append(concDict.get(value[0] + 1))
            rbminList.append(concDict.get(value[0] - 1))
    for item in csminList:
        csneighbourleftList.append(item[1])
    for item in csplusList:
        csneighbourrightList.append(item[1])
    for item in qlminList:
        qlneighbourleftList.append(item[1])
    for item in qlplusList:
        qlneighbourrightList.append(item[1])
    for item in rbminList:
        rbneighbourleftList.append(item[1])
    for item in rbplusList:
        rbneighbourrightList.append(item[1])

    uniqueList = []
    [uniqueList.append(tag) for tag in tagTypes if tag not in uniqueList]
    exampleList = []
    for sentence in br_ts:
        for word, tag in sentence:
            if word == 'so' and tag in uniqueList:
                sentenceStr = " ".join([w for w, t in sentence]) + "(" + brownDict.get(tag) + ")"
                exampleList.append(sentenceStr)
                uniqueList.remove(tag)
    print("\nAnswer to 2J:\n{}".format(exampleList))
    print("\nAnswer to 2K: \n Preceder of CS:{} \n Follower of CS: {}\n Preceder of QL: {}\n"
          " Follower of QL: {}\n Preceder of RB: {}\n Follower of RB: {}".format(
              brownDict.get(Counter(csneighbourleftList).most_common(1)[0][0]),
              brownDict.get(Counter(csneighbourrightList).most_common(1)[0][0]),
              brownDict.get(Counter(qlneighbourleftList).most_common(1)[0][0]),
              brownDict.get(Counter(qlneighbourrightList).most_common(1)[0][0]),
              brownDict.get(Counter(rbneighbourleftList).most_common(1)[0][0]),
              brownDict.get(Counter(rbneighbourrightList).most_common(1)[0][0])))

    # Part 3
    text = open('holmes.txt').read()[:500]
    tokens = nltk.wordpunct_tokenize(text)
    textTagged = pos_tag(tokens)
    print("\nPart 3, holmes.txt tokenized and POS-tagged:\n{}".format(textTagged))
text += wikipedia.page(door).content
sents = text.split('\n')


# Each sentence is a list of words
def preprocess(sents):
    sents = [sent.translate(str.maketrans('', '', string.punctuation)).strip(string.digits).lower()
             for sent in sents]
    sents = [word_tokenize(sent) for sent in sents]
    return [[word for word in sent if word not in set(stopwords.words('english'))]
            for sent in sents]


# We will tag all the words with POS tags and later use these as labels for our classification task.
tagger = UnigramTagger(brown.tagged_sents(categories='news'))
sents = preprocess(sents)
words = list(set([word for sent in sents for word in sent]))
pos_tags = dict(tagger.tag(words))
maxlengths = max([len(s) for s in sents])

# In[3]:
words_dic = {word: i + 1 for (i, word) in enumerate(words)}
words_tags_pairs = [tagger.tag(s) for s in sents]
y = [[w[1] for w in wp] for wp in words_tags_pairs]
X = [[words_dic[w[0]] for w in wp] for wp in words_tags_pairs]
X = [[x + [0] * (maxlengths - len(x))] for x in X]
all_tags = set([x for s in y for x in s])
all_tags_dic = {t: i for (t, i) in zip(all_tags, range(1, len(all_tags) + 1))}
all_tags_dic["eos"] = 0
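# Possible continuation (an assumption on my part; the original cell stops here): encode the tag
# sequences with all_tags_dic and pad them with the "eos" index so the labels line up with X.
y_ids = [[all_tags_dic[t] for t in seq] for seq in y]
y_ids = [seq + [all_tags_dic["eos"]] * (maxlengths - len(seq)) for seq in y_ids]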
#!/usr/bin/python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, division

from nltk.tag import UnigramTagger

if __name__ == '__main__':
    model = {u'Péter': 'N', 'Enikő': 'N', 'szeret': 'V', 'Marit': 'Nacc'}
    tagger = UnigramTagger(model=model)
    print(tagger.tag(['Péter', 'Enikő', 'szeret', 'Marit']))
from nltk.tag import UnigramTagger
from nltk.corpus import treebank

# train
train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)
print(treebank.sents()[0])
print(tagger.tag(treebank.sents()[0]))

# test
test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))
poslist.append('positive')

# Creates a list of tuples, with sentiment tagged.
postagged = list(zip(postxt, poslist))
negtagged = list(zip(negtxt, neglist))

# Combines all of the tagged tweets into one large list.
taggedtweets = postagged + negtagged
shuffle(taggedtweets)

tweets = []

# Create a list of words in the tweet, within a tuple.
for (word, sentiment) in taggedtweets:
    word_filter = tokenizer.tokenize(word)
    word_filter = AntonymReplacer().replace_negations(word_filter)
    dic = dict(tagger.tag(word_filter))
    word_lemma = []
    for i in word_filter:
        if dic[i] is None:
            pass
        elif dic[i][0] == "V":
            word_lemma.append(lemmatizer.lemmatize(i, "v").lower())
        elif dic[i][0] == "N" or dic[i][0] == "ADJ" or dic[i][0] == "ADV":
            word_lemma.append(lemmatizer.lemmatize(i).lower())
    tweets.append((word_lemma, sentiment))


# Pull out all of the words in a list of tagged tweets, formatted in tuples.
def getwords(tweets):
# print([word for word in words if word not in english_stops])

# look up words and print synset
from nltk.corpus import wordnet

syn = wordnet.synsets('cookbook')[0]
print(syn.name())
print(syn.definition())
print(syn.hypernyms())
print(syn.hypernyms()[0].hyponyms())
print(syn.root_hypernyms())
print(syn.hypernym_paths())

# for w in words:
#     print(w)
#     syn = wordnet.synsets(w)
#     if (type(syn) == 'list'):
#         syn = syn[0]
#     # print(syn)
#     if (len(syn) != 0):
#         for i in syn:
#             # print(i)
#             # print('\t[', i.name(), ']')
#             print('\t--', i.definition())

from nltk.tag import UnigramTagger
from nltk.corpus import treebank

train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)
print(tagger.tag(treebank.sents()[0]))
######## UNIGRAM TAGGER ##########
from nltk.tag import UnigramTagger
from nltk.corpus import treebank

# We use the first 3000 sentences of the treebank corpus as the training set to
# initialize the UnigramTagger class.
# A unigram tagger can be trained by giving it a list of tagged sentences at initialization.
train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)
print(treebank.sents()[0])
print(tagger.tag(treebank.sents()[0]))

test_sents = treebank.tagged_sents()[3000:]
print(tagger.evaluate(test_sents))

# A UnigramTagger can also be built from a hand-specified model (word -> tag).
tagger = UnigramTagger(model={'Pierre': 'NN'})
print(tagger.tag(treebank.sents()[0]))
import pickle

from nltk.corpus import brown
from nltk.tag import UnigramTagger

INPUT_FILE = "/dfs/scratch0/googlengrams/2012-eng-fic/info/commonnonstop-1900-2000-8-6.pkl"


def write_word_list(filename, word_list):
    with open(filename, "w") as out_fp:
        out_fp.write("\n".join(word_list) + "\n")


if __name__ == '__main__':
    with open(INPUT_FILE, "rb") as in_fp:
        words = pickle.load(in_fp)
    tagger = UnigramTagger(brown.tagged_sents())
    good_words = []
    for word in words:
        tag = tagger.tag([word])[0][1]
        if tag is None:
            continue
        if "NP" in tag:
            continue
        good_words.append(word)
    write_word_list("brown.txt", good_words)
# Wait until the Redis queue has a pending page, then pop it.
while not redisInterface.hasPending():
    sleep(1)

page = redisInterface.popPending()
print('Reading ' + page + ' STARTED')

# Read the html page
with open(page, 'r') as htmlPage:
    data = htmlPage.read().replace('\n', '')

# Parse html
soup = BeautifulSoup(data)
articleTitle = titleFromArticleSoup(soup)
articleBodyWithTags = soup.find_all('p', class_='story-body-text')
articleBody = [stringFromHTMLParagraph(p) for p in articleBodyWithTags]
parasToProcess = [articleTitle] + articleBody
print('Title: ' + articleTitle)

# Tokenize and tag
tokens = [tokenizer.tokenize(s) for s in parasToProcess]
taggedArticleBody = [tagger.tag(t) for t in tokens]

# Save to redis
redisInterface.saveArticleData(
    TaggedArticle(articleTitle, taggedArticleBody, 'article_data'))
print('Reading ' + page + ' FINISHED')
    return myPhrases


# Create the default tagger so that unknown words are treated as nouns (N).
etiqPadrao = DefaultTagger('N')
# Take the training set from the tagged_sents() of mac_morpho.
sentencas_treinadoras = mac_morpho.tagged_sents()[0:15000]
# Create the UnigramTagger with the default tagger as backoff and train it on the mac_morpho sentences.
etiq = UnigramTagger(sentencas_treinadoras, backoff=etiqPadrao)

coment = str(input("Entre com o texto: "))
if coment == "default":
    coment = open("default.txt", "r").read().replace("\n", " ")

# The text is converted into tokens.
tokens = nltk.word_tokenize(coment.lower())
# Each token of the text is tagged.
tags = etiq.tag(tokens)

# Build the regular-expression parser containing the patterns of interest.
analiseGramatical = RegexpParser(r"""
        PADRAO7: {<N><ADJ>}
        PADRAO1: {<ADJ><N>(<PREP>?<N>)*}
        PADRAO2: {<ADV><ADV>?<ADJ>(<N>(<PREP>?<N>)*)?}
        PADRAO3: {<N>(<PREP>?<N>)*(<ADJ>)<ADV><ADV>?}
        PADRAO4: {<N>(<PREP>?<N>)*<ADV>?<ADJ>+}
        PADRAO5: {<ADV><V>}
        PADRAO6: {<V><ADV>}
""")

# The parser is then used to generate the pattern tree.
arvore = analiseGramatical.parse(tags)

x = [ExtractPhrases(arvore, "PADRAO1"),
     ExtractPhrases(arvore, "PADRAO2"),
     ExtractPhrases(arvore, "PADRAO3"),
     ExtractPhrases(arvore, "PADRAO4"),
import nltk
import json
from nltk.corpus import brown
from nltk.tag import UnigramTagger

tagger = UnigramTagger(brown.tagged_sents(tagset='universal'))
sent = ['Mitchell', 'decried', 'the', 'high', 'rate', 'of', 'unemployment']
for word, tag in tagger.tag(sent):
    if tag == "VERB":
        print(word, '->', tag)

verbs_tagged = open("../assets/inputText/verbs_tagged_questions.txt", 'w+')
with open("../assets/inputText/all_questions.txt", 'r') as all_lines:
    for line in all_lines:
        splitLine = line.split(' ')
        for word, tag in tagger.tag(splitLine):
            if tag == "VERB":
                verbs_tagged.write(word + "\n")
                # verbs_tagged.write(word + " \"" + line[:-1] + "\"\n")
print(rt.evaluate(test_data))
print(rt.tag(tokens))

## N-gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

print(ut.evaluate(test_data))
print(ut.tag(tokens))

print(bt.evaluate(test_data))
print(bt.tag(tokens))

print(tt.evaluate(test_data))
print(tt.tag(tokens))


# Chain the taggers: each tagger is trained with the previously built one as its backoff.
def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff


ct = combined_tagger(train_data=train_data,
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus import treebank

training = treebank.tagged_sents()[:7000]
unitagger = UnigramTagger(training)
print(treebank.sents()[0])
print(unitagger.tag(treebank.sents()[0]))
# Contiguous sequences of n items from a sequence of text or speech. Items can be words, phonemes,
# letters, characters or syllables. Shingles: n-grams where the items are just words.
# UnigramTagger -> NGramTagger -> ContextTagger -> SequentialBackoffTagger

# Train the N-Gram taggers using the training_data (pre-tagged tokens, i.e. labeled observations)
ut = UnigramTagger(train=train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

# Test the performance of each N-Gram tagger
print("1-Gram Tagger Accuracy: {}".format(ut.evaluate(test_data)))
print("2-Gram Tagger Accuracy: {}".format(bt.evaluate(test_data)))
print("3-Gram Tagger Accuracy: {}".format(tt.evaluate(test_data)))

print("\n1-Gram tags:")
print(ut.tag(tokens))

print("\n2-Gram tags:")
print(bt.tag(tokens))

print("\n3-Gram tags:")
print(tt.tag(tokens))

# Note that the best accuracy is provided by the 1-Gram tagger, as it isn't always the case that the
# same bigrams and trigrams observed in the training data will be present in the same way in the
# testing data (e.g. pairs of words do not always appear paired in the same way).

# 4. TAGGER CHAINING WITH BACKOFF TAGGERS:
# Function to chain a set of taggers, with a backoff tagger as a last resort
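# The chaining function referred to above is not included in this excerpt. A minimal sketch
# (the name chain_taggers is my own), consistent with the combined_tagger used elsewhere in these
# snippets: each class is trained with the previously built tagger as its backoff, so the last
# tagger in the list is tried first and falls back down the chain.
def chain_taggers(train_data, tagger_classes, backoff=None):
    for tagger_cls in tagger_classes:
        backoff = tagger_cls(train_data, backoff=backoff)
    return backoff

# Example: chain_taggers(train_data, [UnigramTagger, BigramTagger, TrigramTagger], backoff=DefaultTagger('NN'))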
'''
Created on Feb 24, 2012

@author: 100457636
'''
from nltk.corpus import brown
from nltk.tag import UnigramTagger

train_sents = brown.tagged_sents()
tagger = UnigramTagger(train_sents)

print(tagger.tag(["doing"]))
import pickle

from nltk import word_tokenize
from nltk.corpus import treebank
from nltk.tag import DefaultTagger, UnigramTagger

from samples import sample

# Test and training variables
test_sents = treebank.tagged_sents()[3000:]
train_sents = treebank.tagged_sents()[:3000]
tk_sample = word_tokenize(sample)

# Default tagger - Nouns
df_tagger = DefaultTagger('NN')
tagged = df_tagger.tag(tk_sample)
accuracy = df_tagger.evaluate(test_sents)
print(f"Tagged text: {tagged}; acc = {accuracy}\n")

# Unigram tagger
ug_tagger = UnigramTagger(train_sents)
tagged = ug_tagger.tag(tk_sample)
accuracy = ug_tagger.evaluate(test_sents)
print(f"Tagged text: {tagged}; acc = {accuracy}\n")

# Backoff tagger: rely on another tagger (the backoff) when the current one cannot tag a token.
ugb_tagger = UnigramTagger(train_sents, backoff=df_tagger)
accuracy = ugb_tagger.evaluate(test_sents)
print(f"Accuracy of backoff: {accuracy}\n")

# Saving pickle and testing it.
with open('pickles/pos-taggers/unigram_backoff_tagger.pickle', 'wb') as file:
    pickle.dump(ugb_tagger, file)

with open('pickles/pos-taggers/unigram_backoff_tagger.pickle', 'rb') as file:
    pk_tagger = pickle.load(file)
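# The comment above promises "Saving pickle and testing it", but the reloaded tagger is never
# exercised. A small follow-up sketch (same names as above, my own addition) to confirm the round trip:
tagged = pk_tagger.tag(tk_sample)
accuracy = pk_tagger.evaluate(test_sents)
# The unpickled tagger should reproduce the backoff tagger's accuracy.
print(f"Unpickled tagger: acc = {accuracy}\n")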
tnt_tagger = tnt.TnT(N=100)
tnt_tagger.train(brown_train_sents)

bigramTagger = BigramTagger(brown_train_sents)
trigramTagger = TrigramTagger(brown_train_sents)

print("------------Recommended Tagger------------")
print(nltk.pos_tag(sent))

print("------------Default Tagger------------")
print(defaultTagger.tag(sent))

print("------------Unigram Tagger Overrode------------")
unigramTagger = UnigramTagger(model={'Pierre': 'NN'})
print(unigramTagger.tag(sent))

print("------------Unigram Tagger Trained------------")
unigramTagger = UnigramTagger(brown_train_sents)
print(unigramTagger.tag(sent))

# cutoff: the number of instances of training data the tagger must see in order not to use the backoff tagger
print("------------Unigram Tagger Trained with cutoff=3------------")
unigramTagger = UnigramTagger(brown_train_sents, cutoff=3)
print(unigramTagger.tag(sent))

print("------------Bigram Tagger------------")
print(bigramTagger.tag(sent))

print("------------Trigram Tagger------------")
print(trigramTagger.tag(sent))
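# A toy illustration of the cutoff behaviour (the two-sentence training data below is my own,
# not part of the original script). As I understand cutoff, a word -> tag mapping is only kept
# when its winning tag was seen more than `cutoff` times; otherwise the tagger returns None
# (or defers to a backoff tagger, if one is given).
toy_train = [[('nltk', 'NN'), ('rocks', 'VBZ')]] * 2   # each word seen only twice
no_cutoff = UnigramTagger(toy_train)
with_cutoff = UnigramTagger(toy_train, cutoff=3)
print(no_cutoff.tag(['nltk']))    # [('nltk', 'NN')]
print(with_cutoff.tag(['nltk']))  # [('nltk', None)]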