def trainUniTnT(self):
    """train unigram and tnt separately without DefaultTagger"""
    self.split_into_folds()
    for k in range(1, (self.folds + 1)):
        train_sents = sum(self.foldlist[: (self.folds - 1)], [])
        tnt_tagger = tnt.TnT(N=100)
        tnt_tagger.train(train_sents)
        print(str(k) + " fold: tnt evaluated")
        unigram = UnigramTagger(train_sents)
        print(str(k) + " fold: unigram evaluated")
        to_tag = [untag(i) for i in self.foldlist[self.folds - 1]]
        self.tnt_tagged += tnt_tagger.tag_sents(to_tag)
        self.uni_tagged += unigram.tag_sents(to_tag)
        self.org_tagged += self.foldlist[self.folds - 1]
        self.foldlist = [self.foldlist[self.folds - 1]] + self.foldlist[: (self.folds - 1)]
    self.tnt = tnt_tagger
    self.unigram = unigram
    self.tnt_avg_acc = accuracy(sum(self.org_tagged, []), sum(self.tnt_tagged, []))
    self.uni_avg_acc = accuracy(sum(self.org_tagged, []), sum(self.uni_tagged, []))
    print("Accuracy of concatenated tnt-tagged sentences: ", self.tnt_avg_acc)
    print("Accuracy of concatenated unigram-tagged sentences: ", self.uni_avg_acc)
    (self.tnt_tagprecision, self.tnt_tagrecall) = self.tagprecision_recall(
        tnt_tagger, self.tnt_tagged, self.org_tagged
    )
    (self.unigram_tagprecision, self.unigram_tagrecall) = self.tagprecision_recall(
        unigram, self.uni_tagged, self.org_tagged
    )
    # reset the following values so that trainRegexp starts from the initial values
    self.org_tagged = []
    self.foldlist = []
    for i in range(1, self.folds + 1):
        self.foldlist.append(self.create_fold(i))
def pos_tag(pos_type, tokenized_sent):
    if pos_type == 'unigram':
        brown_train = pickle.load(open('res/brown_train.pkl', 'rb'))
        unigram_tagger = UnigramTagger(brown_train)
        return unigram_tagger.tag(tokenized_sent)
    elif pos_type == 'max_pos':
        return nltk.pos_tag(tokenized_sent)
def tag_unigrams_by_topic(self, dict_of_sentences_by_topic):
    tagged_unigrams_by_topic = {}
    train_sents = mac_morpho.tagged_sents()[:5000]
    tagger = UnigramTagger(train_sents)
    for k, v in dict_of_sentences_by_topic.items():
        tagged_unigrams_by_topic[k] = tagger.batch_tag(dict_of_sentences_by_topic[k])
    return tagged_unigrams_by_topic
def tag_words(self, words, sents):
    train_sents = treebank.tagged_sents()
    tagger = UnigramTagger(train_sents)
    test_sents = tagger.tag(sents[0])
    # test_sents = treebank.tagged_sents()[3000:]
    # print treebank.tagged_sents()[1:]
    # print "accuracy: " + str(self._tagger.evaluate(test_sents))
    # print self._tagger.tag(words)
    # print test_sents
    print tagger.evaluate(test_sents)
def baseline(tagged_sentences):
    from nltk.tag import UnigramTagger
    from nltk.tag import DefaultTagger
    from collections import Counter

    # lowercase everything
    # remove all instances of non-universal tags for proper comparison with
    # the other methods
    new_tagged_sentences = []
    for sent in tagged_sentences:
        sent = [(x[0].lower(), x[1]) for x in sent]
        sent = [x for x in sent if x[1] in _UNI]
        new_tagged_sentences.append(sent)
    tagged_sentences = new_tagged_sentences

    # size of corpus
    corpus_size = sum([len(sent) for sent in tagged_sentences])
    print('Corpus size: {} docs'.format(len(tagged_sentences)))
    print('Corpus size: {} tokens'.format(corpus_size))

    # train/test split
    test_pct = 0.3
    test_len = int(len(tagged_sentences) * test_pct)
    test_idx = len(tagged_sentences) - test_len
    train_set = tagged_sentences[:test_idx]
    test_set = tagged_sentences[test_idx:]
    print('Train set: {} docs'.format(len(train_set)))
    print('Test set: {} docs'.format(len(test_set)))

    # calculate test set size in tokens
    test_size = sum([len(sent) for sent in test_set])
    print('Test set: {} tokens'.format(test_size))

    # calculate the most common tag in the train set
    # this should be 'NOUN'
    tag_dist = []
    for sent in train_set:
        tag_dist += [x[1] for x in sent]
    counts = Counter()
    counts.update(tag_dist)
    most_common = counts.most_common(1)[0][0]
    print('Most common tag: {}'.format(most_common))

    # Create model
    backoff = DefaultTagger(most_common)
    tagger = UnigramTagger(train=train_set, backoff=backoff, cutoff=5)

    # Evaluate
    acc = tagger.evaluate(test_set)
    print('Baseline: {}'.format(acc))
def get_pos_tagger(self):
    from nltk.corpus import brown
    regexp_tagger = RegexpTagger(
        [
            (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'(The|the|A|a|An|an)$', 'AT'),  # articles
            (r'.*able$', 'JJ'),               # adjectives
            (r'.*ness$', 'NN'),               # nouns formed from adjectives
            (r'.*ly$', 'RB'),                 # adverbs
            (r'.*s$', 'NNS'),                 # plural nouns
            (r'.*ing$', 'VBG'),               # gerunds
            (r'.*ed$', 'VBD'),                # past tense verbs
            (r'.*', 'NN'),                    # nouns (default)
        ]
    )
    brown_train = brown.tagged_sents(categories='news')
    unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

    # Override particular words
    main_tagger = RegexpTagger(
        [(r'(A|a|An|an)$', 'ex_quant'), (r'(Every|every|All|all)$', 'univ_quant')],
        backoff=trigram_tagger,
    )

    return main_tagger
def get_pos_tagger(self):
    from nltk.corpus import brown
    regexp_tagger = RegexpTagger(
        [
            (r"^-?[0-9]+(.[0-9]+)?$", "CD"),  # cardinal numbers
            (r"(The|the|A|a|An|an)$", "AT"),  # articles
            (r".*able$", "JJ"),               # adjectives
            (r".*ness$", "NN"),               # nouns formed from adjectives
            (r".*ly$", "RB"),                 # adverbs
            (r".*s$", "NNS"),                 # plural nouns
            (r".*ing$", "VBG"),               # gerunds
            (r".*ed$", "VBD"),                # past tense verbs
            (r".*", "NN"),                    # nouns (default)
        ]
    )
    brown_train = brown.tagged_sents(categories="news")
    unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger)
    bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger)
    trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger)

    # Override particular words
    main_tagger = RegexpTagger(
        [(r"(A|a|An|an)$", "ex_quant"), (r"(Every|every|All|all)$", "univ_quant")],
        backoff=trigram_tagger,
    )

    return main_tagger
def getUnigramTaggerAccuracy(trainingSet, testingSet):
    # trains and returns the accuracy of the UnigramTagger

    # get untagged sentences and gold POS tags
    testingUntaggedSentences = [[taggedWord[0] for taggedWord in sentence] for sentence in testingSet]
    testingGoldPOSTags = [[taggedWord[1] for taggedWord in sentence] for sentence in testingSet]

    # train tagger
    unigramTagger = UnigramTagger(trainingSet)

    # test tagger and get predicted POS tags
    unigramTaggedSentences = unigramTagger.tag_sents(testingUntaggedSentences)
    unigramTaggedSentencesPOSTags = [[taggedWord[1] for taggedWord in sentence] for sentence in unigramTaggedSentences]

    # calculate and return accuracy
    return calculateAccuracy(testingGoldPOSTags, unigramTaggedSentencesPOSTags)
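The helper calculateAccuracy is referenced here (and in the near-identical snippet further down) but not shown. A minimal token-level accuracy helper compatible with the call above might look like the following sketch; the name and argument order simply mirror the usage and are an assumption, not the original implementation:

def calculateAccuracy(goldSentences, predictedSentences):
    # hypothetical helper: compare gold and predicted tags token by token
    correct = 0
    total = 0
    for goldTags, predictedTags in zip(goldSentences, predictedSentences):
        for goldTag, predictedTag in zip(goldTags, predictedTags):
            total += 1
            if goldTag == predictedTag:
                correct += 1
    return correct / total if total else 0.0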
def tag_penn(words):
    """
    Tags a list of words using a unigram tagger trained on the
    Penn Treebank tagged-sentence corpus.

    Parameters
    ----------
    words: A list of strings.

    Returns
    -------
    A list of tuples of (str, str)
    """

    pt_tagger = UnigramTagger(treebank.tagged_sents())
    tags = pt_tagger.tag(words)

    return tags
def unigram_bigram_tagger(train_sentences):
    return BigramTagger(
        train_sentences,
        backoff=UnigramTagger(
            train_sentences,
            backoff=DefaultTagger("NN")
        )
    )
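A quick way to exercise the combined tagger above; the treebank training slice and the sample sentence are placeholders, not part of the original function:

from nltk.corpus import treebank
from nltk.tag import BigramTagger, UnigramTagger, DefaultTagger

tagger = unigram_bigram_tagger(treebank.tagged_sents()[:3000])
print(tagger.tag(['The', 'cat', 'sat', 'on', 'the', 'mat']))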
def contextual_rules(wikicorpus_dir, context_file):
    sentences = wikicorpus(wikicorpus_dir, words=1000000)
    ANONYMOUS = "anonymous"
    for s in sentences:
        for i, (w, tag) in enumerate(s):
            if tag == "NP":  # NP = proper noun in Parole tagset.
                s[i] = (ANONYMOUS, "NP")
    ctx = fntbl37()
    tagger = UnigramTagger(sentences)
    tagger = BrillTaggerTrainer(tagger, ctx, trace=0)
    tagger = tagger.train(sentences, max_rules=100)
    #print tagger.evaluate(wikicorpus(10000, start=1))
    with open(context_file, "w") as f:
        for rule in tagger.rules():
            f.write("%s\n" % rule)
def test_pos_template(self):
    train_sents = treebank.tagged_sents()[:1000]
    tagger = UnigramTagger(train_sents)
    trainer = brill_trainer.BrillTaggerTrainer(
        tagger, [brill.Template(brill.Pos([-1]))])
    brill_tagger = trainer.train(train_sents)
    # Example from https://github.com/nltk/nltk/issues/769
    result = brill_tagger.tag('This is a foo bar sentence'.split())
    expected = [('This', 'DT'), ('is', 'VBZ'), ('a', 'DT'), ('foo', None),
                ('bar', 'NN'), ('sentence', None)]
    self.assertEqual(result, expected)
def train_tagger(tagger_name):
    train_sents = treebank.tagged_sents()[:5000]
    if tagger_name == "TnT" or tagger_name == 'tagger':
        trained_tagger = tnt.TnT()
        trained_tagger.train(train_sents)
    else:
        tagger1 = DefaultTagger('NN')
        tagger2 = TrigramTagger(train_sents, backoff=tagger1)
        tagger3 = BigramTagger(train_sents, backoff=tagger2)
        trained_tagger = UnigramTagger(train_sents, backoff=tagger3)
    return trained_tagger
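A short driver for the helper above; only "TnT"/"tagger" are meaningful values that the function checks, any other string (the one used here is arbitrary) falls through to the unigram-with-backoff branch:

from nltk.corpus import treebank
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger, tnt

backoff_tagger = train_tagger('unigram-backoff')  # any name other than "TnT"/"tagger"
tnt_tagger = train_tagger('TnT')
print(backoff_tagger.tag('The quick brown fox jumped'.split()))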
def getUnigramTaggerAccuracy(trainingSet, testingSet):
    # trains and returns the accuracy of the UnigramTagger

    # get untagged sentences and gold POS tags
    testingUntaggedSentences = [[taggedWord[0] for taggedWord in sentence] for sentence in testingSet]
    testingGoldPOSTags = [[taggedWord[1] for taggedWord in sentence] for sentence in testingSet]

    # train tagger
    unigramTagger = UnigramTagger(trainingSet)

    # test tagger and get predicted POS tags
    unigramTaggedSentences = unigramTagger.tag_sents(testingUntaggedSentences)
    unigramTaggedSentencesPOSTags = [[
        taggedWord[1] for taggedWord in sentence
    ] for sentence in unigramTaggedSentences]

    # calculate and return accuracy
    return calculateAccuracy(testingGoldPOSTags, unigramTaggedSentencesPOSTags)
def make_pos_model(model_type):
    now = time.time()

    reader = TaggedCorpusReader('.', 'greek_training_set.pos')
    train_sents = reader.tagged_sents()
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
        file = 'unigram.pickle'
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
        file = 'bigram.pickle'
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
        file = 'trigram.pickle'
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
        file = '123grambackoff.pickle'
    elif model_type == 'tnt':
        tagger = tnt.TnT()
        tagger.train(train_sents)
        file = 'tnt.pickle'
    else:
        print('Invalid model_type.')

    _dir = os.path.expanduser('~/greek_models_cltk/taggers/pos')
    path = os.path.join(_dir, file)
    with open(path, 'wb') as f:
        pickle.dump(tagger, f)

    print('Completed training {0} model in {1} seconds to {2}.'.format(model_type, time.time() - now, path))
def transform(self, reviews, y=None):
    number_of_adjectives = []
    training_corpus = alp.tagged_sents()
    unitagger = UnigramTagger(training_corpus)
    pos_tag = unitagger.tag
    for review in reviews:
        tokens = re.findall(r"[\w']+|[.,!?;]", review)
        adj = 0
        for token in pos_tag(tokens):
            if token[1] == 'adj':
                adj += 1
        number_of_adjectives.append([adj])
    return number_of_adjectives
def ngram_tagger(tagged_sents):
    patterns = [
        (r'''(b|c|d|f|g|h|j|k|l|m|n||p|q|r|s|t|v|w|x|z)e (b|c|d|f|g|h|j|k|l|m|n||p|q|r|s|t|v|w|x|z)''', 'MORA'),
        (r'.*(a|e|i|o|u|ä|î|ô|ü)(a|e|i|o|u|ä|î|ô|ü)', 'DOPPEL'),
        (r'.*', 'MORA_HAUPT')]  # default
    regex_tagger = nltk.RegexpTagger(patterns)
    tagger1 = UnigramTagger(tagged_sents, backoff=regex_tagger)  # cutoff = 3, if necessary
    tagger2 = BigramTagger(tagged_sents, backoff=tagger1)
    tagger3 = TrigramTagger(tagged_sents, backoff=tagger2)
    return tagger3
def tag_linked(words, default_tag='INFO'):
    """
    Tags a list of words using a unigram tagger trained on the Penn Treebank
    tagged-sentence corpus. Uses DefaultTagger to assign "default_tag" to any
    word missed by the treebank-trained tagger.

    Parameters
    ----------
    words: A list of strings.

    Returns
    -------
    A list of tuples of (str, str)
    :param default_tag:
    """

    default_tagger = DefaultTagger(default_tag)
    pt_tagger = UnigramTagger(treebank.tagged_sents())

    # chain the default tagger as a backoff via the tagger's internal _taggers list
    pt_tagger._taggers = [pt_tagger, default_tagger]

    tags = pt_tagger.tag(words)

    return tags
def get_words_simple(text_string):
    """
    Gets a list of tagged words from an input string
    using whitespace-based tokenisation and a unigram PoS tagger
    """
    # get trained Unigram tagger
    print('Loading unigram tagger...')
    train_sents = treebank.tagged_sents()
    unigram_tagger = UnigramTagger(train_sents)

    # stripping punctuation
    # string.translate() takes a dictionary as input.
    # The dictionary mapping ordinal chars to None is created in place:
    text_string = text_string.translate(
        {ord(c): None for c in CHARS_TO_DELETE})
    words = text_string.split()  # crude tokenisation, keeps contractions

    english_stops = stopwords.words('english')
    stops_set = set(english_stops + ADDITIONAL_STOPS)
    cleaned_words = []
    for w in words:
        if w not in stops_set and w not in string.punctuation:
            cleaned_words.append(w)

    return unigram_tagger.tag(cleaned_words)
class FeaturesetExtractor():
    def __init__(self):
        self.neg_words = [line.rstrip('\n') for line in open(NEG_WORD)]
        self.pos_words = [line.rstrip('\n') for line in open(POS_WORD)]
        self.anger_words = [line.rstrip('\n') for line in open(ANGER_WORD)]
        self.fear_words = [line.rstrip('\n') for line in open(FEAR_WORD)]
        self.happy_words = [line.rstrip('\n') for line in open(NEG_WORD)]
        self.sad_words = [line.rstrip('\n') for line in open(SAD_WORD)]
        self.tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500])

    def get_featureset(self, data_element):
        mapFeatureset = {}
        size = len(data_element.clean_text)
        word = data_element.clean_text
        list_word = word.split(" ")
        raw = data_element.raw_text
        list_word_raw = raw.split(" ")

        tot_pos_words = len(set(list_word) & set(self.pos_words))
        tot_neg_words = len(set(list_word) & set(self.neg_words))
        list_anger = tuple(set(list_word) & set(self.anger_words))
        list_fear = tuple(set(list_word) & set(self.fear_words))
        list_happy = tuple(set(list_word) & set(self.happy_words))
        list_sad = tuple(set(list_word) & set(self.sad_words))

        exclamation_count = raw.count("!")
        question_count = raw.count("?")
        uppercase_count = sum(1 for c in raw if c.isupper())

        mapFeatureset["bias"] = 1
        mapFeatureset["word"] = tuple(list_word)
        mapFeatureset["neg_words"] = tot_neg_words
        mapFeatureset["pos_words"] = tot_pos_words
        mapFeatureset["exclamation_count"] = exclamation_count
        mapFeatureset["question_count"] = question_count
        mapFeatureset["list_happy"] = list_happy
        mapFeatureset["list_sad"] = list_sad
        mapFeatureset["list_fear"] = list_fear
        mapFeatureset["list_anger"] = list_anger

        pos_tag_temp = self.tagger.tag((word).split(" "))
        list_pos_tag = []
        for element in pos_tag_temp:
            list_pos_tag.append(element[1])
        mapFeatureset["pos_tag"] = tuple(list_pos_tag)

        return mapFeatureset
def test_ngram_taggers(self):
    unitagger = UnigramTagger(self.corpus, backoff=self.default_tagger)
    bitagger = BigramTagger(self.corpus, backoff=unitagger)
    tritagger = TrigramTagger(self.corpus, backoff=bitagger)
    ntagger = NgramTagger(4, self.corpus, backoff=tritagger)

    encoded = self.encoder.encode(ntagger)
    decoded = self.decoder.decode(encoded)

    self.assertEqual(repr(ntagger), repr(decoded))
    self.assertEqual(repr(tritagger), repr(decoded.backoff))
    self.assertEqual(repr(bitagger), repr(decoded.backoff.backoff))
    self.assertEqual(repr(unitagger), repr(decoded.backoff.backoff.backoff))
    self.assertEqual(repr(self.default_tagger), repr(decoded.backoff.backoff.backoff.backoff))
def train_evaluate_brills(train_data, test_data):
    """Training and evaluation of Brill's tagger"""
    # Define templates for rules, provided by nltk
    brill.Template._cleartemplates()
    templates = brill.fntbl37()
    # Define initial tagger, which assigns each word its most frequent tag in the training data
    initial_tagger = UnigramTagger(train_data)
    trainer = brill_trainer.BrillTaggerTrainer(
        initial_tagger=initial_tagger,  # better unknown-word handling
        templates=templates,
        trace=3,
        deterministic=True)
    tagger = trainer.train(train_data, max_rules=100)  # max number of rules to learn: 100
    accuracy = tagger.evaluate(test_data)
    print("Accuracy:", accuracy)
    return accuracy
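One way to call the Brill training helper above, assuming the modules it references are imported at module level; the treebank split used here is arbitrary:

from nltk.corpus import treebank
from nltk.tag import UnigramTagger, brill, brill_trainer

sents = treebank.tagged_sents()
train_evaluate_brills(sents[:3000], sents[3000:])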
def test_pos_template(self):
    train_sents = treebank.tagged_sents()[:1000]
    tagger = UnigramTagger(train_sents)
    trainer = brill_trainer.BrillTaggerTrainer(
        tagger, [brill.Template(brill.Pos([-1]))])
    brill_tagger = trainer.train(train_sents)
    # Example from https://github.com/nltk/nltk/issues/769
    result = brill_tagger.tag("This is a foo bar sentence".split())
    expected = [
        ("This", "DT"),
        ("is", "VBZ"),
        ("a", "DT"),
        ("foo", None),
        ("bar", "NN"),
        ("sentence", None),
    ]
    self.assertEqual(result, expected)
def create_a_dict_model_for_test_accuracy(self, tagged_unigrams_by_topic):
    pre_model = {
        k: map(dict, v)
        for k, v in tagged_unigrams_by_topic.items()
    }
    for k, v in pre_model.items():
        reference_model_by_topic = {}
        for i in v:
            reference_model_by_topic.update(i)
        pre_model[k] = reference_model_by_topic
    dict_model_by_topic = pre_model
    test_sents = mac_morpho.tagged_sents()[:5000]
    tagger_accuracy_by_topic = {}
    for k, v in pre_model.items():
        tagger_accuracy_by_topic[k] = UnigramTagger(
            model=pre_model[k]).evaluate(test_sents)
    return dict_model_by_topic, tagger_accuracy_by_topic
def lexical(tokens):
    print "\n"
    print "Step 2: Lexical Analysis\n"
    print "Essentially refers to dictionary and obtains the properties of the word"
    print "Part-Of-Speech tagging"
    print "The tagset is:\n"
    tag = DefaultTagger('NN')
    tagg = UnigramTagger(train_sent, backoff=tag)
    tagger = BigramTagger(train_sent, backoff=tagg)
    tagtokens = tagger.tag(tokens)
    for token, tag in tagtokens:
        print token + "->" + tag
    print "\n"
    print "The accuracy of the trained pos tagger is:"
    print tagger.evaluate(test_sents)
    return tagtokens
def wordTagger(self, wordlist, number):
    train_sents = treebank.tagged_sents()[:3000]
    if number == 1:
        taglist = nltk.pos_tag(wordlist)
    elif number == 2:
        tagger = DefaultTagger('NN')
        taglist = tagger.tag(wordlist)
    elif number == 3:
        tagger = UnigramTagger(train_sents)
        taglist = tagger.tag(wordlist)
    elif number == 4:
        tnt_tagger = tnt.TnT()
        tnt_tagger.train(train_sents)
        taglist = tnt_tagger.tag(wordlist)
    elif number == 5:
        tagger = ClassifierBasedPOSTagger(train=train_sents)
        taglist = tagger.tag(wordlist)
    return taglist
def train_brill_tagger(tagged_sents):
    # The brill tagger module in NLTK.
    Template._cleartemplates()
    templates = brill24()  # or fntbl37
    # default_tagger = nltk.DefaultTagger('MORA_HAUPT')
    patterns = [
        (r'''(b|c|d|f|g|h|j|k|l|m|n||p|q|r|s|t|v|w|x|z)e (b|c|d|f|g|h|j|k|l|m|n||p|q|r|s|t|v|w|x|z)''', 'MORA'),
        (r'.*(a|e|i|o|u|ä|î|ô|ü)(a|e|i|o|u|ä|î|ô|ü)', 'DOPPEL'),
        (r'.*', 'MORA_HAUPT')]  # default
    regex_tagger = nltk.RegexpTagger(patterns)
    tagger1 = UnigramTagger(tagged_sents, backoff=regex_tagger)  # cutoff = 3, if necessary
    tagger2 = BigramTagger(tagged_sents, backoff=tagger1)
    tagger3 = TrigramTagger(tagged_sents, backoff=tagger2)
    tagger4 = brill_trainer.BrillTaggerTrainer(tagger3, templates, trace=3)
    tagger5 = tagger4.train(tagged_sents, max_rules=200)
    print
    return tagger5
def __init__(self, rooms):
    self.__sentences = list(brown.tagged_sents(categories=['adventure']))

    for room in rooms:
        """ This will only really work with UnigramTagger, since there's no context """
        self.__sentences.append([(name, 'NN') for name in room.names])
        self.__sentences.append([(noun, 'NN') for noun in room.noun_to_item.keys()])
        for item in room:
            for i in range(1000):  # TODO: Weight less hackily
                self.__sentences.append([
                    (verb, 'VB') for verb in item.verb_to_action.keys()
                ])

    self.tokenize = word_tokenize
    self.__tagger = UnigramTagger(train=self.__sentences)
    self.tag = self.__tagger.tag

    self.tokens = []
    self.tagged = []
class PyTenseShift(object):
    """Initialization of PyTenseShift objects.

    The important part when you use the PlPyTenseShift is that
    we allow you to implement your own Tagger to optimize your results
    in translating from present to past tense. So, you need to implement
    the TaggerInterface and change the second line of this code.
    """
    def __init__(self, corpus, isPl):
        if isPl:
            self.tagger = FirstTagger(corpus)
        else:
            dtag = DefaultTagger("NN")
            self.__utag = UnigramTagger(corpus.tagged_sents(), backoff=dtag)

    """ Tokenize the input sentence into words.
    This kind of representation is better to evaluate.
    """
    def _tokenize(self, tense, isPl):
        if isPl:
            return self.tagger.tag(tense)
        else:
            return self.__utag.tag(tokenize(tense))

    def getPastTense(self, tense):
        """Translates sentence given in present tense into past tense

        Args:
            sentence (str): Sentence to translate
        Returns:
            str. Sentence in past tense
        """
        raise NotImplementedError("abstract method")
def train_tagger():
    '''
    An example of training a POS tagger using a probability-based
    trigram model. A POS tagger identifies the class of each word, e.g.:
    Isso é um teste = Isso-PROSUB é-V um-ART teste-N
    (preposition, verb, article, noun)
    '''
    # Load a Portuguese dataset that contains manually tagged sentences
    data = [
        [(w, re.split('[|-]', tag)[0]) for w, tag in sent]
        for sent in mac_morpho.tagged_sents()]

    # Default POS class. N means noun (Nome/substantivo)
    tagger0 = DefaultTagger('N')
    print('train unigram')
    tagger1 = UnigramTagger(data, backoff=tagger0)
    print('training bigram')
    tagger2 = BigramTagger(data, backoff=tagger1)
    print('training trigram')
    return TrigramTagger(data, backoff=tagger2)
def Tagger():
    # Tagger
    etiq1 = DefaultTagger('N')
    sentencas_treinadoras = mac_morpho.tagged_sents()[::]
    etiq2 = UnigramTagger(sentencas_treinadoras, backoff=etiq1)
    return etiq2
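A quick check of the factory above on a Portuguese sentence; the sentence is arbitrary, the mac_morpho corpus must be downloaded, and training on the full corpus takes a while:

import nltk
from nltk.corpus import mac_morpho
from nltk.tag import DefaultTagger, UnigramTagger

tagger = Tagger()
print(tagger.tag(nltk.word_tokenize('O rato roeu a roupa do rei de Roma')))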
def stringFromHTMLParagraph(paraWithTags):
    paraString = ''
    for taggedString in paraWithTags.strings:
        paraString += removeApostrophe(taggedString.string)
    return paraString

def titleFromArticleSoup(soup):
    titleDiv = soup.find(class_='story-heading')
    if not titleDiv:
        titleDiv = soup.find(class_='entry-title')
    return unicode(removeApostrophe(titleDiv.string))

# Set up the tokenizer and the tagger
tokenizer = RegexpTokenizer(r'\w+')
tagger = UnigramTagger(treebank.tagged_sents())

# Open up a redis connection
redisInterface = RedisInterface()

# Print status
print 'Reader ONLINE'

# Run the wait-execute loop
while True:
    while not redisInterface.hasPending():
        sleep(1)
    page = redisInterface.popPending()
    print 'Reading ' + page + ' STARTED'
from nltk.corpus import brown
from nltk.tag import UnigramTagger
import cPickle as pickle

INPUT_FILE = "/dfs/scratch0/googlengrams/2012-eng-fic/info/commonnonstop-1900-2000-8-6.pkl"

def write_word_list(filename, word_list):
    out_fp = open(filename, "w")
    print >> out_fp, "\n".join(word_list)

if __name__ == '__main__':
    in_fp = open(INPUT_FILE, "rb")
    words = pickle.load(in_fp)
    tagger = UnigramTagger(brown.tagged_sents())
    good_words = []
    for word in words:
        tag = tagger.tag([word])[0][1]
        if tag == None:
            continue
        if "NP" in tag:
            continue
        good_words.append(word)
    write_word_list("brown.txt", good_words)
import nltk
from nltk.tag import UnigramTagger
from nltk.corpus import treebank

training = treebank.tagged_sents()[:7000]
unitagger = UnigramTagger(training)
print(treebank.sents()[0])
print(unitagger.tag(treebank.sents()[0]))
one_hot_multi.fit_transform(tagged_tweets)
# View the feature names
one_hot_multi.classes_

from nltk.corpus import brown
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

# Load tagged text from the Brown corpus, split into sentences
sentences = brown.tagged_sents(categories='news')
# Use 4000 sentences for training and 623 for testing
train = sentences[:4000]
test = sentences[4000:]
# Create the backoff taggers
unigram = UnigramTagger(train)
bigram = BigramTagger(train, backoff=unigram)
trigram = TrigramTagger(train, backoff=bigram)
# Check the accuracy
trigram.evaluate(test)

# TF-IDF
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the text data
text_data = np.array(
    ['I love Brazil. Brazil!', 'Sweden is best', 'Germany beats both'])
# Create the TF-IDF feature matrix
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)
# View the TF-IDF feature matrix
def __init__(self, corpus):
    dtag = DefaultTagger("NN")
    self.__utag = UnigramTagger(corpus.tagged_sents(), backoff=dtag)
import nltk
from nltk.tag import UnigramTagger
from nltk.tag import DefaultTagger
from nltk.corpus import treebank

testing = treebank.tagged_sents()[2000:]
training = treebank.tagged_sents()[:7000]
tag1 = DefaultTagger('NN')
tag2 = UnigramTagger(training, backoff=tag1)
print(tag2.evaluate(testing))
brown_tagged_sents = brown.tagged_sents(categories='news')
#print(brown_tagged_sents)
# [[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL')], ...]

default_tagger = nltk.DefaultTagger('NN')
print(default_tagger.evaluate(brown_tagged_sents))
# 0.13089484257215028

brown_tagged_sents2 = [[('The', 'AT'), ('Fulton', 'NP-TL'), ('manner', 'NN')]]
print(default_tagger.evaluate(brown_tagged_sents2))
# 0.3333333333333333

train_data = brown_tagged_sents[:int(len(brown_tagged_sents) * 0.9)]
test_data = brown_tagged_sents[int(len(brown_tagged_sents) * 0.9):]

unigram_tagger = UnigramTagger(train_data, backoff=default_tagger)
print(unigram_tagger.evaluate(test_data))
# 0.835841722316356

bigram_tagger = BigramTagger(train_data, backoff=unigram_tagger)
print(bigram_tagger.evaluate(test_data))
# 0.8454101465164956

trigram_tagger = TrigramTagger(train_data, backoff=bigram_tagger)
print(trigram_tagger.evaluate(test_data))
# 0.8427190272102063

regexp_tagger = RegexpTagger(
    [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
     (r'(The|the|A|a|An|an)$', 'AT'),  # articles
     (r'.*able$', 'JJ'),               # adjectives
    airportcode[row['airpt_cd']] = 'AIRPORT'
    airportcode[row['airpt_name']] = 'AIRPORT'
    airportcode[row['city_cd']] = 'CITY'

for row in db.codeshare_words.find():
    codeshareModel[row['word'].strip()] = 'CODESHARE'

TO_MODEL = {}
TO_MODEL['-'] = 'TO'
TO_MODEL['至'] = 'TO'
# ABOVE THREE CAN BE COLLAPSED INTO A GENERIC DATABASE TABLE FOR UNIGRAM TAGGERS

CURRENCY = set(currencycode.keys())
known_tourcodes = {tc: 'TC' for tc in TOURCODES}  # setup tourcodes model

train_sents = treebank.tagged_sents()[:3000]
unigramtagger = UnigramTagger(train_sents, backoff=backoff)
currencytagger = UnigramTagger(model=currencycode, backoff=unigramtagger)     # tag currency
airporttagger = UnigramTagger(model=airportcode, backoff=currencytagger)      # tag airports
codesharetagger = UnigramTagger(model=codeshareModel, backoff=airporttagger)  # tag codeshare
carriertagger = UnigramTagger(model=CARRIER_MODEL, backoff=codesharetagger)   # tag carriers
datetagger = UnigramTagger(model=monthModel, backoff=carriertagger)           # tag months
rtagger = RegexpTagger(patterns, backoff=datetagger)
known_tourcodes_tagger = UnigramTagger(model=known_tourcodes, backoff=rtagger)
dashtagger = UnigramTagger(model=TO_MODEL, backoff=known_tourcodes_tagger)
#!/usr/bin/python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals, division

from nltk.tag import UnigramTagger

if __name__ == '__main__':
    model = {u'Péter': 'N', 'Enikő': 'N', 'szeret': 'V', 'Marit': 'Nacc'}
    tagger = UnigramTagger(model=model)
    print(tagger.tag(['Péter', 'Enikő', 'szeret', 'Marit']))
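With a model dict like the one above, any token missing from the dict is tagged None; supplying a backoff tagger is the usual fix. The variation below is illustrative, not part of the original script:

from nltk.tag import DefaultTagger, UnigramTagger

model = {u'Péter': 'N', 'Enikő': 'N', 'szeret': 'V', 'Marit': 'Nacc'}
tagger = UnigramTagger(model=model, backoff=DefaultTagger('N'))
print(tagger.tag(['Péter', 'szeret', 'Marit', 'is']))  # 'is' falls back to 'N'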
import nltk
from nltk.corpus import treebank
from nltk.tag import UnigramTagger

training = treebank.tagged_sents()[:7000]
unitagger = UnigramTagger(training)
testing = treebank.tagged_sents()[2000:]
print(unitagger.evaluate(testing))
import nltk
from nltk.corpus import brown
from nltk.tag import UnigramTagger

tagger = UnigramTagger(brown.tagged_sents(categories='news')[:700])
sentence = ['John', 'and', 'Smith', 'went', 'to', 'NY', 'and', 'Germany']
for word, tag in tagger.tag(sentence):
    print(word, '->', tag)
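Words that never occur in the 700 training sentences come back with tag None; chaining a DefaultTagger as backoff gives them a fallback tag. This is an illustrative variation, not part of the snippet above:

from nltk.corpus import brown
from nltk.tag import DefaultTagger, UnigramTagger

tagger = UnigramTagger(brown.tagged_sents(categories='news')[:700],
                       backoff=DefaultTagger('NN'))
for word, tag in tagger.tag(['John', 'and', 'Smith', 'went', 'to', 'NY', 'and', 'Germany']):
    print(word, '->', tag)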
class FirstTagger(TaggerInterface):
    def __init__(self, corpus):
        dtag = DefaultTagger("NN")
        self.__utag = UnigramTagger(corpus.tagged_sents(), backoff=dtag)

    def tag(self, tense):
        """Translates tags generated by the tagger into the unified format

        Args:
            sentence: list of tuples (word and its form) which are after verb
        Returns:
            list of tuples (word and its form in unified format)
        """
        words = self.__utag.tag(tokenize(tense))

        for i, (word, form) in enumerate(words):
            word_info = {}

            if form[0] == 'V': word_info['klasa'] = 'czasownik'
            elif form[0] == 'S': word_info['klasa'] = 'rzeczownik'
            elif form[0] == 'A': word_info['klasa'] = 'przymiotnik'
            elif form[0] == 'N': word_info['klasa'] = 'liczebnik'
            elif form[0] == 'Z': word_info['klasa'] = 'zaimek'
            elif form[0] == 'D': word_info['klasa'] = 'przysłówek'
            elif form[0] == 'P': word_info['klasa'] = 'przyimek'
            elif form[0] == 'C': word_info['klasa'] = 'spójnik'
            elif form[0] == 'I': word_info['klasa'] = 'wykrzyknik'
            elif form[0] == 'T': word_info['klasa'] = 'partykuła'
            else: word_info['klasa'] = 'nieznany'

            if form[1] == 'S': word_info['liczba'] = 'pojedyńcza'
            elif form[1] == 'P': word_info['liczba'] = 'mnoga'

            if len(form) >= 3:
                if form[2] == 'N': word_info['przypadek'] = 'mianownik'
                elif form[2] == 'G': word_info['przypadek'] = 'dopełniacz'
                elif form[2] == 'D': word_info['przypadek'] = 'celownik'
                elif form[2] == 'A': word_info['przypadek'] = 'biernik'
                elif form[2] == 'I': word_info['przypadek'] = 'narzędnik'
                elif form[2] == 'L': word_info['przypadek'] = 'miejscownik'
                elif form[2] == 'V': word_info['przypadek'] = 'wołacz'

            if len(form) >= 4:
                if form[3] == 'M': word_info['rodzaj'] = 'm'
                elif form[3] == 'P': word_info['rodzaj'] = 'm'
                elif form[3] == 'A': word_info['rodzaj'] = 'm'
                elif form[3] == 'I': word_info['rodzaj'] = 'm'
                elif form[3] == 'F': word_info['rodzaj'] = 'ż'
                elif form[3] == 'N': word_info['rodzaj'] = 'n'
                elif form[3] == 'O': word_info['rodzaj'] = 'm'
                elif form[3] == 'R': word_info['rodzaj'] = 'ż'
                elif form[3] == 'T': word_info['rodzaj'] = 'ż'

            if len(form) >= 6:
                if form[5] == '1': word_info['osoba'] = 'pierwsza'
                elif form[5] == '2': word_info['osoba'] = 'druga'
                elif form[5] == '3': word_info['osoba'] = 'trzecia'
                elif form[5] == 'I': word_info['osoba'] = 'bezokolicznik'
                elif form[5] == 'B': word_info['osoba'] = 'bezosobnik'
                elif form[5] == 'U': word_info['osoba'] = 'imiesłów'
                elif form[5] == 'W': word_info['osoba'] = 'imiesłów'

            if len(form) >= 7:
                if form[6] == 'T': word_info['czas'] = 'teraźniejszy'
                elif form[6] == 'P': word_info['czas'] = 'przeszły'
                elif form[6] == 'F': word_info['czas'] = 'przyszły'

            if len(form) >= 8:
                if form[7] == 'O': word_info['tryb'] = 'oznajmujący'
                elif form[7] == 'P': word_info['tryb'] = 'przypuszczający'
                elif form[7] == 'R': word_info['tryb'] = 'rozkazujący'

            if len(form) >= 9:
                if form[8] == 'D': word_info['aspekt'] = 'dokonane'
                elif form[8] == 'N': word_info['aspekt'] = 'niedokonane'

            words[i] = (words[i][0], word_info)

        return words
def cltk_pos_cv(full_training_set, local_dir_rel):
    print("full_training_set", full_training_set)

    unigram_accuracies = []
    bigram_accuracies = []
    trigram_accuracies = []
    backoff_accuracies = []
    tnt_accuracies = []

    with open(full_training_set) as f:
        training_set_string = f.read()

    pos_set = training_set_string.split('\n\n')  # mk into a list

    sentence_count = len(pos_set)  # 3473
    tenth = math.ceil(int(sentence_count) / int(10))

    random.seed(0)
    random.shuffle(pos_set)

    def chunks(l, n):
        """Yield successive n-sized chunks from l.
        http://stackoverflow.com/a/312464
        """
        for i in range(0, len(l), n):
            yield l[i:i+n]

    # a list of 10 lists
    ten_parts = list(chunks(pos_set, tenth))  # a list of 10 lists with ~347 sentences each

    #for counter in list(range(10)):
    for counter, part in list(enumerate(ten_parts)):
        # map test list to part of given loop
        test_set = ten_parts[counter]  # or: test_set = part

        # filter out this loop's test index
        training_set_lists = [x for x in ten_parts if x is not ten_parts[counter]]

        # next concatenate the list together into 1 file ( http://stackoverflow.com/a/952952 )
        training_set = [item for sublist in training_set_lists for item in sublist]

        # save shuffled tests to file (as NLTK trainers expect)
        #local_dir_rel = '~/cltk_data/user_data'
        local_dir = os.path.expanduser(local_dir_rel)
        if not os.path.isdir(local_dir):
            os.makedirs(local_dir)

        test_path = os.path.join(local_dir, 'test.pos')
        with open(test_path, 'w') as f:
            f.write('\n\n'.join(test_set))

        train_path = os.path.join(local_dir, 'train.pos')
        with open(train_path, 'w') as f:
            f.write('\n\n'.join(training_set))

        # read POS corpora
        print("local_dir", local_dir)
        train_reader = TaggedCorpusReader(local_dir, 'train.pos')
        train_sents = train_reader.tagged_sents()

        test_reader = TaggedCorpusReader(local_dir, 'test.pos')
        test_sents = test_reader.tagged_sents()

        print('Loop #' + str(counter))

        # make unigram tagger
        unigram_tagger = UnigramTagger(train_sents)
        # evaluate unigram tagger
        unigram_accuracy = None
        unigram_accuracy = unigram_tagger.evaluate(test_sents)
        unigram_accuracies.append(unigram_accuracy)
        print('Unigram:', unigram_accuracy)

        # make bigram tagger
        bigram_tagger = BigramTagger(train_sents)
        # evaluate bigram tagger
        bigram_accuracy = None
        bigram_accuracy = bigram_tagger.evaluate(test_sents)
        bigram_accuracies.append(bigram_accuracy)
        print('Bigram:', bigram_accuracy)

        # make trigram tagger
        trigram_tagger = TrigramTagger(train_sents)
        # evaluate trigram tagger
        trigram_accuracy = None
        trigram_accuracy = trigram_tagger.evaluate(test_sents)
        trigram_accuracies.append(trigram_accuracy)
        print('Trigram:', trigram_accuracy)

        # make 1, 2, 3-gram backoff tagger
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger3 = TrigramTagger(train_sents, backoff=tagger2)
        # evaluate trigram tagger
        backoff_accuracy = None
        backoff_accuracy = tagger3.evaluate(test_sents)
        backoff_accuracies.append(backoff_accuracy)
        print('1, 2, 3-gram backoff:', backoff_accuracy)

        # make tnt tagger
        tnt_tagger = tnt.TnT()
        tnt_tagger.train(train_sents)
        # evaluate tnt tagger
        tnt_accuracy = None
        tnt_accuracy = tnt_tagger.evaluate(test_sents)
        tnt_accuracies.append(tnt_accuracy)
        print('TnT:', tnt_accuracy)

    final_accuracies_list = []

    mean_accuracy_unigram = mean(unigram_accuracies)
    standard_deviation_unigram = stdev(unigram_accuracies)
    uni = {'unigram': {'mean': mean_accuracy_unigram, 'sd': standard_deviation_unigram}}
    final_accuracies_list.append(uni)

    mean_accuracy_bigram = mean(bigram_accuracies)
    standard_deviation_bigram = stdev(bigram_accuracies)
    bi = {'bigram': {'mean': mean_accuracy_bigram, 'sd': standard_deviation_bigram}}
    final_accuracies_list.append(bi)

    mean_accuracy_trigram = mean(trigram_accuracies)
    standard_deviation_trigram = stdev(trigram_accuracies)
    tri = {'trigram': {'mean': mean_accuracy_trigram, 'sd': standard_deviation_trigram}}
    final_accuracies_list.append(tri)

    mean_accuracy_backoff = mean(backoff_accuracies)
    standard_deviation_backoff = stdev(backoff_accuracies)
    back = {'1, 2, 3-gram backoff': {'mean': mean_accuracy_backoff, 'sd': standard_deviation_backoff}}
    final_accuracies_list.append(back)

    mean_accuracy_tnt = mean(tnt_accuracies)
    standard_deviation_tnt = stdev(tnt_accuracies)
    tnt_score = {'tnt': {'mean': mean_accuracy_tnt, 'sd': standard_deviation_tnt}}
    final_accuracies_list.append(tnt_score)

    final_dict = {}
    for x in final_accuracies_list:
        final_dict.update(x)

    return final_dict
from nltk.tag import UnigramTagger, DefaultTagger
from nltk.corpus import treebank
from tag_util import train_sents, test_sents

# train
default_tagger = DefaultTagger('NN')
tagger = UnigramTagger(train_sents, backoff=default_tagger)

# test
print(tagger.evaluate(test_sents))

# save to pickle
import pickle
with open('unitagger.pkl', 'wb') as output:
    pickle.dump(tagger, output)

# load from pickle
with open('unitagger.pkl', 'rb') as data_file:
    tagger2 = pickle.load(data_file)
print(tagger2.evaluate(test_sents))

# or nltk.data.load('unitagger.pkl') to load
######## UNIGRAM TAGGER ##########

from nltk.tag import UnigramTagger
from nltk.corpus import treebank

# We use the first 3000 sentences of the treebank corpus as the training set to initialize
# the UnigramTagger class.
# Unigram tagger can be trained by giving it a list of tagged sentences at initialization.
train_sents = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train_sents)
print treebank.sents()[0]
print tagger.tag(treebank.sents()[0])

test_sents = treebank.tagged_sents()[3000:]
print tagger.evaluate(test_sents)

tagger = UnigramTagger(model={'Pierre': 'NN'})
tagger.tag(treebank.sents()[0])
def postag(
    templates=None,
    tagged_data=None,
    num_sents=1000,
    max_rules=300,
    min_score=3,
    min_acc=None,
    train=0.8,
    trace=3,
    randomize=False,
    ruleformat="str",
    incremental_stats=False,
    template_stats=False,
    error_output=None,
    serialize_output=None,
    learning_curve_output=None,
    learning_curve_take=300,
    baseline_backoff_tagger=None,
    separate_baseline_data=False,
    cache_baseline_tagger=None):
    """
    Brill Tagger Demonstration

    :param templates: the rule templates to be used in training
    :type templates: list of Template
    :param tagged_data: the corpus of tagged sentences to train and test on
    :type tagged_data: C{list}
    :param num_sents: how many sentences of training and testing data to use
    :type num_sents: C{int}
    :param max_rules: maximum number of rule instances to create
    :type max_rules: C{int}
    :param min_score: the minimum score for a rule in order for it to be considered
    :type min_score: C{int}
    :param min_acc: the minimum accuracy for a rule in order for it to be considered
    :type min_acc: C{float}
    :param train: the fraction of the corpus to be used for training (1=all)
    :type train: C{float}
    :param trace: the level of diagnostic tracing output to produce (0-4)
    :type trace: C{int}
    :param randomize: whether the training data should be a random subset of the corpus
    :type randomize: C{bool}
    :param ruleformat: rule output format, one of "str", "repr", "verbose"
    :type ruleformat: C{str}
    :param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow)
    :type incremental_stats: C{bool}
    :param template_stats: if true, will print per-template statistics collected in training and (optionally) testing
    :type template_stats: C{bool}
    :param error_output: the file where errors will be saved
    :type error_output: C{string}
    :param serialize_output: the file where the learned tbl tagger will be saved
    :type serialize_output: C{string}
    :param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available)
    :type learning_curve_output: C{string}
    :param learning_curve_take: how many rules plotted
    :type learning_curve_take: C{int}
    :param baseline_backoff_tagger: the backoff tagger used by the baseline (unigram) tagger
    :type baseline_backoff_tagger: tagger
    :param separate_baseline_data: use a fraction of the training data exclusively for training baseline
    :type separate_baseline_data: C{bool}
    :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround
        to get deterministic output from the baseline unigram tagger between python versions)
    :type cache_baseline_tagger: C{string}

    Note on separate_baseline_data: if False, reuse training data both for baseline and rule learner.
    This is fast and fine for a demo, but is likely to generalize worse on unseen data.
    Also cannot be sensibly used for learning curves on training data (the baseline will be artificially high).
    """

    # defaults
    baseline_backoff_tagger = baseline_backoff_tagger or REGEXP_TAGGER
    if templates is None:
        from nltk.tag.brill import describe_template_sets, brill24
        # some pre-built template sets taken from typical systems or publications are
        # available. Print a list with describe_template_sets()
        # for instance:
        templates = brill24()
    (training_data, baseline_data, gold_data, testing_data) = \
        _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data)

    # creating (or reloading from cache) a baseline tagger (unigram tagger)
    # this is just a mechanism for getting deterministic output from the baseline between
    # python versions
    if cache_baseline_tagger:
        if not os.path.exists(cache_baseline_tagger):
            baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
            with open(cache_baseline_tagger, 'w') as print_rules:
                pickle.dump(baseline_tagger, print_rules)
            print("Trained baseline tagger, pickled it to {0}".format(cache_baseline_tagger))
        with open(cache_baseline_tagger, "r") as print_rules:
            baseline_tagger = pickle.load(print_rules)
            print("Reloaded pickled tagger from {0}".format(cache_baseline_tagger))
    else:
        baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
        print("Trained baseline tagger")
    if gold_data:
        print("    Accuracy on test set: {0:0.4f}".format(baseline_tagger.evaluate(gold_data)))

    # creating a Brill tagger
    tbrill = time.time()
    trainer = BrillTaggerTrainer(baseline_tagger, templates, trace, ruleformat=ruleformat)
    print("Training tbl tagger...")
    brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc)
    print("Trained tbl tagger in {0:0.2f} seconds".format(time.time() - tbrill))
    if gold_data:
        print("    Accuracy on test set: %.4f" % brill_tagger.evaluate(gold_data))

    # printing the learned rules, if learned silently
    if trace == 1:
        print("\nLearned rules: ")
        for (ruleno, rule) in enumerate(brill_tagger.rules(), 1):
            print("{0:4d} {1:s}".format(ruleno, rule.format(ruleformat)))

    # printing template statistics (optionally including comparison with the training data)
    # note: if not separate_baseline_data, then baseline accuracy will be artificially high
    if incremental_stats:
        print("Incrementally tagging the test data, collecting individual rule statistics")
        (taggedtest, teststats) = brill_tagger.batch_tag_incremental(testing_data, gold_data)
        print("    Rule statistics collected")
        if not separate_baseline_data:
            print("WARNING: train_stats asked for separate_baseline_data=True; the baseline "
                  "will be artificially high")
        trainstats = brill_tagger.train_stats()
        if template_stats:
            brill_tagger.print_template_statistics(teststats)
        if learning_curve_output:
            _demo_plot(learning_curve_output, teststats, trainstats, take=learning_curve_take)
            print("Wrote plot of learning curve to {0}".format(learning_curve_output))
    else:
        print("Tagging the test data")
        taggedtest = brill_tagger.tag_sents(testing_data)
        if template_stats:
            brill_tagger.print_template_statistics()

    # writing error analysis to file
    if error_output is not None:
        with open(error_output, 'w') as f:
            f.write('Errors for Brill Tagger %r\n\n' % serialize_output)
            f.write(u'\n'.join(error_list(gold_data, taggedtest)).encode('utf-8') + '\n')
        print("Wrote tagger errors including context to {0}".format(error_output))

    # serializing the tagger to a pickle file and reloading (just to see it works)
    if serialize_output is not None:
        taggedtest = brill_tagger.tag_sents(testing_data)
        with open(serialize_output, 'w') as print_rules:
            pickle.dump(brill_tagger, print_rules)
        print("Wrote pickled tagger to {0}".format(serialize_output))
        with open(serialize_output, "r") as print_rules:
            brill_tagger_reloaded = pickle.load(print_rules)
        print("Reloaded pickled tagger from {0}".format(serialize_output))
        taggedtest_reloaded = brill_tagger.tag_sents(testing_data)
        if taggedtest == taggedtest_reloaded:
            print("Reloaded tagger tried on test set, results identical")
        else:
            print("PROBLEM: Reloaded tagger gave different results on test set")
def train_tagger(language, model_type, feature, train_sents):
    if model_type == 'unigram':
        tagger = UnigramTagger(train_sents)
    elif model_type == 'bigram':
        tagger = BigramTagger(train_sents)
    elif model_type == 'trigram':
        tagger = TrigramTagger(train_sents)
    elif model_type == 'backoff':
        tagger1 = UnigramTagger(train_sents)
        tagger2 = BigramTagger(train_sents, backoff=tagger1)
        tagger = TrigramTagger(train_sents, backoff=tagger2)
    elif model_type == 'crf':
        tagger = CRFTagger()
        tagger.train(train_sents, 'taggers/{0}/{1}/crf.pickle'.format(language, feature))
    elif model_type == 'perceptron':
        tagger = PerceptronTagger(load=False)
        tagger.train(train_sents)
    return tagger
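A possible call to the dispatcher above; the language and feature strings only matter for the CRF branch's pickle path, and the values used here are placeholders:

from nltk.corpus import treebank
from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger

tagger = train_tagger('english', 'backoff', 'pos', treebank.tagged_sents()[:3000])
print(tagger.tag(['This', 'is', 'a', 'test']))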
def postag(
    templates=None,
    tagged_data=None,
    num_sents=1000,
    max_rules=300,
    min_score=3,
    min_acc=None,
    train=0.8,
    trace=3,
    randomize=False,
    ruleformat="str",
    incremental_stats=False,
    template_stats=False,
    error_output=None,
    serialize_output=None,
    learning_curve_output=None,
    learning_curve_take=300,
    baseline_backoff_tagger=None,
    separate_baseline_data=False,
    cache_baseline_tagger=None):
    """
    Brill Tagger Demonstration

    :param templates: the rule templates to be used in training
    :type templates: list of Template
    :param tagged_data: the corpus of tagged sentences to train and test on
    :type tagged_data: C{list}
    :param num_sents: how many sentences of training and testing data to use
    :type num_sents: C{int}
    :param max_rules: maximum number of rule instances to create
    :type max_rules: C{int}
    :param min_score: the minimum score for a rule in order for it to be considered
    :type min_score: C{int}
    :param min_acc: the minimum accuracy for a rule in order for it to be considered
    :type min_acc: C{float}
    :param train: the fraction of the corpus to be used for training (1=all)
    :type train: C{float}
    :param trace: the level of diagnostic tracing output to produce (0-4)
    :type trace: C{int}
    :param randomize: whether the training data should be a random subset of the corpus
    :type randomize: C{bool}
    :param ruleformat: rule output format, one of "str", "repr", "verbose"
    :type ruleformat: C{str}
    :param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow)
    :type incremental_stats: C{bool}
    :param template_stats: if true, will print per-template statistics collected in training and (optionally) testing
    :type template_stats: C{bool}
    :param error_output: the file where errors will be saved
    :type error_output: C{string}
    :param serialize_output: the file where the learned tbl tagger will be saved
    :type serialize_output: C{string}
    :param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available)
    :type learning_curve_output: C{string}
    :param learning_curve_take: how many rules plotted
    :type learning_curve_take: C{int}
    :param baseline_backoff_tagger: the backoff tagger used by the baseline (unigram) tagger
    :type baseline_backoff_tagger: tagger
    :param separate_baseline_data: use a fraction of the training data exclusively for training baseline
    :type separate_baseline_data: C{bool}
    :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround
        to get deterministic output from the baseline unigram tagger between python versions)
    :type cache_baseline_tagger: C{string}

    Note on separate_baseline_data: if False, reuse training data both for baseline and rule learner.
    This is fast and fine for a demo, but is likely to generalize worse on unseen data.
    Also cannot be sensibly used for learning curves on training data (the baseline will be artificially high).
    """

    # defaults
    baseline_backoff_tagger = baseline_backoff_tagger or REGEXP_TAGGER
    if templates is None:
        from nltk.tag.brill import describe_template_sets, brill24
        # some pre-built template sets taken from typical systems or publications are
        # available. Print a list with describe_template_sets()
        # for instance:
        templates = brill24()
    (training_data, baseline_data, gold_data, testing_data) = \
        _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data)

    # creating (or reloading from cache) a baseline tagger (unigram tagger)
    # this is just a mechanism for getting deterministic output from the baseline between
    # python versions
    if cache_baseline_tagger:
        if not os.path.exists(cache_baseline_tagger):
            baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
            with open(cache_baseline_tagger, 'w') as print_rules:
                pickle.dump(baseline_tagger, print_rules)
            print("Trained baseline tagger, pickled it to {0}".format(cache_baseline_tagger))
        with open(cache_baseline_tagger, "r") as print_rules:
            baseline_tagger = pickle.load(print_rules)
            print("Reloaded pickled tagger from {0}".format(cache_baseline_tagger))
    else:
        baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger)
        print("Trained baseline tagger")
    if gold_data:
        print("    Accuracy on test set: {0:0.4f}".format(baseline_tagger.evaluate(gold_data)))

    # creating a Brill tagger
    tbrill = time.time()
    trainer = BrillTaggerTrainer(baseline_tagger, templates, trace, ruleformat=ruleformat)
    print("Training tbl tagger...")
    brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc)
    print("Trained tbl tagger in {0:0.2f} seconds".format(time.time() - tbrill))
    if gold_data:
        print("    Accuracy on test set: %.4f" % brill_tagger.evaluate(gold_data))

    # printing the learned rules, if learned silently
    if trace == 1:
        print("\nLearned rules: ")
        for (ruleno, rule) in enumerate(brill_tagger.rules(), 1):
            print("{0:4d} {1:s}".format(ruleno, rule.format(ruleformat)))

    # printing template statistics (optionally including comparison with the training data)
    # note: if not separate_baseline_data, then baseline accuracy will be artificially high
    if incremental_stats:
        print("Incrementally tagging the test data, collecting individual rule statistics")
        (taggedtest, teststats) = brill_tagger.batch_tag_incremental(testing_data, gold_data)
        print("    Rule statistics collected")
        if not separate_baseline_data:
            print("WARNING: train_stats asked for separate_baseline_data=True; the baseline "
                  "will be artificially high")
        trainstats = brill_tagger.train_stats()
        if template_stats:
            brill_tagger.print_template_statistics(teststats)
        if learning_curve_output:
            _demo_plot(learning_curve_output, teststats, trainstats, take=learning_curve_take)
            print("Wrote plot of learning curve to {0}".format(learning_curve_output))
    else:
        print("Tagging the test data")
        taggedtest = brill_tagger.batch_tag(testing_data)
        if template_stats:
            brill_tagger.print_template_statistics()

    # writing error analysis to file
    if error_output is not None:
        with open(error_output, 'w') as f:
            f.write('Errors for Brill Tagger %r\n\n' % serialize_output)
            for e in error_list(gold_data, taggedtest):
                f.write(e + '\n')
        print("Wrote tagger errors including context to {0}".format(error_output))

    # serializing the tagger to a pickle file and reloading (just to see it works)
    if serialize_output is not None:
        taggedtest = brill_tagger.batch_tag(testing_data)
        with open(serialize_output, 'w') as print_rules:
            pickle.dump(brill_tagger, print_rules)
        print("Wrote pickled tagger to {0}".format(serialize_output))
        with open(serialize_output, "r") as print_rules:
            brill_tagger_reloaded = pickle.load(print_rules)
        print("Reloaded pickled tagger from {0}".format(serialize_output))
        taggedtest_reloaded = brill_tagger.batch_tag(testing_data)
        if taggedtest == taggedtest_reloaded:
            print("Reloaded tagger tried on test set, results identical")
        else:
            print("PROBLEM: Reloaded tagger gave different results on test set")
import re
import nltk
import pickle

# Importing lemmatizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Importing tagger
from nltk.tag import UnigramTagger
from nltk.corpus import treebank
train_sents = treebank.tagged_sents()
tagger = UnigramTagger(train_sents)

# Importing replacers
from replacers import RegexReplacer
from replacers import AntonymReplacer
replacer = RegexReplacer()

from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer("[\w']+")

from random import shuffle

# Importing Chunkers
import chunkers
from nltk.corpus import treebank_chunk
chunker = chunkers.TagChunker(treebank_chunk.chunked_sents())

max_key = 100
        (r'.*s$', 'NNS'),                 # plural nouns
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
        (r'.*', 'NN')                     # nouns (default)
]
rt = RegexpTagger(patterns)
print rt.evaluate(test_data)
print rt.tag(tokens)

## N gram taggers
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.tag import TrigramTagger

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

print ut.evaluate(test_data)
print ut.tag(tokens)

print bt.evaluate(test_data)
print bt.tag(tokens)

print tt.evaluate(test_data)
print tt.tag(tokens)

def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
# nouns only
nouns_bow = cleaned_bow
for word in cleaned_bow.columns:
    pos = pos_tag(list(word))[0][1]
    if pos != 'NN':
        nouns_bow = nouns_bow.drop(word, axis=1)

topic_words(NMF_vars(10, nouns_bow)[0], nouns_bow)
topic_words(
    NMF_vars(5, nouns_bow.drop('girl', axis=1))[0],
    nouns_bow.drop('girl', axis=1))

# try different tagger
nouns_bow_2 = cleaned_bow
tagger = UnigramTagger(brown.tagged_sents())
for word in cleaned_bow.columns:
    pos = tagger.tag(list(word))[0][1]
    if pos != 'NN':
        nouns_bow_2 = nouns_bow_2.drop(word, axis=1)

for num in range(2, 6):
    topic_words(NMF_vars(num, nouns_bow_2)[0], nouns_bow_2)

topic_words(NMF_vars(10, nouns_bow_2)[0], nouns_bow_2)

# remove the word 'total'
topic_words(
    NMF_vars(10, nouns_bow_2.drop('total', axis=1))[0],
    nouns_bow_2.drop('total', axis=1))
def __init__(self, corpus, isPl):
    if isPl:
        self.tagger = FirstTagger(corpus)
    else:
        dtag = DefaultTagger("NN")
        self.__utag = UnigramTagger(corpus.tagged_sents(), backoff=dtag)
import nltk
import json
from nltk.corpus import brown
from nltk.tag import UnigramTagger

tagger = UnigramTagger(brown.tagged_sents(tagset='universal'))
sent = ['Mitchell', 'decried', 'the', 'high', 'rate', 'of', 'unemployment']
for word, tag in tagger.tag(sent):
    if tag == "VERB":
        print(word, '->', tag)

verbs_tagged = open("../assets/inputText/verbs_tagged_questions.txt", 'w+')
with open("../assets/inputText/all_questions.txt", 'r') as all_lines:
    for line in all_lines:
        splitLine = line.split(' ')
        for word, tag in tagger.tag(splitLine):
            if tag == "VERB":
                verbs_tagged.write(word + "\n")
                #verbs_tagged.write(word + " \"" + line[:-1] + "\"\n")
import string
'''import replacer
from replacer import RegexpReplacer
from replacer import RepeatReplacer'''
import linecache
import matplotlib.pyplot as plt

'''
Train Tagger
'''
from nltk.tag import DefaultTagger
from nltk.tag import UnigramTagger
from nltk.tag import BigramTagger
from nltk.corpus import treebank
train = treebank.tagged_sents()[:10000]
t0 = DefaultTagger('NN')
t1 = UnigramTagger(train, backoff=t0)
t2 = BigramTagger(train, backoff=t1)

'''
Initialize
'''
my_corp = web.sents(fileids='firefox.txt')
sent_count = 0
ques_count = 0
All_count = 1
NN_count = 0
NNS_count = 0
NNP_count = 0
VB_count = 0
VBN_count = 0
VBG_count = 0
VBD_count = 0
    # ...be searched in each of the tree's children, concatenating the result
    # if it is favourable
    for child in myTree:
        if (type(child) is Tree):
            list_of_phrases = ExtractPhrases(child, phrase)
            if (len(list_of_phrases) > 0):
                myPhrases.extend(list_of_phrases)

    # Return the list of patterns found
    return myPhrases

# Create the default tagger so that unknown words are treated as nouns (N)
etiqPadrao = DefaultTagger('N')
# Take the training set from mac_morpho's tagged_sents()
sentencas_treinadoras = mac_morpho.tagged_sents()[0:15000]
# Create the UnigramTagger on top of the default tagger and train it with the tagged sentences from mac_morpho
etiq = UnigramTagger(sentencas_treinadoras, backoff=etiqPadrao)

coment = str(input("Entre com o texto: "))
if coment == "default":
    coment = open("default.txt", "r").read().replace("\n", " ")

# Convert the text into tokens
tokens = nltk.word_tokenize(coment.lower())
# Tag each token of the text
tags = etiq.tag(tokens)

# Create the regular-expression parser containing the patterns of interest
analiseGramatical = RegexpParser(r"""
        PADRAO7: {<N><ADJ>}
        PADRAO1: {<ADJ><N>(<PREP>?<N>)*}
        PADRAO2: {<ADV><ADV>?<ADJ>(<N>(<PREP>?<N>)*)?}
        PADRAO3: {<N>(<PREP>?<N>)*(<ADJ>)<ADV><ADV>?}