def __init__(self, lda_based_context):
    self.lda_based_context = lda_based_context
    self.doc = \
        Document(Constants.ITEM_TYPE + '-topic-models-nouns-complete')
    self.num_cols = 5
    self.num_topics = Constants.LDA_NUM_TOPICS
    self.rgb_tuples = None
    self.automatic_context_topic_colors = None
    self.keyword_context_topic_colors = None
    self.manual_context_topic_colors = None
    self.automatic_context_topic_ids = None
    self.keyword_context_topic_ids = None
    self.manual_context_topic_ids = None
    self.automatic_context_topic_words = None
    self.keyword_context_topic_words = None
    self.manual_context_topic_words = None
    self.headers = None
    self.topic_words_map = None
    self.table_format = '|c|' + 'c|' * (self.num_cols + 1)
    self.tagger = nltk.PerceptronTagger()
    self.tag_count_map = {'NN': 0, 'JJ': 0, 'VB': 0}

    self.init_colors()
    self.init_headers()
    self.init_topic_words()
    self.init_topic_ids()

    new_comm = UnsafeCommand(
        'newcommand', r'\exampleCommand', options=4,
        extra_arguments=r'\colorbox[rgb]{#1,#2,#3}{#4} \color{black}')
    self.doc.append(new_comm)
    new_comm2 = UnsafeCommand('tiny')
    self.doc.append(new_comm2)
def create_vocab_directional(corpus):
    vocab = {}
    tagger = nltk.PerceptronTagger()
    # keep only nouns, verbs and adjectives
    with open(corpus, 'r') as f:
        for line in f:
            sentences = nltk.sent_tokenize(line)
            for sentence in sentences:
                sentence = alt_alpha(sentence).lower()
                words = sentence.split()
                pos_words = tagger.tag(words)
                words = []
                for word, pos in pos_words:
                    if pos.startswith('NN') or pos.startswith('JJ') \
                            or pos.startswith('VB'):
                        words.append(word)
                words = [
                    lemmatizer.lemmatize(word) for word in words
                    if word not in stopwords
                ]
                for word in words:
                    if word not in vocab:
                        vocab[word] = len(vocab)
                        vocab[str(word) + '/l'] = len(vocab)
                        vocab[str(word) + '/r'] = len(vocab)
    return vocab
class APTaggerUtils(object):

    tagger = nltk.PerceptronTagger()

    def tag(self, tokens, tagset=None):
        tagged_tokens = APTaggerUtils.tagger.tag(tokens)
        if tagset:
            tagged_tokens = [(token, nltk.map_tag('en-ptb', tagset, tag))
                             for (token, tag) in tagged_tokens]
        return tagged_tokens
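# Usage sketch for APTaggerUtils above; the token list is made up, and
# 'universal' is a target tagset name accepted by nltk.map_tag. With
# tagset=None the raw Penn Treebank tags from the pretrained model are kept.
ap_utils = APTaggerUtils()
ptb_tagged = ap_utils.tag(['the', 'dog', 'barks'])
universal_tagged = ap_utils.tag(['the', 'dog', 'barks'], tagset='universal')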
def create_vocab_count(corpus):
    vocab_count = {}
    tagger = nltk.PerceptronTagger()
    # keep only nouns, verbs and adjectives
    with open(corpus, 'r') as f:
        for line in f:
            sentences = nltk.sent_tokenize(line)
            for sentence in sentences:
                sentence = clean_sentence(sentence).lower()
                words = sentence.split()
                pos_words = tagger.tag(words)
                words = []
                for word, pos in pos_words:
                    if pos.startswith('NN') or pos.startswith('JJ') \
                            or pos.startswith('VB'):
                        words.append(word)
                words = [word for word in words if word not in stopwords]
                for word in words:
                    if word in vocab_count:
                        vocab_count[word] += 1
                    else:
                        vocab_count[word] = 1
    return vocab_count
def tag_words(text, tagger=None):
    """
    Tags the words contained in the given text using part-of-speech tags.
    The text is split into sentences, each sentence is tokenized and tagged,
    and the tagged words of all sentences are returned in a single flat list.

    :param tagger: a part-of-speech tagger. Passing a pre-built tagger avoids
     re-initializing it every time this method is called, since the
     initialization can take a long time.
    :param text: the text to tag
    :return: a list of (word, tag) pairs
    """
    sentences = get_sentences(text)
    tokenized_sentences = [
        get_words_from_sentence(sent.lower()) for sent in sentences
    ]

    if tagger is None:
        tagger = nltk.PerceptronTagger()

    tagged_words = []
    for sent in tokenized_sentences:
        tagged_words.extend(tagger.tag(sent))

    return tagged_words
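# Usage sketch for tag_words above; the review strings are made up. Building
# the PerceptronTagger once and passing it in avoids paying the model-loading
# cost on every call.
shared_tagger = nltk.PerceptronTagger()
reviews = ["The pizza was great.", "Service was slow but friendly."]
tagged_reviews = [tag_words(review, tagger=shared_tagger) for review in reviews]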
def create_bag_of_words(document_list):
    """
    Creates a bag of words representation of the document list given.
    It removes the punctuation and the stop words.

    :type document_list: list[str]
    :param document_list:
    :rtype: list[list[str]]
    :return:
    """
    tokenizer = RegexpTokenizer(r'\w+')
    tagger = nltk.PerceptronTagger()
    cached_stop_words = set(stopwords.words("english"))
    cached_stop_words |= {
        't', 'didn', 'doesn', 'haven', 'don', 'aren', 'isn', 've', 'll',
        'couldn', 'm', 'hasn', 'hadn', 'won', 'shouldn', 's', 'wasn', 'wouldn'
    }
    body = []
    processed = []

    for i in range(0, len(document_list)):
        body.append(document_list[i].lower())

    for entry in body:
        row = tokenizer.tokenize(entry)
        tagged_words = tagger.tag(row)

        nouns = []
        for tagged_word in tagged_words:
            if tagged_word[1].startswith('NN'):
                nouns.append(tagged_word[0])

        nouns = [word for word in nouns if word not in cached_stop_words]
        processed.append(nouns)

    return processed
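# Usage sketch for create_bag_of_words above; the documents are made up.
# Each returned sublist holds the non-stopword nouns of one document.
docs = [
    "The hotel room was clean and the staff was friendly.",
    "Great food, terrible parking.",
]
bags = create_bag_of_words(docs)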
def __init__(self):
    super().__init__(nltk.PerceptronTagger(), self.name)
print()
start_time = time.process_time()
print("* TnT tagger (not in NLTK book)")
tnt_tagger = nltk.TnT()
tnt_tagger.train(train_sents)
print("Seen:", show_example(tnt_tagger.tag(seen_example)))
print("Unseen:", show_example(tnt_tagger.tag(unseen_example)))
print("Evaluation: {:.1%}".format(tnt_tagger.evaluate(test_sents)))
print("Elapsed time: {:.0f} s".format(time.process_time() - start_time))

print()
start_time = time.process_time()
print("* HMM tagger (not in NLTK book)")
hmm_tagger = nltk.HiddenMarkovModelTagger.train(train_sents)
print("Seen:", show_example(hmm_tagger.tag(seen_example)))
print("Unseen:", show_example(hmm_tagger.tag(unseen_example)))
print("Evaluation: {:.1%}".format(hmm_tagger.evaluate(test_sents)))
print("Elapsed time: {:.0f} s".format(time.process_time() - start_time))

print()
start_time = time.process_time()
print("* Perceptron tagger (not in NLTK book)")
perp_tagger = nltk.PerceptronTagger(load=False)
perp_tagger.train(train_sents)
print("Seen:", show_example(perp_tagger.tag(seen_example)))
print("Unseen:", show_example(perp_tagger.tag(unseen_example)))
print("Evaluation: {:.1%}".format(perp_tagger.evaluate(test_sents)))
print("Elapsed time: {:.0f} s".format(time.process_time() - start_time))

print()
def perceptron_tag(train_):
    # build an untrained averaged-perceptron tagger (load=False skips the
    # pretrained English model) and train it on the given tagged sentences
    perc_tagger = nltk.PerceptronTagger(load=False)
    perc_tagger.train(train_)
    return perc_tagger
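# Usage sketch for perceptron_tag above; it trains on a slice of the Treebank
# sample that ships with NLTK (the slice size is arbitrary and assumes the
# 'treebank' corpus has been downloaded).
train_sents = nltk.corpus.treebank.tagged_sents()[:3000]
tb_tagger = perceptron_tag(train_sents)
print(tb_tagger.tag(['Stocks', 'rallied', 'on', 'Friday', '.']))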
        Notes:
            Can raise an exception if Java Development Kit is not installed
            or not properly configured.

        Examples:
            >>> try:
            ...     StanfordPOSTagger.check('path/to/model', 'path/to/stanford.jar')
            ... except ValueError as e:
            ...     print(e)
            Could not find stanford-postagger.jar jar file at path/to/stanford.jar
        """
        try:
            cls(path_to_model, path_to_jar).tag(())
        except OSError as e:
            raise ValueError(
                'Either Java SDK not installed or some of the files are invalid.\n'
                + str(e))
        except LookupError as e:
            raise ValueError(str(e).strip(' =\n'))

    def __str__(self):
        return "{} (model: {})".format(self.name, self._stanford_model)


taggers = [
    POSTagger(nltk.PerceptronTagger(), 'Averaged Perceptron Tagger'),
    POSTagger(
        nltk.data.load('taggers/maxent_treebank_pos_tagger/english.pickle'),
        'Treebank POS Tagger (MaxEnt)'),
]
# 5.1. Explain in your own words how the averaged perceptron algorithm works
#
# The averaged perceptron predicts the tag of a word from features built over
# the word and its surrounding context. Its predict function scores each
# candidate tag by taking the dot product of the feature vector with that
# tag's weights and picks the highest-scoring tag. During training we predict
# a tag for each word: if the prediction is correct the weights stay
# unchanged; if not, we increase the weights of the correct tag's features
# and decrease the weights of the wrongly predicted tag's features. The final
# model averages the weights over all updates, which makes it less sensitive
# to the last examples seen. (A minimal sketch of the update step appears
# below.)
#
# 5.2. Train the averaged perceptron tagger on the Brown dataset (full
# stratified training dataset with 90% of the sentences).

# In[38]:

from nltk.tag.perceptron import PerceptronTagger

# In[39]:

# PerceptronTagger does not take training data in its constructor (the first
# argument is `load`); build an untrained tagger and call train() explicitly.
PerceptronAv = nltk.PerceptronTagger(load=False)
PerceptronAv.train(stratified_split_train)

# 5.3. Report on accuracy, and per-tag precision, recall, F and confusion matrix.
#
# The PerceptronTagger uses a different tagset (Penn Treebank) while all our
# previous work uses the Universal tagset, so we map the Penn Treebank tagset
# to the Universal tagset. This mapping lets us re-use previous code without
# change.

# In[40]:

from nltk.tag import mapping
tag_dict = mapping.tagset_mapping('en-ptb', 'universal')

# In[41]:
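# Minimal sketch of the update step described in 5.1 above. This is not
# NLTK's actual implementation; the feature and weight structures are
# simplified to a dict mapping (feature, tag) pairs to floats.
def perceptron_update(weights, features, gold_tag, predicted_tag):
    if predicted_tag == gold_tag:
        return  # correct prediction: weights stay unchanged
    for feature in features:
        # reward the features under the correct tag ...
        weights[(feature, gold_tag)] = weights.get((feature, gold_tag), 0.0) + 1.0
        # ... and penalize them under the wrongly predicted tag
        weights[(feature, predicted_tag)] = weights.get((feature, predicted_tag), 0.0) - 1.0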
class TextBayes:
    """ Naive Bayes Classifier for text. """

    tagger = nltk.PerceptronTagger()
    lemmatizer = nltk.WordNetLemmatizer()
    stop_words = stopwords.words('english')
    UNKNOWN_TOKEN = -1

    def __init__(self, smoothing='add-one'):
        self._smoothing = smoothing
        if self._smoothing not in [None, 'add-one']:
            raise Exception('Unknown smoothing option: {}'.format(smoothing))

        self._classes = []
        """ A list of classes that need to be distinguished """

        self._priors = {}
        """ Dictionary from class (string) to prior, sum of all priors 1 """

        self._cond_probabilities = {}
        """
        Dictionary from class (string) to dictionary from token (string) to
        probability, for example:
            _cond_probabilities['formation']['together'] = P[token together | class formation]
        If smoothing is not None, the inner dictionaries have one extra key:
        UNKNOWN_TOKEN, which has its own probability
        """

    def train(self, paragraphs, classes):
        """
        :param paragraphs: A list of paragraphs (strings), where each
            paragraph has a different class
        :param classes: A list, same length as x, where each entry is the
            class name (string) for each paragraph in x
        """
        if len(paragraphs) != len(classes):
            raise Exception(
                'Parameters `paragraphs` and `classes` should match in size ({}, {}).'
                .format(len(paragraphs), len(classes)))

        class_counts = Counter(classes)
        self._classes = list(class_counts.keys())
        for c in self._classes:
            self._priors[c] = class_counts[c] / len(classes)
        for c in self._classes:
            self._cond_probabilities[c] = {}

        # create a bag of words for each class
        word_bags = {}
        for clazz in self._classes:
            word_bags[clazz] = []
        for paragraph_i in range(len(paragraphs)):
            paragraph_strip = TextBayes.break_down(paragraphs[paragraph_i])
            clazz = classes[paragraph_i]
            word_bags[clazz].extend(paragraph_strip)

        # create a multiset for each bag of words
        for clazz in self._classes:
            word_bags[clazz] = Counter(word_bags[clazz])

        # compute conditional probability for each word in each bag
        for clazz in self._classes:
            bag_size = sum(word_bags[clazz].values())
            types_count = len(word_bags[clazz])
            for token, count in word_bags[clazz].items():
                if self._smoothing is None:
                    self._cond_probabilities[clazz][token] = count / bag_size
                else:  # add-one smoothing
                    self._cond_probabilities[clazz][token] = (count + 1) / (
                        bag_size + types_count)
            if self._smoothing == 'add-one':
                self._cond_probabilities[clazz][
                    TextBayes.UNKNOWN_TOKEN] = 1 / (bag_size + types_count)

    def conditional_probability(self, clazz, token):
        if len(self._classes) == 0:
            raise Exception('The classifier has not been trained yet.')
        if clazz not in self._classes:
            raise Exception('Unknown class.')
        if self._smoothing is None:
            return 0 if token not in self._cond_probabilities[clazz] \
                else self._cond_probabilities[clazz][token]
        else:  # add-one
            return self._cond_probabilities[clazz][TextBayes.UNKNOWN_TOKEN] \
                if token not in self._cond_probabilities[clazz] \
                else self._cond_probabilities[clazz][token]

    def predict(self, paragraph):
        """
        Calculates the most probable class that the paragraph belongs to
        :param paragraph: A string made up of one or more sentences
        :return: A prediction of the class that the paragraph belongs to
        """
        probabilities = self.belong_probabilities(paragraph)
        return argmax(probabilities)

    def belong_probabilities(self, paragraph):
        """
        :param paragraph: A string made up of one or more sentences
        :return: A dictionary from class names to probability, stating the
            probability of the paragraph belonging to each class
        """
        # To prevent underflow, we use log-likelihoods instead of likelihoods,
        # and so we add up log-probabilities instead of multiplying
        # probabilities
        loglikelihoods = {}
        for clazz in self._classes:
            cur_likelihood = 0
            for token in TextBayes.break_down(paragraph):
                cond_probability = self.conditional_probability(clazz, token)
                cur_likelihood += log2(cond_probability)
            cur_likelihood += log2(self._priors[clazz])
            loglikelihoods[clazz] = cur_likelihood

        # Because we are interested in the ratio between the different
        # likelihoods, we can divide all of the likelihoods by a constant
        # amount, which is the same as subtracting a constant from the
        # log-likelihoods (specifically we subtract the maximum
        # log-likelihood). Then we exponentiate 2 by the new values to get
        # normalized likelihood values
        likelihoods_normalized = {}
        max_likelihood = max(loglikelihoods.values())
        for clazz, loglike in loglikelihoods.items():
            likelihoods_normalized[clazz] = 2 ** (loglike - max_likelihood)

        # Compute the ratios between the likelihoods to get probabilities
        ans = {}
        sum_norm_likelihoods = sum(likelihoods_normalized.values())
        for clazz, norm_likelihood in likelihoods_normalized.items():
            ans[clazz] = norm_likelihood / sum_norm_likelihoods
        return ans

    @staticmethod
    def break_down(paragraph):

        def break_down_weak(paragraph):
            """
            Use natural language processing tools to break down the paragraph
            into a sequence of tokens
            :param paragraph: A string made up of one or more sentences
            :return: A list of tokens (strings) from the paragraph
            """
            tokens = word_tokenize(paragraph)
            return tokens

        def break_down_strong(paragraph):
            """
            Use natural language processing tools to break down the paragraph
            into a sequence of lemmatized words. Removes English stop words,
            punctuation, and numbers.
            :param paragraph: A string made up of one or more sentences
            :return: A list of words (strings) from the paragraph
            """
            tokens = word_tokenize(paragraph)
            parts_of_speech = TextBayes.tagger.tag(tokens)
            parts_of_speech = [(t[0], get_wordnet_tag(t[1]))
                               for t in parts_of_speech]
            lemmatized = [
                TextBayes.lemmatizer.lemmatize(t[0], pos=t[1])
                for t in parts_of_speech
            ]
            lowercase = [t.lower() for t in lemmatized]
            return [
                t for t in lowercase
                if t not in string.punctuation
                and t not in TextBayes.stop_words and not is_number(t)
            ]

        return break_down_weak(paragraph)

    @staticmethod
    def from_file(xml_file, smoothing='add-one'):
        """
        Create a TextBayes classifier from a given corpus
        :param xml_file: XML file path, that has the corpus. The file's
            structure: corpus > lexelt > instances
        :param smoothing: Smoothing technique for the classifier
        :return: Trained TextBayes object
        """
        instance_list = get_instance_list(xml_file)
        paragraphs = [get_paragraph(instance) for instance in instance_list]
        senses = [get_sense(instance) for instance in instance_list]
        ans = TextBayes(smoothing=smoothing)
        ans.train(paragraphs, senses)
        return ans
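# Usage sketch for TextBayes above; the training paragraphs and class labels
# are made up, and the module-level imports referenced by the class
# (word_tokenize, Counter, log2, stopwords, string, nltk) are assumed to be
# available. belong_probabilities is used directly so the sketch does not
# depend on the module's argmax helper.
clf = TextBayes(smoothing='add-one')
clf.train(
    ["The striker scored twice in the final.", "The senate passed the bill."],
    ["sports", "politics"])
probs = clf.belong_probabilities("The midfielder scored a late goal.")
print(max(probs, key=probs.get))  # most probable class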
def __call__(self, tokens):
    if self._tagger is None:
        self._tagger = nltk.PerceptronTagger()
    return [tag for _, tag in self._tagger.tag(tokens)]
def __init__(self):
    NltkNormalizer.install_nltk_corpora('averaged_perceptron_tagger')
    self.normalizer = NltkNormalizer()
    self.lem = nltk.WordNetLemmatizer()
    self.tagger = nltk.PerceptronTagger()
    self.translation_dict = {'J': wn.ADJ, 'N': wn.NOUN, 'R': wn.ADV, 'V': wn.VERB}
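# Illustration of how a mapping like translation_dict above is typically used
# for POS-aware lemmatization. lemmatize_with_pos is a hypothetical helper,
# not part of the original class, and it assumes the same wn (WordNet) import
# the class relies on.
def lemmatize_with_pos(tagger, lemmatizer, translation_dict, tokens):
    tagged = tagger.tag(tokens)
    lemmas = []
    for word, tag in tagged:
        # map e.g. 'NN'/'NNS' -> wn.NOUN, 'VBD' -> wn.VERB via the tag's first letter
        wordnet_pos = translation_dict.get(tag[:1], wn.NOUN)
        lemmas.append(lemmatizer.lemmatize(word, pos=wordnet_pos))
    return lemmas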
print(f"Training data size: {len(training_data)}") print(f"Validation data size: {len(test_data)}") print( f"Expected size: {len(training_data)+len(test_data)}, actual size: {len(brown_news_tagged)}" ) # Instantiate and train the following taggers: Unigram; TnT; Perceptron; CRF print("\nTRAINING MODELS") unigram = nltk.UnigramTagger(training_data) print("Unigram Tagger trained") TnT = nltk.TnT() TnT.train(training_data) print("TnT Tagger trained") perceptron = nltk.PerceptronTagger(training_data) print("Perceptron Tagger trained") #CRF = nltk.CRFTagger() #CRF.train(training_data, "model.crf.tagger") #print("CRF Tagger trained") # Save the trained taggers (in a LABA Taggers Map), overwrite existing import pickle from pickle import dump print("\nSAVING MODELS") ugOutput = open("Unigram.pkl", 'wb') dump(unigram, ugOutput, -1) ugOutput.close() print("Trained Unigram Tagger Saved")
def train(self):
    # train_data = nltk.corpus.brown.tagged_sents(categories=['news', 'science_fiction'])
    # Loads NLTK's pretrained averaged-perceptron model rather than training
    # a new tagger on train_data.
    self.tagger = nltk.PerceptronTagger()
    self._trained = True
    return None
# Import packages
import nltk
import spacy
import itertools
from collections import Counter
import os
from scipy.sparse import csr_matrix, dok_matrix, save_npz
import argparse
import re
import gc
import numpy as np
from tqdm import tqdm
import pickle

tagger = nltk.PerceptronTagger()
lemmatizer = nltk.stem.WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')


# Pre-processing
def clean_sentence(sentence):
    new_sent = []
    words = sentence.split()
    words = list(itertools.chain.from_iterable([w.split(',') for w in words]))
    words = list(itertools.chain.from_iterable([w.split('-') for w in words]))
    for word in words:
        new_sent.append(''.join(w for w in word if w.isalpha()))
    return re.sub(r'\s\s+', r' ', ' '.join(new_sent).strip())
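# Example call for clean_sentence above; the input string is made up. Commas
# and hyphens become word boundaries and non-alphabetic characters are
# dropped before whitespace is collapsed.
print(clean_sentence("Well-known cafes, 24-hour diners!"))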
training = brown_tagged[int(length / 5):]
test = brown_tagged[:100]

already_trained = os.path.isfile('unigram_tagger.pkl') and os.path.isfile(
    'tnt_tagger.pkl') and os.path.isfile('perceptron_tagger.pkl')

if (not already_trained):
    # Training
    unigram = nltk.UnigramTagger(training)
    print("Trained Unigram.")

    tnt = nltk.TnT()
    tnt.train(training)
    print("Trained TnT.")

    perceptron = nltk.PerceptronTagger()
    perceptron.train(training)
    print("Trained Perceptron.")

    # CRF skipped due to lack of time to train.
    # crf = nltk.CRFTagger()
    # crf.train(training, 'model.crf.tagger')
    # print("Trained CRF.")

    # Dump trained models as files for later use.
    unigram_output = open('unigram_tagger.pkl', 'wb')
    tnt_output = open('tnt_tagger.pkl', 'wb')
    perceptron_output = open('perceptron_tagger.pkl', 'wb')
    dump(unigram, unigram_output, -1)
    unigram_output.close()
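# Loading sketch that mirrors the dump calls above; it assumes the .pkl files
# were written by a previous run of the training branch.
from pickle import load
with open('perceptron_tagger.pkl', 'rb') as f:
    perceptron = load(f)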
def generateList(self, command):
    if command == '':
        command = self.command
    tags = nltk.word_tokenize(command)
    pos_tag = nltk.PerceptronTagger().tag(tags)
    return pos_tag