def get_high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """Return the words whose score under at least one label reaches ``min_score``.

    :param labelled_words: iterable of ``(label, words)`` pairs, where
        ``words`` is an iterable of the word tokens seen under ``label``.
    :param score_fn: callable invoked as
        ``score_fn(n_ii, (n_ix, n_xi), n_xx)``; defaults to
        ``BigramAssocMeasures.chi_sq``.
    :param min_score: inclusive lower bound a word's score must reach.
    :return: set of every word scoring at least ``min_score`` for one or
        more labels.
    """
    overall_counts = FreqDist()
    per_label_counts = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            overall_counts[word] += 1
            per_label_counts[label][word] += 1

    # Total number of counted (label, word) occurrences.
    grand_total = per_label_counts.N()

    selected = set()
    for label in per_label_counts.conditions():
        label_counts = per_label_counts[label]
        label_total = label_counts.N()
        for word, count_in_label in label_counts.items():
            score = score_fn(
                count_in_label, (overall_counts[word], label_total), grand_total
            )
            if score >= min_score:
                selected.add(word)
    return selected
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    Observations are lowercased so they match the (lowercased) test data.
    No ``<s>`` / ``</s>`` markers are added to the sentences.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    # Flatten the sentences into one stream of (word, tag) pairs.
    # ``chain(train_data)`` with a single argument would just re-yield the
    # sentences themselves, so ``chain.from_iterable`` is required here.
    tagged_words = chain.from_iterable(train_data)

    # Repack as (tag, lowercase_word): the tag is the condition, the word
    # is the observed sample.
    data = [(tag, word.lower()) for (word, tag) in tagged_words]
    emission_FD = ConditionalFreqDist(data)

    def lidstone_PD(FD):
        # Lidstone smoothing with gamma 0.01 and one extra bin beyond the
        # samples observed for this tag.
        return LidstoneProbDist(FD, gamma=0.01, bins=FD.B() + 1)

    self.emission_PD = ConditionalProbDist(emission_FD, lidstone_PD)

    # The tags observed in training are the states.
    self.states = list(emission_FD.conditions())

    return self.emission_PD, self.states
def emission_model(self, train_data):
    """
    Compute an emission model using a ConditionalProbDist.

    Each observation is lowercased before counting; no ``<s>`` / ``</s>``
    markers are added to the input sentences.

    :param train_data: The training dataset, a list of sentences with tags
    :type train_data: list(list(tuple(str,str)))
    :return: The emission probability distribution and a list of the states
    :rtype: Tuple[ConditionalProbDist, list(str)]
    """
    # Flatten to (tag, lowercase_word) pairs: tag is the condition.
    tag_word_pairs = [
        (pair[1], pair[0].lower())
        for sentence in train_data
        for pair in sentence
    ]

    tag_word_counts = ConditionalFreqDist(tag_word_pairs)

    # 0.01 is forwarded to myProbDist1 as an extra factory argument
    # (presumably its smoothing parameter -- confirm against myProbDist1).
    self.emission_PD = ConditionalProbDist(tag_word_counts, myProbDist1, 0.01)
    self.states = list(tag_word_counts.conditions())

    return self.emission_PD, self.states
def high_information_words(labeled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """
    Select the words that are informative about at least one label, so
    that low-information words can be dropped from the feature set.

    :param labeled_words: iterable of 2-tuples ``(label, words)``
        label -> a classification label (e.g. pos / neg)
        words -> the list of words that occur under that label
    :param score_fn: scoring function measuring how informative a word is,
        called as ``score_fn(n_ii, (n_ix, n_xi), n_xx)``
    :param min_score: the minimum score (inclusive) for a word to be kept
    :return: a set of high-information words
    """
    # Written as a call so the block parses on Python 3 (and still prints
    # the same text on Python 2).
    print("Counting Word Frequencies")

    word_fq = FreqDist()
    labeled_word_fq = ConditionalFreqDist()
    for label, words in labeled_words:
        for word in words:
            word_fq[word] += 1
            labeled_word_fq[label][word] += 1

    n_xx = labeled_word_fq.N()  # all counted word occurrences
    high_info_words = set()

    for label in labeled_word_fq.conditions():
        n_xi = labeled_word_fq[label].N()  # occurrences under this label
        # .items() replaces the Python-2-only .iteritems().
        for word, n_ii in labeled_word_fq[label].items():
            n_ix = word_fq[word]  # occurrences of this word overall
            if score_fn(n_ii, (n_ix, n_xi), n_xx) >= min_score:
                high_info_words.add(word)

    return high_info_words
def _setSelectedPOSTags(self):
    """Populate ``self.selective_pos`` with frequent ADJ/ADV/CNJ words from Brown.

    A previously saved result ('selective_pos.bin') is reused when
    ``self._loadData`` returns something truthy.  Otherwise the Brown
    corpus is scanned, and every lowercased word seen more than 4 times
    under a selected tag is recorded (with count 1) under that tag; the
    result is then handed to ``self._saveData``.

    NOTE(review): ``simplify_tags=True`` and ``FreqDist.inc`` look like
    NLTK 2 APIs -- confirm the NLTK version this is meant to run on.
    """
    cached = self._loadData('selective_pos.bin')
    if cached:
        self.selective_pos = cached
        return

    # Collect all (word, tag) pairs from the corpus.
    sentences = brown.tagged_sents(simplify_tags=True)
    self.selected_tags = ["ADJ", "ADV", "CNJ"]
    self.selective_pos = ConditionalFreqDist()

    counts_by_tag = ConditionalFreqDist()
    for sentence in sentences:
        for (word, tag) in sentence:
            if tag in self.selected_tags:
                counts_by_tag[tag].inc(str(word).lower())

    # Keep only the words seen more than 4 times under their tag.
    for tag in counts_by_tag.conditions():
        word_counts = counts_by_tag[tag]
        for word in word_counts.keys():
            if word_counts[word] > 4:
                self.selective_pos[tag].inc(word)

    self._saveData('selective_pos.bin', self.selective_pos)
def high_information_words(files, score_fn=BigramAssocMeasures.chi_sq, min_score=50):
    """Find high-information words overall and the top-10 words per trait.

    :param files: iterable of records where ``record[0]`` is an iterable
        of tokens and ``record[1]`` an iterable of the traits (labels)
        attached to that record.
    :param score_fn: callable invoked as
        ``score_fn(n_ii, (n_ix, n_xi), n_xx)``.
    :param min_score: inclusive lower bound for a word to enter the
        returned set.
    :return: ``(high_info_words, hiw_categories)`` where
        ``high_info_words`` is the set of words reaching ``min_score``
        under any trait and ``hiw_categories`` is a list of
        ``(trait, top_words)`` with up to 10 words ordered by descending
        score.
    """
    word_dict = FreqDist()
    ocean_word_dict = ConditionalFreqDist()
    hiw_categories = []

    for record in files:
        tokens, traits = record[0], record[1]
        for token in tokens:
            for trait in traits:
                # One count per (trait, token) pair, so the per-word
                # totals add up to the per-trait counts.
                ocean_word_dict[trait][token] += 1
                word_dict[token] += 1

    n_xx = ocean_word_dict.N()  # total counted (trait, token) pairs
    high_info_words = set()

    for condition in ocean_word_dict.conditions():
        n_xi = ocean_word_dict[condition].N()  # total counts for this trait
        word_scores = {}
        for word, n_ii in ocean_word_dict[condition].items():
            n_ix = word_dict[word]  # total counts of this token
            word_scores[word] = score_fn(n_ii, (n_ix, n_xi), n_xx)

        # Rank with a list: the previous set comprehension threw away the
        # sort order, so the "top 10" was an arbitrary selection.
        ranked = sorted(word_scores, key=word_scores.get, reverse=True)

        high_info_words.update(
            word for word in ranked if word_scores[word] >= min_score
        )
        hiw_categories.append((condition, ranked[:10]))

    return high_info_words, hiw_categories
def get_high_information_words(lwords, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """Return the words whose score under some label reaches ``min_score``.

    :param lwords: mapping ``label -> documents`` where each document is
        an iterable of words.
    :param score_fn: callable invoked as
        ``score_fn(n_ii, (n_ix, n_xi), n_xx)``.
    :param min_score: inclusive lower bound on a word's score.
    :return: set of the selected words.
    """
    overall = FreqDist()
    by_label = ConditionalFreqDist()
    for label, documents in lwords.items():
        for document in documents:
            for word in document:
                overall[word] += 1
                by_label[label][word] += 1

    total_count = by_label.N()
    chosen = set()
    for label in by_label.conditions():
        label_dist = by_label[label]
        label_count = label_dist.N()
        chosen |= {
            word
            for word, in_label in label_dist.items()
            if score_fn(in_label, (overall[word], label_count), total_count) >= min_score
        }
    return chosen
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """Collect every word scoring at least ``min_score`` under any label.

    ``score_fn`` is called as ``score_fn(n_ii, (n_ix, n_xi), n_xx)`` with:
    n_ii = occurrences of the word under the label,
    n_ix = occurrences of the word overall,
    n_xi = occurrences of all words under the label,
    n_xx = occurrences of all words under all labels.

    :param labelled_words: iterable of ``(label, words)`` pairs.
    :param score_fn: scoring callable (see above).
    :param min_score: inclusive score threshold.
    :return: set of selected words.
    """
    totals = FreqDist()
    totals_by_label = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            totals[word] += 1
            totals_by_label[label][word] += 1

    n_xx = totals_by_label.N()
    result = set()
    for label in totals_by_label.conditions():
        n_xi = totals_by_label[label].N()
        scored = [
            (word, score_fn(n_ii, (totals[word], n_xi), n_xx))
            for word, n_ii in totals_by_label[label].items()
        ]
        result.update(word for word, score in scored if score >= min_score)
    return result
def readFormatedData(formatedData):
    """Build forward and reverse bigram frequency tables from count records.

    :param formatedData: iterable of records where ``record[0]`` is a
        string ``"w1 w2"`` (words separated by a single space) and
        ``record[1]`` is the bigram count (anything ``int()`` accepts).
    :return: ``(cBigramFd1, cBigramFd2)`` where ``cBigramFd1[w1][w2]`` is
        the count of ``w2`` following ``w1`` and ``cBigramFd2[w2][w1]`` is
        the same count indexed the other way round.  Bigrams with a count
        below 5 have their second word replaced by ``"unknown"``.
    """
    cBigramFd1 = ConditionalFreqDist()
    cBigramFd2 = ConditionalFreqDist()

    # ``record`` rather than ``tuple`` so the builtin is not shadowed.
    for record in formatedData:
        words = record[0].split(' ')
        count = int(record[1])
        word2 = words[1]
        if count < 5:
            # Pool rare continuations under a single placeholder.
            word2 = "unknown"
        # ``+=`` replaces FreqDist.inc(), which NLTK 3 no longer provides.
        cBigramFd1[words[0]][word2] += count

    # Mirror the table so it can be looked up by the second word.
    for w1 in cBigramFd1.conditions():
        for w2, freq in cBigramFd1[w1].items():
            cBigramFd2[w2][w1] += freq

    return cBigramFd1, cBigramFd2
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """Return the words whose score under at least one label reaches ``min_score``.

    :param labelled_words: iterable of ``(label, words)`` pairs.
    :param score_fn: callable invoked as
        ``score_fn(n_ii, (n_ix, n_xi), n_xx)``.
    :param min_score: inclusive lower bound on a word's score.
    :return: set of selected words.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label, words in labelled_words:
        for word in words:
            # ``+= 1`` replaces FreqDist.inc(), removed in NLTK 3.
            word_fd[word] += 1
            label_word_fd[label][word] += 1

    n_xx = label_word_fd.N()
    high_info_words = set()

    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        # .items() replaces the Python-2-only .iteritems().
        for word, n_ii in label_word_fd[label].items():
            n_ix = word_fd[word]
            if score_fn(n_ii, (n_ix, n_xi), n_xx) >= min_score:
                high_info_words.add(word)

    return high_info_words
def Ae_kappa(self, cA, cB):
    """Sum, over labels, the product of the two coders' label proportions.

    For every label in ``self.data`` the number of times coder ``cA`` and
    coder ``cB`` used it is divided by ``len(self.I)`` and the two
    quotients are multiplied; the products are added up.

    :param cA: identifier of the first coder (a ``'coder'`` value).
    :param cB: identifier of the second coder.
    :return: the accumulated value as a float (0.0 when there are no labels).
    """
    item_total = float(len(self.I))
    coder_counts_by_label = ConditionalFreqDist(
        (record['labels'], record['coder']) for record in self.data
    )

    expected = 0.0
    for label in coder_counts_by_label.conditions():
        counts = coder_counts_by_label[label]
        share_a = counts[cA] / item_total
        share_b = counts[cB] / item_total
        expected += share_a * share_b
    return expected
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    """
    Fill this tagger's ``_context_to_tag`` table from training data.

    For each context ``c`` seen in the training data that the backoff
    tagger got wrong at least once (or every context when there is no
    backoff tagger), ``_context_to_tag[c]`` is set to the most frequent
    tag for that context, provided that tag was seen more than
    ``cutoff`` times.

    :param tagged_corpus: A tagged corpus.  Each item should be a list
        of (word, tag) tuples.
    :param cutoff: The most likely tag of a context must occur more than
        ``cutoff`` times for the context to enter the table.
    :param verbose: If true, print size/backoff/pruning statistics.
    """
    tokens_seen = 0
    hits_kept = 0
    needed_contexts = set()  # contexts the backoff tagger does not solve
    tag_counts = ConditionalFreqDist()  # context -> tag -> count

    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for position, (_token, gold_tag) in enumerate(sentence):
            tokens_seen += 1
            history = tags[:position]
            context = self.context(tokens, position, history)
            if context is None:
                continue
            tag_counts[context][gold_tag] += 1
            if self.backoff is None:
                needed_contexts.add(context)
            elif gold_tag != self.backoff.tag_one(tokens, position, history):
                needed_contexts.add(context)

    # Keep the best tag of each needed context if it is frequent enough.
    for context in needed_contexts:
        counts = tag_counts[context]
        best_tag = counts.max()
        hits = counts[best_tag]
        if hits > cutoff:
            self._context_to_tag[context] = best_tag
            hits_kept += hits

    if verbose:
        size = len(self._context_to_tag)
        backoff = 100 - (hits_kept * 100.0) / tokens_seen
        pruning = 100 - (size * 100.0) / len(tag_counts.conditions())
        print("[Trained Unigram tagger:", end=' ')
        print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
            size, backoff, pruning))
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    """
    Populate ``self._context_to_tag`` from a tagged corpus.

    Tag frequencies are counted per context.  A context is kept only if
    the backoff tagger (when present) mis-tagged it at least once, and
    only if its most frequent tag occurred more than ``cutoff`` times;
    the table then maps that context to that tag.

    :param tagged_corpus: A tagged corpus.  Each item should be a list
        of (word, tag) tuples.
    :param cutoff: Threshold the top tag's count must exceed.
    :param verbose: When true, print a one-line training summary.
    """
    seen_tokens = kept_hits = 0
    informative = set()
    counts_by_context = ConditionalFreqDist()

    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for i, (_word, tag) in enumerate(sentence):
            seen_tokens += 1
            ctx = self.context(tokens, i, tags[:i])
            if ctx is not None:
                counts_by_context[ctx][tag] += 1
                # Without a backoff tagger every context is informative;
                # otherwise only those the backoff tagger gets wrong.
                backoff_failed = (
                    self.backoff is None
                    or tag != self.backoff.tag_one(tokens, i, tags[:i])
                )
                if backoff_failed:
                    informative.add(ctx)

    for ctx in informative:
        top_tag = counts_by_context[ctx].max()
        top_count = counts_by_context[ctx][top_tag]
        if top_count <= cutoff:
            continue
        self._context_to_tag[ctx] = top_tag
        kept_hits += top_count

    if not verbose:
        return

    size = len(self._context_to_tag)
    backoff = 100 - (kept_hits * 100.0) / seen_tokens
    pruning = 100 - (size * 100.0) / len(counts_by_context.conditions())
    print("[Trained Unigram tagger:", end=" ")
    print("size={}, backoff={:.2f}%, pruning={:.2f}%]".format(
        size, backoff, pruning))
def _dump_cfdist(cfdist: ConditionalFreqDist) -> dict: data: dict = {} for cond in cfdist.conditions(): for k, v in cfdist[cond].items(): if cond not in data: data[cond] = {} if k not in data[cond]: data[cond][k] = 0 data[cond][k] += v return data
class hmm:
    """Frequency queries over the tagged words of the NLTK Brown corpus.

    ``name`` is a (lowercase) word to look up with :meth:`findName` and
    ``tag`` a tag prefix to look up with :meth:`findTags`; ``0`` means
    "not set" for either.
    """

    def __init__(self, name=0, tag=0):
        self.name = name
        self.tag = tag
        self.wsj = nltk.corpus.brown.tagged_words()
        self.sentences = nltk.corpus.brown.sents()

    def findTags(self, mostCommon=5):
        """Return ``{tag: most common words}`` for tags starting with ``self.tag``.

        Prints "invalid method" and returns None when no tag was set.
        """
        if self.tag == 0:
            print("invalid method")
            return None
        self.tag_prefix = self.tag
        self.cfdTag = ConditionalFreqDist(
            (tag, word) for (word, tag) in self.wsj
            if tag.startswith(self.tag_prefix))
        return dict((tag, self.cfdTag[tag].most_common(mostCommon))
                    for tag in self.cfdTag.conditions())

    def findAllTags(self, mostCommon=5):
        """Print every tag with all of its words by frequency; return the table as a dict.

        NOTE(review): ``mostCommon`` is accepted but not used -- the full
        ``most_common()`` list is printed.  Confirm whether that is intended.
        """
        self.cfdTagAll = ConditionalFreqDist(
            (tag, word) for (word, tag) in self.wsj)
        for tag in sorted(self.cfdTagAll):
            print(tag, self.cfdTagAll[tag].most_common())
        return dict(self.cfdTagAll)

    def findBigrams(self):
        """Compute, store and return the tag bigrams of the corpus as a list."""
        # Materialised so the result can be iterated more than once: a
        # one-shot generator would be empty by the time biFrekvens() reads
        # it if the caller had already consumed the return value.
        self.bigram = list(bigrams([tag for word, tag in self.wsj]))
        return self.bigram

    def biFrekvens(self, mostCommon=5):
        """Return ``{tag: most common following tags}`` from the tag bigrams."""
        if not hasattr(self, 'bigram'):
            # Allow calling this without a prior findBigrams().
            self.findBigrams()
        self.cfdBigram = ConditionalFreqDist(self.bigram)
        return dict((tag, self.cfdBigram[tag].most_common(mostCommon))
                    for tag in self.cfdBigram)

    def findName(self, mostCommon=5):
        """Return ``[self.name, most common tags of that word]``.

        Words are lowercased before counting.  Prints "invalid method" and
        returns None when no name was set.
        """
        if self.name == 0:
            print("invalid method")
            return None
        self.cfdName = ConditionalFreqDist(
            (word.lower(), tag) for (word, tag) in self.wsj)
        return [self.name, self.cfdName[self.name].most_common(mostCommon)]

    def findCPD(self, typecfd=None):
        """Return an MLE conditional probability distribution.

        ``typecfd=None`` uses the table built by :meth:`findTags`;
        ``typecfd="bi"`` uses the one built by :meth:`biFrekvens`.  Any
        other value prints "invalid method" and returns None.
        """
        if typecfd is None:
            self.cpdTag = nltk.ConditionalProbDist(self.cfdTag, nltk.MLEProbDist)
            return self.cpdTag
        elif typecfd == "bi":
            return ConditionalProbDist(self.cfdBigram, nltk.MLEProbDist)
        else:
            print("invalid method")
            return None
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    """
    Fill ``self._context_to_tag`` using an entropy filter on contexts.

    Tag frequencies are counted per context.  Contexts that the backoff
    tagger mis-tagged at least once (or all contexts when there is no
    backoff tagger) are candidates.  For each candidate, ``self.H`` is
    called with the tags seen in that context and the corpus-wide tag
    probabilities; candidates whose value exceeds ``cutoff`` are dropped.
    Each remaining context is mapped to its most frequent tag.

    :param tagged_corpus: A tagged corpus.  Each item should be a list
        of (word, tag) tuples.
    :param cutoff: Largest ``self.H`` value a context may have to be kept.
    :param verbose: If true, print size/backoff/pruning statistics.
    """
    token_count = hit_count = 0
    useful_contexts = set()
    fd = ConditionalFreqDist()
    tag_prob = FreqDist()

    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event.  ``+= 1`` replaces FreqDist.inc(), which
            # NLTK 3 no longer provides.
            token_count += 1
            tag_prob[tag] += 1
            context = self.context(tokens, index, tags[:index])
            if context is None:
                continue
            fd[context][tag] += 1
            # If the backoff got it wrong, this context is useful:
            if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                useful_contexts.add(context)

    # Corpus-wide tag probabilities handed to self.H for every context.
    total_tags = float(sum(tag_prob.values()))
    tags_probs = [(t, tag_prob[t] / total_tags) for t in tag_prob.keys()]

    # Keep only the contexts whose value from self.H is at most `cutoff`.
    useful_contexts_after_filter = set()
    most_high = FreqDist()
    for context in useful_contexts:
        h = self.H(list(fd[context].keys()), tags_probs)
        if h > cutoff:
            continue
        useful_contexts_after_filter.add(context)
        most_high[context] = h
    # Debug output kept from the original; printed as a list, as before.
    print(list(most_high.keys()))

    # Build the context_to_tag table -- for each surviving context,
    # figure out what the most likely tag is.
    for context in useful_contexts_after_filter:
        best_tag = fd[context].max()
        hits = fd[context][best_tag]
        self._context_to_tag[context] = best_tag
        hit_count += hits

    # Display some stats, if requested.
    if verbose:
        size = len(self._context_to_tag)
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print("[Trained Unigram tagger:", end=' ')
        print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (size, backoff, pruning))
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=0):
    """Select high-scoring words per label and dump each label's scores to CSV.

    Every sentence is turned into words with ``preProcess``.  Labels are
    processed in sorted order; for the label at position ``k`` its
    (word, score) pairs, sorted by descending score, are written to
    ``wang2226_<k>.csv`` in the current directory.

    :param labelled_words: iterable of ``(label, sentences)`` pairs.
    :param score_fn: callable invoked as
        ``score_fn(n_ii, (n_ix, n_xi), n_xx)``.
    :param min_score: inclusive lower bound for a word to be returned.
    :return: set of words reaching ``min_score`` under any label.
    """
    word_totals = FreqDist()
    label_word_totals = ConditionalFreqDist()
    for label, sentences in labelled_words:
        for sentence in sentences:
            for word in preProcess(sentence):
                word_totals[word] += 1
                label_word_totals[label][word] += 1

    grand_total = label_word_totals.N()
    selected = set()
    scores_per_label = []

    for label in sorted(label_word_totals.conditions()):
        label_total = label_word_totals[label].N()
        scores = {
            word: score_fn(count, (word_totals[word], label_total), grand_total)
            for word, count in label_word_totals[label].items()
        }
        selected.update(word for word, score in scores.items() if score >= min_score)
        scores_per_label.append(scores)

    # One CSV per label, numbered by the label's position in sorted order.
    for position, scores in enumerate(scores_per_label):
        ranked = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
        frame = pd.DataFrame(ranked)
        frame.to_csv("wang2226_%d.csv" % position, index=False, sep=',')

    return selected
def validate_pcfg_generate(grammar, n_samples=1000):
    """Compare sampled production frequencies against the grammar's probabilities.

    Generates ``n_samples`` trees with ``pcfg_generate``, counts how often
    each right-hand side is used per left-hand side, and prints the value
    returned by ``KL_Divergence`` between the observed (MLE) distribution
    and the entry ``makeLhrProbDict(grammar)`` holds for that left-hand
    side.

    :param grammar: grammar passed to ``makeLhrProbDict`` and
        ``pcfg_generate``.
    :param n_samples: number of trees to generate (default 1000, the
        value that used to be hard-coded).
    """
    pd = makeLhrProbDict(grammar)

    cfd = ConditionalFreqDist()
    for _ in range(n_samples):
        tree = pcfg_generate(grammar)
        for p in tree.productions():
            # ``+= 1`` replaces FreqDist.inc(), removed in NLTK 3.
            cfd[p.lhs()][p.rhs()] += 1

    for c in cfd.conditions():
        p = MLEProbDist(cfd[c])
        q = pd[c]
        div = KL_Divergence(p, q)
        print("KL_Divergence for %s = %f" % (c, div))
def validate_pcfg_generate(grammar):
    """Print, per left-hand side, how far 1000 sampled trees are from the grammar.

    The productions of 1000 trees drawn with ``pcfg_generate`` are
    counted per left-hand side; for each left-hand side the value of
    ``KL_Divergence(observed MLE distribution, makeLhrProbDict(grammar)[lhs])``
    is printed.

    :param grammar: grammar passed to ``makeLhrProbDict`` and
        ``pcfg_generate``.
    """
    pd = makeLhrProbDict(grammar)

    productions = []
    for _ in range(1000):
        productions.extend(pcfg_generate(grammar).productions())

    cfd = ConditionalFreqDist()
    for production in productions:
        # FreqDist.inc() does not exist in NLTK 3; increment directly.
        cfd[production.lhs()][production.rhs()] += 1

    for c in cfd.conditions():
        observed = MLEProbDist(cfd[c])
        expected = pd[c]
        div = KL_Divergence(observed, expected)
        # print() call instead of the Python 2 print statement.
        print("KL_Divergence for %s = %f" % (c, div))
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    """
    Fill ``self._contexts_to_tags`` with every sufficiently frequent tag
    of each useful context.

    Tag frequencies are counted per context.  A context is useful if the
    backoff tagger mis-tagged it at least once (or if there is no backoff
    tagger).  For each useful context, every tag seen more than
    ``cutoff`` times is stored as
    ``self._contexts_to_tags[context][tag] = count``.

    :param tagged_corpus: A tagged corpus.  Each item should be a list
        of (word, tag) tuples; a tag may be a comma-separated list.
    :param cutoff: Count a tag must exceed to be stored.
    :param verbose: If true, print size/backoff/pruning statistics.
    """
    token_count = hit_count = 0

    # A context is considered 'useful' if it's not already tagged
    # perfectly by the backoff tagger.
    useful_contexts = set()

    # Count how many times each tag occurs in each context.
    fd = ConditionalFreqDist()

    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        # Put the parts of comma-separated tags in a canonical order for
        # the history passed to context() / tag_one().
        # NOTE(review): the ``tag`` counted below is the raw one from
        # ``sentence``, not this normalised form -- confirm that is intended.
        tags = [",".join(sorted(x.split(","))) for x in tags]
        for index, (token, tag) in enumerate(sentence):
            # Record the event.
            token_count += 1
            context = self.context(tokens, index, tags[:index])
            if context is None:
                continue
            fd[context][tag] += 1
            # If the backoff got it wrong, this context is useful:
            if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                useful_contexts.add(context)

    # Store every tag of a useful context seen more than `cutoff` times.
    for context in useful_contexts:
        for (tag, hits) in fd[context].items():
            if hits > cutoff:
                self._contexts_to_tags.setdefault(context, {})[tag] = hits
                hit_count += hits

    # Display some stats, if requested.
    if verbose:
        # Measure the table this method fills (_contexts_to_tags), not
        # the single-tag table _context_to_tag, which it never touches.
        size = len(self._contexts_to_tags)
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        print("[Trained Unigram tagger:")
        print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
            size, backoff, pruning))
def _train(self, tagged_corpus, cutoff=0, verbose=False):
    """
    Fill ``self._contexts_to_tags`` with every sufficiently frequent tag
    of each useful context.

    Tag frequencies are counted per context.  A context is useful if the
    backoff tagger mis-tagged it at least once (or if there is no backoff
    tagger).  For each useful context, every tag seen more than
    ``cutoff`` times is stored as
    ``self._contexts_to_tags[context][tag] = count``.

    :param tagged_corpus: A tagged corpus.  Each item should be a list
        of (word, tag) tuples.
    :param cutoff: Count a tag must exceed to be stored.
    :param verbose: If true, print size/backoff/pruning statistics.
    """
    token_count = hit_count = 0

    # A context is considered 'useful' if it's not already tagged
    # perfectly by the backoff tagger.
    useful_contexts = set()

    # Count how many times each tag occurs in each context.
    fd = ConditionalFreqDist()

    for sentence in tagged_corpus:
        tokens, tags = zip(*sentence)
        for index, (token, tag) in enumerate(sentence):
            # Record the event.
            token_count += 1
            context = self.context(tokens, index, tags[:index])
            if context is None:
                continue
            fd[context][tag] += 1
            # If the backoff got it wrong, this context is useful:
            if (self.backoff is None or
                    tag != self.backoff.tag_one(tokens, index, tags[:index])):
                useful_contexts.add(context)

    # Store every tag of a useful context seen more than `cutoff` times.
    for context in useful_contexts:
        tags_for_context = None
        for (tag, hits) in fd[context].items():
            if hits > cutoff:
                if tags_for_context is None:
                    tags_for_context = self._contexts_to_tags.setdefault(context, {})
                tags_for_context[tag] = hits
                hit_count += hits

    # Display some stats, if requested.
    if verbose:
        # Measure the table this method fills (_contexts_to_tags), not
        # the single-tag table _context_to_tag, which it never touches.
        size = len(self._contexts_to_tags)
        backoff = 100 - (hit_count * 100.0) / token_count
        pruning = 100 - (size * 100.0) / len(fd.conditions())
        # print() calls replace the Python 2 print statements; end=' '
        # reproduces the trailing-comma form (both parts on one line).
        print("[Trained Unigram tagger:", end=' ')
        print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % (
            size, backoff, pruning))
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.raw_freq, min_score=5):
    """Return the words whose score under at least one label reaches ``min_score``.

    NOTE(review): the default ``score_fn`` is ``raw_freq``.  If, as the
    NLTK documentation suggests, it returns a relative frequency (at most
    1), the default ``min_score=5`` can never be reached and the result is
    always empty -- confirm the intended defaults with the callers.

    :param labelled_words: iterable of ``(label, words)`` pairs.
    :param score_fn: callable invoked as
        ``score_fn(n_ii, (n_ix, n_xi), n_xx)``.
    :param min_score: inclusive lower bound on a word's score.
    :return: set of selected words.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label, words in labelled_words:
        for word in words:
            # Direct increments replace FreqDist.inc() (gone in NLTK 3).
            word_fd[word] += 1
            label_word_fd[label][word] += 1

    n_xx = label_word_fd.N()
    high_info_words = set()

    for label in label_word_fd.conditions():
        n_xi = label_word_fd[label].N()
        # .items() replaces the Python-2-only .iteritems().
        bestwords = [
            word
            for word, n_ii in label_word_fd[label].items()
            if score_fn(n_ii, (word_fd[word], n_xi), n_xx) >= min_score
        ]
        high_info_words.update(bestwords)

    return high_info_words
def sum_category_word_scores(categorized_words, score_fn):
    """Sum each word's score over all categories.

    :param categorized_words: iterable of ``(category, words)`` pairs.
    :param score_fn: callable invoked as
        ``score_fn(n_ii, (n_ix, n_xi), n_xx)`` with the word's count in
        the category, its overall count, the category's total count and
        the grand total.
    :return: ``collections.defaultdict(int)`` mapping word -> summed score.
    """
    word_fd = FreqDist()
    category_word_fd = ConditionalFreqDist()

    for category, words in categorized_words:
        for word in words:
            # ``+= 1`` replaces FreqDist.inc(), removed in NLTK 3.
            word_fd[word] += 1
            category_word_fd[category][word] += 1

    scores = collections.defaultdict(int)
    n_xx = category_word_fd.N()

    for category in category_word_fd.conditions():
        n_xi = category_word_fd[category].N()
        for word, n_ii in category_word_fd[category].items():
            n_ix = word_fd[word]
            scores[word] += score_fn(n_ii, (n_ix, n_xi), n_xx)

    return scores
def sum_category_word_scores(categorized_words, score_fn):
    """Accumulate, per word, the score it obtains in every category.

    :param categorized_words: iterable of ``(category, words)`` pairs.
    :param score_fn: callable invoked as
        ``score_fn(n_ii, (n_ix, n_xi), n_xx)``.
    :return: ``collections.defaultdict(int)`` mapping word -> sum of its
        per-category scores.
    """
    word_fd = FreqDist()
    category_word_fd = ConditionalFreqDist()
    for category, words in categorized_words:
        for word in words:
            # FreqDist.inc() is not available in NLTK 3; increment directly.
            word_fd[word] += 1
            category_word_fd[category][word] += 1

    n_xx = category_word_fd.N()
    scores = collections.defaultdict(int)
    for category in category_word_fd.conditions():
        category_fd = category_word_fd[category]
        n_xi = category_fd.N()
        # .items() replaces the Python-2-only .iteritems().
        for word, n_ii in category_fd.items():
            scores[word] += score_fn(n_ii, (word_fd[word], n_xi), n_xx)
    return scores
def significantWords(untagged_docs, min_chisq=5, ratio=0.75):
    """
    Use a chi-square test on the 2x2 word/label contingency table to
    measure the association of a token with its sentiment.

    Only alphabetic tokens that are not English stop words are counted.

    Parameters
    ----------
    untagged_docs: list of tuples (words, tag)
    min_chisq: a word's chi-square must exceed this to be significant
    ratio: share of a word's occurrences that must fall under a label for
        the word to be assigned to that label

    Returns
    -------
    significant_words: defaultdict(set) with key 'total' (all significant
        words) plus one key per label that received at least one word
    """
    significant_words = collections.defaultdict(set)
    freq_dist = FreqDist()
    label_freq_dist = ConditionalFreqDist()
    stopping_words = set(nltk.corpus.stopwords.words('english'))

    for tokens, label in untagged_docs:
        for token in tokens:
            if token.isalpha() and token not in stopping_words:
                # ``+= 1`` replaces FreqDist.inc(), removed in NLTK 3.
                freq_dist[token] += 1
                label_freq_dist[label][token] += 1

    n_xx = label_freq_dist.N()

    for label in label_freq_dist.conditions():
        # Constant for the whole label, so computed once per label.
        n_xi = label_freq_dist[label].N()
        for word, n_ii in label_freq_dist[label].items():
            n_ix = freq_dist[word]
            n_oi = n_xi - n_ii  # other words under this label
            n_io = n_ix - n_ii  # this word under other labels
            n_oo = n_xx - n_oi - n_io - n_ii  # other words, other labels

            denominator = ((n_ii + n_io) * (n_ii + n_oi)
                           * (n_oo + n_io) * (n_oo + n_oi))
            if denominator == 0:
                # Degenerate table (e.g. a single label or a single
                # distinct word): chi-square is undefined, so skip the
                # word rather than raise ZeroDivisionError.
                continue
            chisq = float(n_xx * (n_ii * n_oo - n_io * n_oi) ** 2) / denominator

            if chisq > min_chisq and n_ii > 10:
                significant_words['total'].add(word)
                if float(n_ii) / n_ix > ratio and (n_ix - n_ii) > 1:
                    significant_words[label].add(word)

    return significant_words
def sum_category_word_scores(categorized_words, score_fn):
    """Return, for every word, the sum of its scores across categories.

    :param categorized_words: iterable of ``(category, words)`` pairs.
    :param score_fn: callable invoked as
        ``score_fn(n_ii, (n_ix, n_xi), n_xx)``.
    :return: ``collections.defaultdict(int)`` mapping word -> summed score.
    """
    # Overall word frequencies.
    word_fd = FreqDist()
    # Word frequencies per category.
    category_word_fd = ConditionalFreqDist()

    for category, words in categorized_words:
        for word in words:
            # Plain increments work on NLTK 3, where FreqDist.inc() is gone.
            word_fd[word] += 1
            category_word_fd[category][word] += 1

    scores = collections.defaultdict(int)
    n_xx = category_word_fd.N()  # grand total of counted words

    for category in category_word_fd.conditions():
        n_xi = category_word_fd[category].N()  # total for this category
        # .items() replaces the Python-2-only .iteritems().
        for word, n_ii in category_word_fd[category].items():
            n_ix = word_fd[word]  # total for this word
            scores[word] += score_fn(n_ii, (n_ix, n_xi), n_xx)

    return scores
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """Return the set of words scoring at least ``min_score`` under any label.

    Arguments passed to ``score_fn(n_ii, (n_ix, n_xi), n_xx)``:
    n_ii : frequency of the word under the label
    n_ix : total frequency of the word across all labels
    n_xi : total frequency of all words under the label
    n_xx : total frequency of all words under all labels

    :param labelled_words: iterable of ``(label, words)`` pairs.
    :param score_fn: scoring callable (see above).
    :param min_score: inclusive score threshold.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            word_fd[word] += 1
            label_word_fd[label][word] += 1

    n_xx = label_word_fd.N()

    def _qualifying(label):
        """Words of ``label`` whose score reaches ``min_score``."""
        n_xi = label_word_fd[label].N()
        kept = []
        for word, n_ii in label_word_fd[label].items():
            if score_fn(n_ii, (word_fd[word], n_xi), n_xx) >= min_score:
                kept.append(word)
        return kept

    high_info_words = set()
    for label in label_word_fd.conditions():
        high_info_words.update(_qualifying(label))
    return high_info_words
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """Gather the words that score at least ``min_score`` for some label.

    :param labelled_words: iterable of ``(label, words)`` pairs.
    :param score_fn: callable invoked as
        ``score_fn(n_ii, (n_ix, n_xi), n_xx)`` with the word's count under
        the label, its overall count, the label's total count and the
        grand total.
    :param min_score: inclusive score threshold.
    :return: set of selected words.
    """
    counts = FreqDist()
    counts_by_label = ConditionalFreqDist()
    for label, words in labelled_words:
        for word in words:
            counts[word] += 1
            counts_by_label[label][word] += 1

    everything = counts_by_label.N()
    winners = set()
    for label in counts_by_label.conditions():
        under_label = counts_by_label[label].N()
        for word, hits in counts_by_label[label].items():
            marginals = (counts[word], under_label)
            if score_fn(hits, marginals, everything) < min_score:
                continue
            winners.add(word)
    return winners
cfdist = ConditionalFreqDist() # Indexador que representa a condicao da distribuicao cfdist['a'] # Armazena o tamanho das palavras que comeca com a letra a for token in corpus['SUBTOKENS']: if token['TEXT'][0] in ('A','a'): print token['TEXT'][0] cfdist['a'].inc(len(token['TEXT'])) # das palavras que comecam com 'a', quantas possuem 3 caracteres? cfdist['a'].freq(3) # lista as condicoes existentes cfdist.conditions() # In this example, we use a ConditionalFreqDist to examine # how the distribution of a word's length is affected by the word's # initial letter. from nltk.token import * from nltk.tokenizer import WhitespaceTokenizer from nltk.probability import ConditionalFreqDist from nltk.draw.plot import Plot corpus = Token(TEXT=open('dados/may2001_pdf.torto').read()) WhitespaceTokenizer().tokenize(corpus) cfdist = ConditionalFreqDist() #How does initial letter affect word length? for token in corpus['SUBTOKENS']:
def __init__(self, n, train, pad_left=True, pad_right=False,
             estimator=None, *estimator_args, **estimator_kwargs):
    """
    Create an ngram language model to capture patterns in n consecutive
    words of training text.  An estimator smooths the probabilities derived
    from the text and may allow generation of ngrams not seen during
    training.

        >>> from nltk.corpus import brown
        >>> from nltk.probability import LidstoneProbDist
        >>> est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        >>> lm = NgramModel(3, brown.words(categories='news'), estimator=est)
        >>> lm
        <NgramModel with 91603 3-grams>
        >>> lm._backoff
        <NgramModel with 62888 2-grams>
        >>> lm.entropy(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said',
        ... 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent',
        ... 'primary', 'election', 'produced', '``', 'no', 'evidence',
        ... "''", 'that', 'any', 'irregularities', 'took', 'place', '.'])
        ... # doctest: +ELLIPSIS
        0.5776...

    :param n: the order of the language model (ngram size)
    :type n: int
    :param train: the training text
    :type train: list(str) or list(list(str))
    :param pad_left: whether to pad the left of each sentence with an
        (n-1)-gram of empty strings
    :type pad_left: bool
    :param pad_right: whether to pad the right of each sentence with an
        (n-1)-gram of empty strings
    :type pad_right: bool
    :param estimator: a function for generating a probability distribution
    :type estimator: a function that takes a ConditionalFreqDist and
        returns a ConditionalProbDist
    :param estimator_args: Extra arguments for estimator.
        These arguments are usually used to specify extra
        properties for the probability distributions of individual
        conditions, such as the number of bins they contain.
        Note: For backward-compatibility, if no arguments are specified,
        the number of bins in the underlying ConditionalFreqDist are
        passed to the estimator as an argument.
    :type estimator_args: (any)
    :param estimator_kwargs: Extra keyword arguments for the estimator
    :type estimator_kwargs: (any)
    """
    # protection from cryptic behavior for calling programs
    # that use the pre-2.0.2 interface
    assert (isinstance(pad_left, bool))
    assert (isinstance(pad_right, bool))

    # make sure n is greater than zero, otherwise print it
    assert (n > 0), n

    # For explicitness save the check whether this is a unigram model
    self.is_unigram_model = (n == 1)
    # save the ngram order number
    self._n = n
    # save left and right padding
    self._lpad = ('', ) * (n - 1) if pad_left else ()
    self._rpad = ('', ) * (n - 1) if pad_right else ()

    if estimator is None:
        estimator = _estimator

    cfd = ConditionalFreqDist()

    # set read-only ngrams set (see property declaration below to reconfigure)
    self._ngrams = set()

    # If given a list of strings instead of a list of lists, create
    # enclosing list.  The length check keeps an empty training text from
    # raising IndexError on train[0].
    if (train is not None and len(train) > 0
            and isinstance(train[0], compat.string_types)):
        train = [train]

    for sent in train:
        raw_ngrams = ngrams(sent, n, pad_left, pad_right)
        for ngram in raw_ngrams:
            self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            # Count the token under its context.  The earlier
            # ``dict(cfd)[(context, token)] = ...`` wrote into a temporary
            # copy, so ``cfd`` itself was never populated.
            cfd[context][token] += 1

    self._probdist = estimator(cfd, *estimator_args, **estimator_kwargs)

    # recursively construct the lower-order models
    if not self.is_unigram_model:
        self._backoff = NgramModel(n - 1, train, pad_left, pad_right,
                                   estimator, *estimator_args,
                                   **estimator_kwargs)

        self._backoff_alphas = dict()
        # For each condition (or context)
        for ctxt in cfd.conditions():
            backoff_ctxt = ctxt[1:]
            backoff_total_pr = 0.0
            total_observed_pr = 0.0

            # this is the subset of words that we OBSERVED following
            # this context.
            # i.e. Count(word | context) > 0
            for word in self._words_following(ctxt, cfd):
                total_observed_pr += self.prob(word, ctxt)
                # we also need the total (n-1)-gram probability of
                # words observed in this n-gram context
                backoff_total_pr += self._backoff.prob(word, backoff_ctxt)

            assert (0 <= total_observed_pr <= 1), total_observed_pr
            # beta is the remaining probability weight after we factor out
            # the probability of observed words.
            # As a sanity check, both total_observed_pr and backoff_total_pr
            # must be GE 0, since probabilities are never negative
            beta = 1.0 - total_observed_pr

            # backoff total has to be less than one, otherwise we get
            # an error when we try subtracting it from 1 in the denominator
            assert (0 <= backoff_total_pr < 1), backoff_total_pr
            alpha_ctxt = beta / (1.0 - backoff_total_pr)

            self._backoff_alphas[ctxt] = alpha_ctxt
class MorphProbModel():
    """Trigram tag model with beam search and a guesser for rare words.

    Tag transitions are scored by linearly interpolating unigram, bigram and
    trigram relative frequencies over ``(tag, capitalized)`` pairs; the
    interpolation weights are estimated in ``_compute_lambda``.  Words seen
    in training are scored from the word/tag counts, all other words are
    handed to ``self._unk`` (a ``Guesser``) and, when one has been set, to an
    external analyzer.
    """

    # log2 score used when the interpolated transition probability is zero
    UNK_PROB = -99

    def __init__(self, beam=1000, max_guess=20, rare_treshold=10,
                 capitalization=True):
        """
        :param beam: maximum number of partial tag sequences kept per word
        :param max_guess: maximum number of guesser tags considered for a
            word that was not seen in training
        :param rare_treshold: word/tag pairs seen fewer than this many times
            are also fed to the guesser
        :param capitalization: whether the "starts with a capital" flag is
            recorded alongside each tag
        """
        self._uni = FreqDist()
        self._bi = ConditionalFreqDist()
        self._tri = ConditionalFreqDist()
        self._wd = ConditionalFreqDist()
        # interpolation weights for unigram / bigram / trigram estimates
        self._l1 = 0.0
        self._l2 = 0.0
        self._l3 = 0.0
        self._beam_size = beam
        self._use_capitalization = capitalization
        self._max_guess = max_guess
        self._treshold = rare_treshold
        # NOTE(review): the meaning of the argument is defined by Guesser,
        # which is not visible here.
        self._unk = Guesser(10)
        self._analyzer = None
        # memo of beam states, keyed by word + str(incoming states); it is
        # never cleared, so it grows across calls to tag()
        self.cache = {}

    def set_analyzer(self, obj):
        """Install the external analyzer consulted for unseen words."""
        self._analyzer = obj

    def train(self, data):
        """Collect n-gram and word/tag counts, then fit guesser and weights.

        :param data: iterable of sentences; each sentence is an iterable of
            ``(w, l, t)`` triples of which only the word ``w`` and the tag
            ``t`` are used
        """
        C = False
        for sent in data:
            history = [('BOS', False), ('BOS', False)]
            for w, l, t in sent:
                # Encoding the triple was removed here because keeping it
                # consumed about 4 GB of memory.
                # t = encode((w, l, t))
                if self._use_capitalization and w[0].isupper():
                    C = True
                self._wd[w].inc(t)
                self._uni.inc((t, C))
                self._bi[history[1]].inc((t, C))
                self._tri[tuple(history)].inc((t, C))
                history.append((t, C))
                history.pop(0)
                # reset the flag for the next word
                C = False
        for word, fd in self._wd.iteritems():
            for tag, count in fd.iteritems():
                if count < self._treshold:
                    self._unk.add_word(word.lower(), tag, count)
        self._unk.finalize()
        self._compute_lambda()

    def _compute_lambda(self):
        """Estimate the three interpolation weights from the trigram counts.

        For every observed trigram the unigram, bigram and trigram relative
        frequencies are recomputed with one occurrence removed; the trigram's
        count is credited to whichever estimate is largest (split evenly on
        the two handled ties).  The weights are the normalised credits.
        """
        tl1 = 0.0
        tl2 = 0.0
        tl3 = 0.0
        for history in self._tri.conditions():
            (h1, h2) = history
            for tag in self._tri[history].samples():
                # a tag seen only once carries no usable evidence
                if self._uni[tag] == 1:
                    continue
                c3 = self._safe_div((self._tri[history][tag] - 1),
                                    (self._tri[history].N() - 1))
                c2 = self._safe_div((self._bi[h2][tag] - 1),
                                    (self._bi[h2].N() - 1))
                c1 = self._safe_div((self._uni[tag] - 1),
                                    (self._uni.N() - 1))
                if (c1 > c3) and (c1 > c2):
                    tl1 += self._tri[history][tag]
                elif (c2 > c3) and (c2 > c1):
                    tl2 += self._tri[history][tag]
                elif (c3 > c2) and (c3 > c1):
                    tl3 += self._tri[history][tag]
                elif (c3 == c2) and (c3 > c1):
                    tl2 += float(self._tri[history][tag]) / 2.0
                    tl3 += float(self._tri[history][tag]) / 2.0
                elif (c2 == c1) and (c1 > c3):
                    tl1 += float(self._tri[history][tag]) / 2.0
                    tl2 += float(self._tri[history][tag]) / 2.0
                else:
                    pass
        total = tl1 + tl2 + tl3
        if total == 0:
            # Nothing was credited (empty training data, or only singleton
            # tags): there is nothing to normalise, so zero the weights
            # instead of dividing by zero.
            self._l1 = self._l2 = self._l3 = 0.0
            return
        self._l1 = tl1 / total
        self._l2 = tl2 / total
        self._l3 = tl3 / total

    def _safe_div(self, v1, v2):
        """Return ``v1 / v2`` as a float, or -1 when ``v2`` is zero."""
        if v2 == 0:
            return -1
        else:
            return float(v1) / float(v2)

    def _transition_prob(self, t, C, history):
        """Return the log2 interpolated probability of ``(t, C)`` after
        ``history``, or ``UNK_PROB`` when that probability is zero."""
        p_uni = self._uni.freq((t, C))
        p_bi = self._bi[history[-1]].freq((t, C))
        p_tri = self._tri[tuple(history[-2:])].freq((t, C))
        p = self._l1 * p_uni + self._l2 * p_bi + self._l3 * p_tri
        if p == 0.0:
            return self.UNK_PROB
        return log(p, 2)

    def _known_lexical_prob(self, word, t, C):
        """Return log2 of count(word, t) / count((t, C)) for a seen word."""
        p = float(self._wd[word][t]) / float(self._uni[(t, C)])
        return log(p, 2)

    def _analyze(self, word):
        """Return the set of candidate tags for ``word``.

        Seen words get the tags they were seen with.  Other words get the
        top ``max_guess`` guesser tags, restricted to the analyzer's tags
        when that intersection is not empty.
        """
        tag_candidates = []
        if word in self._wd.conditions():
            tag_candidates = set(self._wd[word].samples())
        else:
            # The analyzer is optional (it defaults to None); without one
            # the guesser's tags are used unfiltered.
            if self._analyzer is not None:
                analyses = map(itemgetter(1), self._analyzer.analyze(word))
            else:
                analyses = []
            guesses = self._unk.get_probs(word.lower())
            guesses = map(itemgetter(0),
                          sorted(guesses.iteritems(), reverse=True,
                                 key=itemgetter(1))[:self._max_guess])
            tag_candidates = set(guesses)
            if analyses:
                tag_candidates &= set(analyses)
            if not tag_candidates:
                tag_candidates = set(guesses)
        return tag_candidates

    def _lexical_prob(self, word, t, C):
        """Return the lexical score of ``word`` with tag ``t``."""
        if word in self._wd.conditions():
            return self._known_lexical_prob(word, t, C)
        else:
            # NOTE(review): the word is passed as-is here but lower-cased
            # for add_word()/get_probs(); confirm Guesser.get_prob expects
            # the original casing.
            return self._unk.get_prob(word, t)

    def tag(self, sent, n=5):
        """Return up to ``n`` best ``(tags, log2 score)`` pairs for ``sent``.

        :param sent: list of words
        :param n: number of best tag sequences to return
        """
        # Use the same ('BOS', False) padding as train(); bare 'BOS' strings
        # never match a trained bigram/trigram context, which discarded all
        # sentence-initial statistics.
        current_state = [([('BOS', False), ('BOS', False)], 0.0)]
        out = self._tagword(sent, current_state, n)
        return out

    def _tagword(self, sent, current_states, n=5):
        """Extend every state in the beam by the first word of ``sent`` and
        recurse on the remaining words."""
        # The cache makes this fast enough; not worth optimising further.
        if sent == []:
            # drop the two BOS entries and keep only the tag of each pair
            return [(map(itemgetter(0), tag_seq[0][2:]), tag_seq[1])
                    for tag_seq in current_states[:n]]
        word = sent[0]
        sent = sent[1:]
        new_states = []
        # Cache lookup
        sent_str = word + str(current_states)
        if sent_str in self.cache:
            return self._tagword(sent, self.cache[sent_str], n)
        C = False
        if self._use_capitalization and word[0].isupper():
            C = True
        analyses = self._analyze(word)
        for (history, curr_sent_logprob) in current_states:
            logprobs = []
            for t in analyses:
                p_t = self._transition_prob(t, C, history)
                p_l = self._lexical_prob(word, t, C)
                p = p_t + p_l
                logprobs.append(((t, C), p))
            for (tag, logprob) in logprobs:
                new_states.append((history + [tag],
                                   curr_sent_logprob + logprob))
        # keep only the best beam_size states
        new_states.sort(reverse=True, key=itemgetter(1))
        if len(new_states) > self._beam_size:
            new_states = new_states[:self._beam_size]
        # Cache store
        self.cache[sent_str] = new_states
        return self._tagword(sent, new_states, n)
n_word = get_positional_n(n, context) if n_word != '' and n > 0: condition = str(n) + "_ahead_" + re.sub(r'\_', '', n_word) cfd[condition][sense] += 1 elif n_word != '' and n < 0: condition = str(n) + "_behind_" + re.sub(r'\_', '', n_word) cfd[condition][sense] += 1 return cfd def get_positional_n(n, corpus): root_index = corpus.index(target) #position of line n_word_index = root_index + n #position of target word if len(corpus) > n_word_index and n_word_index >= 0: return corpus[n_word_index] else: return "" cfd = ConditionalFreqDist() Window = range(-5,5) for i in Window: if i != 0: cfd = add_condition(cfd, training_list, i) cpd = ConditionalProbDist(cfd,ELEProbDist,10) for cond in cfd.conditions():
class ngram(object):
    """Unigram to 4-gram word models over the Reuters corpus.

    The trained probability distributions are pickled into the current
    working directory and re-used on later runs.
    """

    # (attribute name, pickle path) of every persisted model, in save order
    _MODEL_FILES = (
        ('_unigram_pd', './unigram_pd.p'),
        ('_bigram_cpd', './bigram_cpd.p'),
        ('_trigram_cpd', './trigram_cpd.p'),
        ('_quadgram_cpd', './quadgram_cpd.p'),
    )

    def __init__(self, load_from_disk=True):
        """
        :param load_from_disk: when true, load the pickled models (training
            only if one of the files is missing); otherwise always retrain
        """
        self._corpus = reuters.words()
        self._unigram_fd = FreqDist()
        self._bigram_cfd = ConditionalFreqDist()
        self._trigram_cfd = ConditionalFreqDist()
        self._quadgram_cfd = ConditionalFreqDist()
        self._unigram_pd = None
        self._bigram_cpd = None
        self._trigram_cpd = None
        self._quadgram_cpd = None
        if load_from_disk:
            self._load_models()
        else:
            self._train()

    def _train(self):
        """Count n-grams over the alphabetic corpus tokens, build the
        Laplace-smoothed distributions and save them to disk."""
        print('Training models...')
        start_time = time.time()
        # the first contexts contain None until three words have been seen
        prev_word = None
        prev_2_word = None
        prev_3_word = None
        for word in self._corpus:
            if word.isalpha():
                self._unigram_fd[word] += 1
                self._bigram_cfd[prev_word][word] += 1
                self._trigram_cfd[tuple([prev_2_word, prev_word])][word] += 1
                self._quadgram_cfd[
                    tuple([prev_3_word, prev_2_word, prev_word])][word] += 1
                prev_3_word = prev_2_word
                prev_2_word = prev_word
                prev_word = word
        self._unigram_pd = LaplaceProbDist(
            self._unigram_fd, bins=self._unigram_fd.N())
        self._bigram_cpd = ConditionalProbDist(
            self._bigram_cfd, LaplaceProbDist,
            bins=len(self._bigram_cfd.conditions()))
        self._trigram_cpd = ConditionalProbDist(
            self._trigram_cfd, LaplaceProbDist,
            bins=len(self._trigram_cfd.conditions()))
        self._quadgram_cpd = ConditionalProbDist(
            self._quadgram_cfd, LaplaceProbDist,
            bins=len(self._quadgram_cfd.conditions()))
        print('Models trained, took %s seconds' % (time.time() - start_time))
        self._save_models()

    def _save_models(self):
        """Pickle the four probability distributions to ``_MODEL_FILES``."""
        print('Saving Models to disk...')
        start_time = time.time()
        for attr, path in self._MODEL_FILES:
            # Binary mode plus a context manager: pickle data is bytes, and
            # the handle must be closed so the data is flushed.
            with open(path, 'wb') as handle:
                pickle.dump(getattr(self, attr), handle)
        print('Models saved, took %s seconds' % (time.time() - start_time))

    def _load_models(self):
        """Load the pickled models, training first if any file is missing."""
        if not all(isfile(path) for _, path in self._MODEL_FILES):
            self._train()
            return
        print('Loading Models from disk...')
        start_time = time.time()
        for attr, path in self._MODEL_FILES:
            # SECURITY: unpickling executes code stored in the file; only
            # load pickle files written by this class.
            with open(path, 'rb') as handle:
                setattr(self, attr, pickle.load(handle))
        print('Models loaded, took %s seconds' % (time.time() - start_time))

    def next_word(self, context):
        """Return the highest-scoring word after the last three tokens of
        ``context`` according to the 4-gram model.

        :param context: raw text; it is tokenized with ``word_tokenize``
        """
        # NOTE(review): contexts with fewer than three tokens, or contexts
        # never seen in training, are passed to the distribution as-is;
        # confirm how ConditionalProbDist handles them.
        context = word_tokenize(context)
        word = self._quadgram_cpd[tuple(context[-3:])].max()
        return word
class NgramModel(ModelI):
    """
    A processing interface for assigning a probability to the next word.
    """

    # TODO: add cutoff
    def __init__(self, n, docs=(), pad_left=False, pad_right=False,
                 estimator=_estimator, cache_training=None,
                 **estimator_kwargs):
        """
        Create an ngram language model to capture patterns in n consecutive
        words of training text.  An estimator smooths the probabilities
        derived from the text and may allow generation of ngrams not seen
        during training. See model.doctest for more detailed testing

            >>> from nltk.corpus import brown
            >>> lm = NgramModel(3, brown.words(categories='news'))
            >>> lm
            <NgramModel with 91603 3-grams>
            >>> lm._backoff
            <NgramModel with 62888 2-grams>
            >>> lm.entropy(brown.words(categories='humor'))
            ... # doctest: +ELLIPSIS
            12.0399...

        NB: If a ``bins`` parameter is given in the ``estimator_kwargs`` it
        will be ignored. The number of bins to use is the number of outcomes
        (tokens) encountered at each level of the backoff recursion and as
        such, the number must change each time.

        :param n: the order of the language model (ngram size)
        :type n: int
        :param docs: the training text. This needs to be a list, tuple,
            generator, or an iterable that yields such.  An empty sequence
            or exhausted generator means training happens later.
        :type docs: list(str) or list(list(str))
        :param pad_left: whether to pad the left of each sentence with an
            (n-1)-gram of empty strings
        :type pad_left: bool
        :param pad_right: whether to pad the right of each sentence with an
            (n-1)-gram of empty strings
        :type pad_right: bool
        :param estimator: a function for generating a probability
            distribution
        :type estimator: a function that takes a ConditionalFreqDist and
            returns a ConditionalProbDist
        :param cache_training: accepted for interface compatibility; it is
            not used by this constructor
        :param estimator_kwargs: Extra keyword arguments for the estimator
        :type estimator_kwargs: (any)
        :raise TypeError: if ``docs`` is a plain string or has an
            unsupported shape
        """
        super(NgramModel, self).__init__()

        # protection from cryptic behavior for calling programs
        # that use the pre-2.0.2 interface
        assert(isinstance(pad_left, bool))
        assert(isinstance(pad_right, bool))

        # Check for bins argument
        if 'bins' in estimator_kwargs:
            warnings.warn('A value was provided for the `bins` parameter of '
                          '`estimator_kwargs`. This value will be overridden. '
                          'If you think you have a better idea, write your own '
                          'darn model.')
            # Clear out the bins so we don't throw recursive warnings
            estimator_kwargs.pop('bins', None)

        self._lpad = ('',) * (n - 1) if pad_left else ()
        self._rpad = ('',) * (n - 1) if pad_right else ()
        self._pad_left = pad_left
        self._pad_right = pad_right

        # make sure n is greater than zero, otherwise print it
        assert (n > 0), n
        self._unigram_model = (n == 1)
        self._n = n

        # Declare all other fields
        self._backoff = None
        if not self._unigram_model:
            # The backoff model is created empty; train() feeds it the same
            # sentences as this model.
            self._backoff = NgramModel(n - 1, [], pad_left, pad_right,
                                       estimator, **estimator_kwargs)
        self._backoff_alphas = None
        self._model = None

        # Process training
        self._ngrams = set()
        self.outcomes = set()
        self._cfd = ConditionalFreqDist()

        # The model must be able to train on a one-use generator so that it
        # can train on corpora that don't fit in RAM, hence the shape check.
        # TODO: test with CorpusView
        docs = self._normalize_docs(docs)

        # Train the model
        for sent in docs:
            self.train(sent)

        # Build model and set the backoff parameters
        if len(self.outcomes) > 0:
            self._build_model(estimator, estimator_kwargs)

    @staticmethod
    def _normalize_docs(docs):
        """Validate ``docs`` and return it as an iterable of sentences.

        A flat sequence/generator of tokens is wrapped into a one-sentence
        collection.  For generators the first item has to be consumed to
        inspect it; it is chained back in front so nothing is lost.
        """
        # NB: The Iterator type won't catch lists or strings, but it will
        # catch things returned by functions in itertools
        if isinstance(docs, (GeneratorType, Iterator)):
            try:
                first = next(docs)
            except StopIteration:
                # Nothing to train on: same as an empty list.
                return []
            if isinstance(first, (basestring, int)):
                # a generator of tokens: one single sentence
                return [itertools.chain([first], docs)]
            if isinstance(first, Sequence) and (
                    len(first) == 0
                    or isinstance(first[0], (basestring, int))):
                # a generator that yields sequences of tokens
                return itertools.chain([first], docs)
            # FIXME: generators that yield generators are not supported
            raise TypeError("Training documents given to NgramModel are a generator "
                            "that yields something other than a string or a list of "
                            "string. %s" % (docs,))
        if isinstance(docs, basestring):
            raise TypeError("Training documents given to NgramModel must be either a list "
                            "of string or a list of lists of string. Or a generator that "
                            "acts in the same way as one of the above. A string was found "
                            "instead: %s" % (docs,))
        if isinstance(docs, Sequence):
            # Some kind of iterable with a __getitem__, not a generator.
            if len(docs) == 0:
                # If it's empty, assume training will happen later
                return docs
            if isinstance(docs[0], (basestring, int)):
                # Make it into a list of lists
                return [docs]
            if (isinstance(docs[0], Sequence) and len(docs[0]) > 0
                    and not isinstance(docs[0][0], (basestring, int))):
                raise TypeError("Training documents given to NgramModel were neither a "
                                "list of string nor a list of list of string: %s" % (docs,))
            return docs
        raise TypeError("Unsupported type supplied to NgramModel for training documents: %s"
                        % (docs,))

    # ===================
    # TRAINING
    # ===================
    # At every stage, in the backoff/recursion the number of bins for
    # the estimator should be equal to the total number of outcomes
    # (tokens) encountered while training. This means that it needs
    # to be recalculated at each level of the recursion.
    # NB: For the unigram case, this would be the actual vocabulary size
    def train(self, sent):
        """
        Train this model and the backoff model on the given sentence.

        :param sent: A list of items to train on
        :type sent: list
        :return: None
        """
        # FIXME: This may use extra memory, but a generator can only be
        # consumed once, so it is duplicated for the backoff model.
        if isinstance(sent, GeneratorType) or isinstance(sent, Iterator):
            s1, s2 = itertools.tee(sent, 2)
            self._train_one(s1)
            if self._backoff is not None:
                self._backoff.train(s2)
        else:
            self._train_one(sent)
            if self._backoff is not None:
                self._backoff.train(sent)

    # FIXME: Discard cfd after training?
    # Should check if the probdist keeps a reference to it
    def _train_one(self, sent):
        """Add the ngram counts of one sequence to this model only."""
        # The pad *symbol* is the empty string: ngrams() repeats it n-1
        # times itself.  Passing the whole self._lpad tuple made every pad
        # item a tuple, so padded contexts never matched the ''-padded
        # contexts built in entropy() and _generate_one().
        for ngram in ngrams(sent, self._n, self._pad_left, self._pad_right,
                            left_pad_symbol='', right_pad_symbol=''):
            self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            self._cfd[context][token] += 1
            self.outcomes.add(token)

    # ===================
    # CREATE MODEL
    # ===================
    # NB: Even if the number of bins is explicitly passed, we should use the
    # number of word types encountered during training as the bins value.
    # If right padding is on, this includes the padding symbol.
    #
    # There is a good reason for this! If the number of bins isn't set the
    # ConditionalProbDist will choose from a different total number of
    # possible outcomes for each condition and the NgramModel won't give
    # probability estimates that sum to 1.
    def _build_model(self, estimator, estimator_kwargs):
        """
        Construct the ``ConditionalProbDist`` used to estimate probabilities.

        This should only be called after the model has been trained. If
        additional training is performed, this should be called again.

        :param estimator: A callable that returns something that extends
            ProbDistI. Used to map the frequency of a condition to its
            probability. The only estimators that currently work are
            ``LidstoneProbDist`` and ``LaplaceProbDist``.
        :param estimator_kwargs: Additional arguments to pass to the
            estimator. If ``bins`` is in here it will be overridden.  The
            dict itself is not modified.
        :return: None
        :raise RuntimeError: if no outcome has been seen in training
        """
        n_outcomes = len(self.outcomes)
        if n_outcomes <= 0:
            raise RuntimeError("NgramModel can't build a model without training input!")

        # Work on copies so the caller's dict is left untouched.
        shared_kwargs = dict(estimator_kwargs)
        shared_kwargs.pop('bins', None)
        model_kwargs = dict(shared_kwargs)
        model_kwargs['bins'] = n_outcomes

        # Create the probability model
        self._model = ConditionalProbDist(self._cfd, estimator, **model_kwargs)

        # Build backoff model and get backoff parameters
        if not self._unigram_model:
            self._backoff._build_model(estimator, shared_kwargs)
            self._set_backoff_params()

    # ===================
    # SET BACKOFF PARAMS
    # ===================
    def _set_backoff_params(self):
        """
        Sets the alphas for the backoff models used to calculate the
        probability for unseen ngrams.

        :return: None
        """
        if not self._unigram_model:
            self._backoff_alphas = dict()

            # For each condition (or context)
            for ctxt in self._cfd.conditions():
                prdist = self._model[ctxt]  # prob dist for this context
                backoff_ctxt = ctxt[1:]
                backoff_total_pr = 0.0
                total_observed_pr = 0.0

                for word in self._cfd[ctxt]:
                    # This is the subset of words that we OBSERVED
                    # following this context
                    total_observed_pr += prdist.prob(word)
                    # We normalize it by the total (n-1)-gram probability of
                    # words that were observed in this n-gram context
                    backoff_total_pr += self._backoff.prob(word, backoff_ctxt)

                assert (0 < total_observed_pr <= 1), total_observed_pr
                # beta is the remaining probability weight after we factor
                # out the probability of observed words
                beta = 1.0 - total_observed_pr

                # backoff total has to be less than one, otherwise we get
                # ZeroDivision error when we try subtracting it from 1 below
                assert (0 < backoff_total_pr < 1), backoff_total_pr
                alpha_ctxt = beta / (1.0 - backoff_total_pr)

                self._backoff_alphas[ctxt] = alpha_ctxt

    # ==================
    # API Methods
    # ==================
    # This is a new method (not in original nltk model)
    def prob_seq(self, seq):
        """
        Evaluate the probability of a sequence (list of tokens).

        The probability of a sequence is the product of the probabilities of
        all of the ngrams in that sequence.  To avoid underflow with long
        sequences the calculation is done in (negative) log-space, where the
        product becomes a sum.

        :param seq: A list of tokens representing a document/sentence/etc.
        :type seq: list(str)
        :return: The negative log probability of the given sequence
        :rtype: float
        """
        prob = 0.0
        # Same padding symbol as in _train_one().
        for ngram in ngrams(seq, self._n, self._pad_left, self._pad_right,
                            left_pad_symbol='', right_pad_symbol=''):
            context = tuple(ngram[:-1])
            token = ngram[-1]
            prob += self.logprob(token, context)
        return prob

    def prob(self, word, context=()):
        """
        Evaluate the probability of this word in this context using Katz
        Backoff.

        :param word: the word to get the probability of
        :type word: str
        :param context: the context the word is in
        :type context: list(str)
        """
        context = tuple(context)
        if (context + (word,) in self._ngrams) or self._unigram_model:
            return self[context].prob(word)
        else:
            return self._alpha(context) * self._backoff.prob(word, context[1:])

    # Updated _alpha function, discarded the _beta function
    def _alpha(self, context):
        """Get the backoff alpha value for the given context
        (1 for contexts that were never seen in training).
        """
        error_message = "Alphas and backoff are not defined for unigram models"
        assert not self._unigram_model, error_message

        if context in self._backoff_alphas:
            return self._backoff_alphas[context]
        else:
            return 1

    def logprob(self, word, context=()):
        """
        Evaluate the (negative) log probability of this word in this context.

        :param word: the word to get the probability of
        :type word: str
        :param context: the context the word is in
        :type context: list(str)
        """
        return -math.log(self.prob(word, context), 2)

    def choose_random_word(self, context):
        """
        Randomly select a word that is likely to appear in this context.

        :param context: the context the word is in
        :type context: list(str)|tuple(str)
        """
        return self.generate(1, context)[-1]

    # NB, this will always start with same word if the model
    # was trained on a single text
    def generate(self, num_words, context=()):
        """
        Generate random text based on the language model.

        :param num_words: number of words to generate
        :type num_words: int
        :param context: initial words in generated string
        :type context: list(str)|tuple(str)
        """
        text = list(context)
        for i in range(num_words):
            text.append(self._generate_one(text))
        return text

    def _generate_one(self, context):
        """Generate one word after ``context``, backing off to lower-order
        models for unseen contexts ('.' if even the unigram model fails)."""
        if self._n > 1:
            context = (self._lpad + tuple(context))[-(self._n - 1):]
        else:
            # A unigram model conditions on nothing.  The generic slice
            # [-n + 1:] is [0:] here and would keep the whole history.
            context = ()
        if context in self:
            return self[context].generate()
        elif self._n > 1:
            return self._backoff._generate_one(context[1:])
        else:
            return '.'

    def entropy(self, text):
        """
        Calculate the approximate cross-entropy of the n-gram model for a
        given evaluation text.
        This is the average log probability of each word in the text.

        :param text: words to use for evaluation
        :type text: list(str)
        """
        H = 0.0     # entropy is conventionally denoted by "H"
        text = list(self._lpad) + text + list(self._rpad)
        for i in xrange(self._n - 1, len(text)):
            context = tuple(text[(i - self._n + 1):i])
            token = text[i]
            H += self.logprob(token, context)
        return H / float(len(text) - (self._n - 1))

    def perplexity(self, text):
        """
        Calculates the perplexity of the given text.
        This is simply 2 ** cross-entropy for the text.

        :param text: words to calculate perplexity of
        :type text: list(str)
        """
        return math.pow(2.0, self.entropy(text))

    def __contains__(self, item):
        """Tell whether ``item`` (a context) is a condition of the model."""
        if not isinstance(item, tuple):
            item = (item,)
        return item in self._model

    def __getitem__(self, item):
        """Return the probability distribution for context ``item``."""
        if not isinstance(item, tuple):
            item = (item,)
        return self._model[item]

    def __repr__(self):
        return '<NgramModel with %d %d-grams>' % (len(self._ngrams), self._n)
class MyNgramModel(NgramModel):
    """
    N-gram model that recounts the training ngrams itself and precomputes
    one backoff weight (alpha) per observed context.
    """

    def __init__(self, n, train, pad_left=True, pad_right=False,
                 estimator=None, *estimator_args, **estimator_kwargs):
        """
        :param n: order of the model (ngram size)
        :param train: training text, a list of tokens or a list of such lists
        :param pad_left: pad each sentence on the left with n-1 empty strings
        :param pad_right: pad each sentence on the right with n-1 empty
            strings
        :param estimator: probability distribution factory; ``_estimator``
            is used when it is None
        :param estimator_args: extra positional arguments for the estimator
        :param estimator_kwargs: extra keyword arguments for the estimator
        """
        super(MyNgramModel, self).__init__(
            n, train, pad_left, pad_right, estimator,
            *estimator_args, **estimator_kwargs)

        assert(isinstance(pad_left, bool))
        assert(isinstance(pad_right, bool))

        self._n = n
        self._lpad = ('',) * (n - 1) if pad_left else ()
        self._rpad = ('',) * (n - 1) if pad_right else ()

        if estimator is None:
            estimator = _estimator

        self._cfd = ConditionalFreqDist()
        self._ngrams = set()

        # A flat list of tokens is treated as one single sentence.
        if (train is not None) and isinstance(train[0], basestring):
            train = [train]

        for sentence in train:
            padded = chain(self._lpad, sentence, self._rpad)
            for gram in ingrams(padded, n):
                self._ngrams.add(gram)
                history = tuple(gram[:-1])
                outcome = gram[-1]
                self._cfd[history].inc(outcome)

        if estimator_args or estimator_kwargs:
            self._model = ConditionalProbDist(
                self._cfd, estimator, *estimator_args, **estimator_kwargs)
        else:
            # no estimator arguments given: pass the number of contexts
            self._model = ConditionalProbDist(
                self._cfd, estimator, len(self._cfd))

        # recursively construct the lower-order models
        if n > 1:
            self._backoff = MyNgramModel(
                n - 1, train, pad_left, pad_right, estimator,
                *estimator_args, **estimator_kwargs)
            self._backoff_alphas = dict()
            # one alpha per observed context
            for history in self._cfd.conditions():
                dist = self._model[history]
                shorter = history[1:]
                backoff_mass = 0
                observed_mass = 0
                # only the words OBSERVED after this context
                for outcome in self._cfd[history].keys():
                    backoff_mass += self._backoff.prob(outcome, shorter)
                    observed_mass += dist.prob(outcome)
                assert 0 < observed_mass <= 1
                # NOTE(review): backoff_mass == 1 passes this assertion but
                # makes the division below fail with ZeroDivisionError.
                assert 0 < backoff_mass <= 1
                self._backoff_alphas[history] = (
                    (1.0 - observed_mass) / (1.0 - backoff_mass))
        else:
            self._backoff = None

    # Updated _alpha function, discarded the _beta function
    def _alpha(self, tokens):
        """Return the stored backoff weight for ``tokens``, or 1 if none."""
        return (self._backoff_alphas[tokens]
                if tokens in self._backoff_alphas else 1)
### get the (token,tag) pair for each tagged sentence i = 1 for sentence in brown.tagged_sents(): for (token, tag) in sentence: if i < 6: print(token, tag) fd.inc(tag) cfd[token].inc(tag) i += 1 ### the most frequent tag: print fd.max() wordbins = [] for token in cfd.conditions(): wordbins.append((cfd[token].B(), token)) ### sort tuples by number of unique tags wordbins.sort(reverse=True) print wordbins[0:3] ### masculine pronouns male = ['he', 'his', 'him', 'himself'] female = ['she', 'hers', 'her', 'herself'] n_male, n_female = 0, 0 for m in male: n_male += cfd[m].N()
def __init__(self, n, train, pad_left=True, pad_right=False,
             estimator=None, **estimator_kwargs):
    """
    Build an n-gram language model, and recursively its lower-order backoff
    models, from training text.  The estimator smooths the observed counts
    and may allow generation of ngrams not seen during training.
    See model.doctest for more detailed testing

        >>> from nltk.corpus import brown
        >>> lm = NgramModel(3, brown.words(categories='news'))
        >>> lm
        <NgramModel with 91603 3-grams>
        >>> lm._backoff
        <NgramModel with 62888 2-grams>
        >>> lm.entropy(brown.words(categories='humor'))
        ... # doctest: +ELLIPSIS
        12.0399...

    :param n: the order of the language model (ngram size)
    :type n: int
    :param train: the training text
    :type train: list(str) or list(list(str))
    :param pad_left: whether to pad the left of each sentence with an
        (n-1)-gram of empty strings
    :type pad_left: bool
    :param pad_right: whether to pad the right of each sentence with an
        (n-1)-gram of empty strings
    :type pad_right: bool
    :param estimator: a function for generating a probability distribution;
        ``_estimator`` is used when it is None
    :type estimator: a function that takes a ConditionalFreqDist and
        returns a ConditionalProbDist
    :param estimator_kwargs: Extra keyword arguments for the estimator;
        ``bins`` defaults to the number of token types seen in training
    :type estimator_kwargs: (any)
    """
    # Callers of the pre-2.0.2 interface passed other things in these slots.
    assert (isinstance(pad_left, bool))
    assert (isinstance(pad_right, bool))

    # left and right padding, n-1 empty strings each when enabled
    self._lpad = ('', ) * (n - 1) if pad_left else ()
    self._rpad = ('', ) * (n - 1) if pad_right else ()

    # the order must be positive; the offending value is the message
    assert (n > 0), n
    self.is_unigram_model = (n == 1)
    self._n = n

    if estimator is None:
        estimator = _estimator

    # A flat list of strings is treated as one single sentence.
    if (train is not None) and isinstance(train[0], compat.string_types):
        train = [train]

    counts = ConditionalFreqDist()
    self._ngrams = set()
    seen_tokens = set()  # word types encountered as ngram outcomes
    for sentence in train:
        for gram in ngrams(sentence, n, pad_left, pad_right, pad_symbol=''):
            self._ngrams.add(gram)
            history = tuple(gram[:-1])
            outcome = gram[-1]
            counts[history][outcome] += 1
            seen_tokens.add(outcome)

    # Unless the caller chose a number of bins, use the number of word
    # types seen in training (this includes the right padding symbol).
    estimator_kwargs.setdefault('bins', len(seen_tokens))

    self._model = ConditionalProbDist(counts, estimator, **estimator_kwargs)

    if self.is_unigram_model:
        return

    # recursively construct the lower-order models
    self._backoff = NgramModel(n - 1, train, pad_left, pad_right,
                               estimator, **estimator_kwargs)

    self._backoff_alphas = dict()
    # one backoff weight per observed context
    for history in counts.conditions():
        shorter = history[1:]
        backoff_mass = 0.0
        observed_mass = 0.0

        # _words_following yields lists of the words that were OBSERVED
        # after this context, i.e. Count(word | context) > 0
        for followers in self._words_following(history, counts):
            for outcome in followers:
                observed_mass += self.prob(outcome, history)
                # total (n-1)-gram probability of the same words
                backoff_mass += self._backoff.prob(outcome, shorter)

        assert (0 <= observed_mass <= 1), observed_mass
        # probability weight left over for the unobserved words
        leftover = 1.0 - observed_mass
        # must stay below one, it is subtracted from 1 in the denominator
        assert (0 <= backoff_mass < 1), backoff_mass
        self._backoff_alphas[history] = leftover / (1.0 - backoff_mass)
def __init__(self, n, train, k=5, v=None, liveDangerously=False,
             quiet=False):
    """
    Creates a Katz-thresholded Ngram language model to capture patterns in
    n consecutive words of training text.  Uses the KGoodTuringProbDist to
    estimate the conditional and unigram probabilities, to provide coverage
    of Ngrams not seen during training.

    @param n: the order of the language model (ngram size)
    @type n: C{int}
    @param train: the training text
    @type train: C{list} of C{string}
    @param k: The threshold above which counts are assumed to be reliable.
        Defaults to 5.
    @type k: C{Int}
    @param v: The number of unseens of degree 1.  Defaults to the number of
        types in the training set
    @type v: C{Int}
    @param liveDangerously: If False, for each model check that the total
        probability mass after all adjustments is close to 1.  Defaults to
        False.
    @type liveDangerously: C{Boolean}
    @param quiet: Various information will be printed during model
        construction unless this is True.  This also applies to the
        lower-order backoff models.  Defaults to False.
    @type quiet: C{Boolean}
    """
    self._n = n
    # number of n-grams in a text of len(train) tokens
    self._N = 1 + len(train) - n

    fd = FreqDist(train)
    if v is None:
        v = fd.B()
    if not quiet:
        # diagnostic output, silenced like every other message
        print(('v', v))

    if n == 1:
        # Treat this case specially: a single unconditional distribution
        self._model = KGoodTuringProbDist(fd, k, v, liveDangerously, ())
        if not quiet:
            print("%s entries for %s tokens at degree 1, %s"
                  % (len(fd), fd.N(), self._model.status))
    else:
        def estimator(fdist, ctxt):
            return KGoodTuringProbDist(fdist, k, v, liveDangerously, ctxt)

        cfd = ConditionalFreqDist()
        for ngram in ingrams(train, n):
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cfd[context].inc(token)

        # NOTE(review): the third argument is presumably the
        # supply_condition flag, which makes ConditionalProbDist pass the
        # context to estimator(); confirm against the NLTK version in use.
        self._model = ConditionalProbDist(cfd, estimator, True)
        if not quiet:
            # NOTE(review): LowHacked is a bare name, presumably a
            # module-level constant; confirm it is defined.
            statuses = {'normal': 0, 'bigSkewed': 0, 'weak': 0, LowHacked: 0}
            for ctx in cfd.conditions():
                statuses[self[ctx].status] += 1
            print("%s conditions at degree %s" % (len(cfd.conditions()), n))
            for s in statuses:
                print(" %s %6d" % (s, statuses[s]))

        # recursively construct the lower-order models, with the same
        # verbosity as this one
        self._backoff = KBNgramModel(n - 1, train, k, v, liveDangerously,
                                     quiet)
def __init__(self, n, train, pad_left=True, pad_right=False, estimator=None, **estimator_kwargs):
    """
    Build an n-gram language model from training text.

    Counts every n-gram of the (optionally padded) training sentences,
    turns the counts into a ConditionalProbDist through ``estimator``,
    and, for n > 1, recursively builds the (n-1)-gram backoff model plus
    the per-context backoff weights (``self._backoff_alphas``).

    See model.doctest for more detailed testing

        >>> from nltk.corpus import brown
        >>> lm = NgramModel(3, brown.words(categories='news'))
        >>> lm
        <NgramModel with 91603 3-grams>
        >>> lm._backoff
        <NgramModel with 62888 2-grams>
        >>> lm.entropy(brown.words(categories='humor'))
        ... # doctest: +ELLIPSIS
        12.0399...

    :param n: the order of the language model (ngram size)
    :type n: int
    :param train: the training text; a flat list of strings is treated
        as a single sentence
    :type train: list(str) or list(list(str))
    :param pad_left: whether to pad the left of each sentence with an
        (n-1)-gram of empty strings
    :type pad_left: bool
    :param pad_right: whether to pad the right of each sentence with an
        (n-1)-gram of empty strings
    :type pad_right: bool
    :param estimator: a function for generating a probability
        distribution; defaults to the module-level ``_estimator``
    :type estimator: a function that takes a ConditionalFreqDist and
        returns a ConditionalProbDist
    :param estimator_kwargs: Extra keyword arguments for the estimator;
        ``bins`` defaults to the number of word types seen in training
    :type estimator_kwargs: (any)
    """
    # Callers using the pre-2.0.2 interface passed something else in
    # these positions; fail loudly instead of behaving cryptically.
    assert(isinstance(pad_left, bool))
    assert(isinstance(pad_right, bool))

    self._lpad = ('',) * (n - 1) if pad_left else ()
    self._rpad = ('',) * (n - 1) if pad_right else ()

    # n must be positive; the offending value is the assertion message.
    assert (n > 0), n

    self._unigram_model = (n == 1)
    self._n = n

    estimator = _estimator if estimator is None else estimator

    cfd = ConditionalFreqDist()
    self._ngrams = set()

    # A flat list of strings is wrapped so it is handled as one sentence.
    if (train is not None) and isinstance(train[0], compat.string_types):
        train = [train]

    # Word types seen as the final token of some n-gram.
    vocabulary = set()
    for sentence in train:
        for gram in ngrams(sentence, n, pad_left, pad_right, pad_symbol=''):
            self._ngrams.add(gram)
            history, word = tuple(gram[:-1]), gram[-1]
            cfd[history][word] += 1
            vocabulary.add(word)

    # Unless bins is passed explicitly, use the number of word types
    # encountered in training (this includes the padding symbol when
    # right padding is on).
    estimator_kwargs.setdefault('bins', len(vocabulary))

    self._model = ConditionalProbDist(cfd, estimator, **estimator_kwargs)

    # A unigram model has nothing to back off to.
    if self._unigram_model:
        return

    # Recursively construct the lower-order model.
    self._backoff = NgramModel(n-1, train, pad_left, pad_right,
                               estimator, **estimator_kwargs)

    self._backoff_alphas = {}
    for ctxt in cfd.conditions():
        dist = self._model[ctxt]    # prob dist for this context
        shorter_ctxt = ctxt[1:]
        seen_mass = 0.0             # P(word | ctxt) over observed words
        lower_order_mass = 0.0      # backoff probability of those words
        for word in cfd[ctxt]:
            seen_mass += dist.prob(word)
            lower_order_mass += self._backoff.prob(word, shorter_ctxt)

        assert (0.0 < seen_mass < 1.1), seen_mass

        # The denominator below is 1 - lower_order_mass, so that mass is
        # clamped just under one to avoid dividing by zero.
        if lower_order_mass >= 1.0:
            lower_order_mass = 0.99
        assert (0 < lower_order_mass < 1.0), lower_order_mass

        # alpha = leftover mass after the observed words, normalised by
        # the lower-order mass of everything not observed here.
        self._backoff_alphas[ctxt] = (1.0 - seen_mass) / (1.0 - lower_order_mass)
class TnT(TaggerI):
    '''
    TnT - Statistical POS tagger

    IMPORTANT NOTES:

    * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS

      - It is possible to provide an untrained POS tagger to
        create tags for unknown words, see __init__ function

    * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT

      - Due to the nature of this tagger, it works best when
        trained over sentence delimited input.
      - However it still produces good results if the training
        data and testing data are separated on all punctuation eg: [,.?!]
      - Input for training is expected to be a list of sentences
        where each sentence is a list of (word, tag) tuples
      - Input for tag function is a single sentence
        Input for tagdata function is a list of sentences
        Output is of a similar form

    * Function provided to process text that is unsegmented

      - Please see basic_sent_chop()

    TnT uses a second order Markov model to produce tags for
    a sequence of input, specifically:

      argmax [Proj(P(t_i|t_i-1,t_i-2)P(w_i|t_i))] P(t_T+1 | t_T)

    IE: the maximum projection of a set of probabilities

    The set of possible tags for a given word is derived
    from the training data. It is the set of all tags
    that exact word has been assigned.

    To speed up and get more precision, log addition is used
    instead of multiplication, specifically:

      argmax [Sigma(log(P(t_i|t_i-1,t_i-2))+log(P(w_i|t_i)))] +
             log(P(t_T+1|t_T))

    The probability of a tag for a given word is the linear
    interpolation of 3 markov models; a zero-order, first-order,
    and a second order model.

      P(t_i| t_i-1, t_i-2) = l1*P(t_i) + l2*P(t_i| t_i-1) +
                             l3*P(t_i| t_i-1, t_i-2)

    A beam search is used to limit the memory usage of the algorithm.
    The degree of the beam can be changed using N in the initialization.
    N represents the maximum number of possible solutions to maintain
    while tagging.

    It is possible to differentiate the tags which are assigned to
    capitalized words. However this does not result in a significant
    gain in the accuracy of the results.
    '''

    def __init__(self, unk=None, Trained=False, N=1000, C=False):
        '''
        Construct a TnT statistical tagger. Tagger must be trained
        before being used to tag input.

        :param unk: instance of a POS tagger, conforms to TaggerI
        :type unk: TaggerI
        :param Trained: Indication that the POS tagger is trained or not
        :type Trained: boolean
        :param N: Beam search degree (see above)
        :type N: int
        :param C: Capitalization flag
        :type C: boolean

        Initializer, creates frequency distributions to be used
        for tagging

        _lx values represent the portion of the tri/bi/uni taggers
        to be used to calculate the probability

        N value is the number of possible solutions to maintain
        while tagging. A good value for this is 1000

        C is a boolean value which specifies to use or
        not use the Capitalization of the word as additional
        information for tagging.
        NOTE: using capitalization may not increase the accuracy
        of the tagger
        '''
        # Every "tag" stored below is really a (tag, C) pair.
        self._uni = FreqDist()              # (tag, C) counts
        self._bi = ConditionalFreqDist()    # previous tag -> (tag, C)
        self._tri = ConditionalFreqDist()   # (t1, t2) -> (tag, C)
        self._wd = ConditionalFreqDist()    # word -> plain tag
        self._eos = ConditionalFreqDist()   # last tag of a sentence -> 'EOS'
        self._l1 = 0.0
        self._l2 = 0.0
        self._l3 = 0.0
        self._N = N
        self._C = C
        self._T = Trained

        self._unk = unk

        # statistical tools (ignore or delete me)
        self.unknown = 0
        self.known = 0

    def train(self, data):
        '''
        Uses a set of tagged data to train the tagger.
        If an unknown word tagger is specified and was not declared
        as already trained, it is trained on the same data.

        :param data: List of lists of (word, tag) tuples
        :type data: list(list(tuple(str, str)))
        '''
        if self._unk is not None and self._T == False:
            self._unk.train(data)

        for sent in data:
            history = [('BOS', False), ('BOS', False)]
            saw_word = False
            for w, t in sent:
                # if capitalization is requested and the word begins
                # with a capital, C is True.  Slicing (rather than
                # indexing) keeps an empty-string token from raising.
                C = bool(self._C and w[:1].isupper())
                tC = (t, C)

                # Counter-style updates work on NLTK 2 and NLTK 3;
                # FreqDist.inc() no longer exists in NLTK 3.
                self._wd[w][t] += 1
                self._uni[tC] += 1
                self._bi[history[1]][tC] += 1
                self._tri[tuple(history)][tC] += 1

                history.append(tC)
                history.pop(0)
                saw_word = True

            # Only a sentence with at least one word has a final tag;
            # for an empty sentence `t` would be undefined or left over
            # from the previous sentence.
            if saw_word:
                self._eos[t]['EOS'] += 1

        # compute lambda values from the trained frequency distributions
        self._compute_lambda()

    def _compute_lambda(self):
        '''
        creates lambda values based upon training data

        NOTE: no need to explicitly reference C,
        it is contained within the tag variable :: tag == (tag,C)

        for each tag trigram (t1, t2, t3)
        depending on the maximum value of
        - f(t1,t2,t3)-1 / f(t1,t2)-1
        - f(t2,t3)-1 / f(t2)-1
        - f(t3)-1 / N-1

        increment l3,l2, or l1 by f(t1,t2,t3)

        ISSUES -- Resolutions:
        if 2 values are equal, increment both lambda values
        by (f(t1,t2,t3) / 2)

        If no trigram contributes any weight (for example when the
        training data is empty), the three lambdas fall back to 1/3
        each instead of dividing by zero.
        '''
        # temporary lambda variables
        tl1 = 0.0
        tl2 = 0.0
        tl3 = 0.0

        # for each t1,t2 in system
        for history in self._tri.conditions():
            (_h1, h2) = history

            # for each t3 given t1,t2 in system
            # (NOTE: tag actually represents (tag,C))
            # However no effect within this function
            for tag in self._tri[history].keys():

                # if there has only been 1 occurrence of this tag in the
                # data then ignore this trigram.
                if self._uni[tag] == 1:
                    continue

                f_tri = self._tri[history][tag]

                # safe_div provides a safe floating point division
                # it returns -1 if the denominator is 0
                c3 = self._safe_div((f_tri - 1),
                                    (self._tri[history].N() - 1))
                c2 = self._safe_div((self._bi[h2][tag] - 1),
                                    (self._bi[h2].N() - 1))
                c1 = self._safe_div((self._uni[tag] - 1),
                                    (self._uni.N() - 1))

                # if c1 is the maximum value:
                if (c1 > c3) and (c1 > c2):
                    tl1 += f_tri

                # if c2 is the maximum value
                elif (c2 > c3) and (c2 > c1):
                    tl2 += f_tri

                # if c3 is the maximum value
                elif (c3 > c2) and (c3 > c1):
                    tl3 += f_tri

                # if c3, and c2 are equal and larger than c1
                elif (c3 == c2) and (c3 > c1):
                    tl2 += float(f_tri) / 2.0
                    tl3 += float(f_tri) / 2.0

                # if c1, and c2 are equal and larger than c3
                # this might be a dumb thing to do....(not sure yet)
                elif (c2 == c1) and (c1 > c3):
                    tl1 += float(f_tri) / 2.0
                    tl2 += float(f_tri) / 2.0

                # otherwise there might be a problem
                # eg: all values = 0
                else:
                    pass

        # Lambda normalisation:
        # ensures that l1+l2+l3 = 1
        total = tl1 + tl2 + tl3
        if total == 0:
            # No evidence at all: use equal weights rather than 0/0.
            self._l1 = self._l2 = self._l3 = 1.0 / 3.0
        else:
            self._l1 = tl1 / total
            self._l2 = tl2 / total
            self._l3 = tl3 / total

    def _safe_div(self, v1, v2):
        '''
        Safe floating point division function, does not allow division by 0
        returns -1 if the denominator is 0
        '''
        if v2 == 0:
            return -1
        else:
            return float(v1) / float(v2)

    def tagdata(self, data):
        '''
        Tags each sentence in a list of sentences

        :param data: list of list of words
        :type data: [[string,],]
        :return: list of list of (word, tag) tuples

        Invokes tag(sent) function for each sentence
        compiles the results into a list of tagged sentences
        each tagged sentence is a list of (word, tag) tuples
        '''
        return [self.tag(sent) for sent in data]

    def tag(self, data):
        '''
        Tags a single sentence

        :param data: list of words
        :type data: [string,]

        :return: [(word, tag),]

        Calls '_tagword' to produce a list of tags, then associates
        the sequence of returned tags with the correct words in the
        input sequence

        returns a list of (word, tag) tuples
        '''
        # The start-of-sentence history must use the same
        # ('BOS', False) keys that train() stores; bare 'BOS' strings
        # would never match the trained bigram/trigram contexts.
        current_state = [([('BOS', False), ('BOS', False)], 0.0)]

        sent = list(data)

        tags = self._tagword(sent, current_state)

        # skip the two BOS entries and discard the C flags
        return [(word, t) for word, (t, _C) in zip(sent, tags[2:])]

    def _tagword(self, sent, current_states):
        '''
        :param sent : List of words remaining in the sentence
        :type sent  : [word,]
        :param current_states : List of possible tag combinations for
                                the sentence so far, and the log probability
                                associated with each tag combination
        :type current_states  : [([tag, ], logprob), ]
        :return: the tag history of the most probable state

        Tags the words of the sentence one after another, keeping at
        most N candidate histories after each word (beam search).
        Implemented as a loop so that long sentences cannot exhaust
        the interpreter's recursion limit.

        Uses formula specified above to calculate the probability
        of a particular tag
        '''
        for word in sent:
            # if the Capitalisation is requested,
            # initialise the flag for this word
            C = bool(self._C and word[:1].isupper())

            # if word is known
            # compute the set of possible tags
            # and their associated log probabilities
            if word in self._wd:
                self.known += 1
                new_states = []

                for (history, curr_sent_logprob) in current_states:
                    for t in self._wd[word].keys():
                        tC = (t, C)
                        p_uni = self._uni.freq(tC)
                        p_bi = self._bi[history[-1]].freq(tC)
                        p_tri = self._tri[tuple(history[-2:])].freq(tC)
                        # NOTE(review): with C enabled, a tag never seen
                        # with this capitalisation has a zero unigram
                        # count here and the division fails -- confirm
                        # the intended fallback.
                        p_wd = float(self._wd[word][t]) / float(self._uni[tC])
                        p = self._l1 * p_uni + self._l2 * p_bi + self._l3 * p_tri
                        p2 = log(p, 2) + log(p_wd, 2)

                        # extend this history with the candidate tag
                        new_states.append((history + [tC],
                                           curr_sent_logprob + p2))

            # otherwise a new word, set of possible tags is unknown
            else:
                self.unknown += 1

                # since a set of possible tags, and the probability of
                # each specific tag, can not be returned from most
                # classifiers: unknown words are tagged with certainty,
                # i.e. the log probabilities are left unchanged.

                # if no unknown word tagger has been specified
                # then use the tag 'Unk'
                if self._unk is None:
                    tag = ('Unk', C)

                # otherwise apply the unknown word tagger
                else:
                    [(_w, t)] = list(self._unk.tag([word]))
                    tag = (t, C)

                for (history, _logprob) in current_states:
                    history.append(tag)

                new_states = current_states

            # sort states by log prob
            # set is now ordered greatest to least log probability
            new_states.sort(reverse=True, key=itemgetter(1))

            # del everything after N (threshold)
            # this is the beam search cut
            if len(new_states) > self._N:
                new_states = new_states[:self._N]

            current_states = new_states

        # all words tagged: return the most probable history
        (best_history, _logp) = current_states[0]
        return best_history
def __init__(self, n, train, pad_left=True, pad_right=False, estimator=None, **estimator_kwargs):
    """
    Create an ngram language model to capture patterns in n consecutive
    words of review text. An estimator smooths the probabilities derived
    from the text and may allow generation of ngrams not seen during
    training.

    Each review's text is tokenized with word_tokenize and lowercased
    before its n-grams are counted.  For n > 1 the (n-1)-gram backoff
    model is built recursively from the same ``train`` argument.

    :param n: the order of the language model (ngram size)
    :type n: int
    :param train: the review source; it is handed unchanged to
        ``read_reviews``, which must yield mappings with a ``'text'``
        entry (NOTE(review): presumably a path or file handle -- confirm
        against ``read_reviews``)
    :param pad_left: whether to pad the left of each review with an
        (n-1)-gram of empty strings
    :type pad_left: bool
    :param pad_right: whether to pad the right of each review with an
        (n-1)-gram of ``'...EOR...'`` symbols
    :type pad_right: bool
    :param estimator: a function for generating a probability
        distribution; defaults to the module-level ``_estimator``
    :type estimator: a function that takes a ConditionalFreqDist and
        returns a ConditionalProbDist
    :param estimator_kwargs: Extra keyword arguments for the estimator;
        ``bins`` defaults to twice the number of word types seen
    :type estimator_kwargs: (any)
    """
    # Pad symbols used while counting n-grams; _lpad/_rpad below must
    # use the same symbols so padded text matches the training counts.
    left_pad_symbol = ''
    right_pad_symbol = '...EOR...'

    # protection from cryptic behavior for calling programs
    # that use the pre-2.0.2 interface
    assert(isinstance(pad_left, bool))
    assert(isinstance(pad_right, bool))

    # make sure n is greater than zero, otherwise print it
    assert (n > 0), n

    # For explicitness save the check whether this is a unigram model
    self.is_unigram_model = (n == 1)
    # save the ngram order number
    self._n = n
    # save left and right padding
    self._lpad = (left_pad_symbol,) * (n - 1) if pad_left else ()
    self._rpad = (right_pad_symbol,) * (n - 1) if pad_right else ()

    if estimator is None:
        estimator = _estimator

    cfd = ConditionalFreqDist()

    # set of all n-grams seen in training
    self._ngrams = set()

    # we need to keep track of the number of word types we encounter
    vocabulary = set()

    for count, review in enumerate(read_reviews(train), 1):
        if count % 10000 == 0:
            # Single-argument call form: valid on Python 2 and Python 3.
            print(str(count) + ' reviews processed')

        # each review is a mapping; only its text is used
        tokens = [w.lower() for w in word_tokenize(review['text'])]

        raw_ngrams = ngrams(tokens, n, pad_left, pad_right,
                            left_pad_symbol=left_pad_symbol,
                            right_pad_symbol=right_pad_symbol)
        for ngram in raw_ngrams:
            self._ngrams.add(ngram)
            context = tuple(ngram[:-1])
            token = ngram[-1]
            cfd[context][token] += 1
            vocabulary.add(token)

    # Unless number of bins is explicitly passed, use twice the number
    # of word types encountered during training as the bins value.
    # If right padding is on, this includes the padding symbol.
    if 'bins' not in estimator_kwargs:
        estimator_kwargs['bins'] = len(vocabulary) * 2

    self._model = ConditionalProbDist(cfd, estimator, **estimator_kwargs)
    self._probdist = self._model

    # recursively construct the lower-order models
    if not self.is_unigram_model:
        self._backoff = NgramModel(n-1, train,
                                   pad_left, pad_right,
                                   estimator,
                                   **estimator_kwargs)

        self._backoff_alphas = dict()
        # For each condition (or context)
        for ctxt in cfd.conditions():
            backoff_ctxt = ctxt[1:]
            backoff_total_pr = 0.0
            total_observed_pr = 0.0

            # this is the subset of words that we OBSERVED following
            # this context.
            # i.e. Count(word | context) > 0
            for word in self._words_following(ctxt, cfd):
                total_observed_pr += self.prob(word, ctxt)
                # we also need the total (n-1)-gram probability of
                # words observed in this n-gram context
                backoff_total_pr += self._backoff.prob(word, backoff_ctxt)

            assert (0 <= total_observed_pr <= 1), total_observed_pr
            # beta is the remaining probability weight after we factor out
            # the probability of observed words.
            beta = 1.0 - total_observed_pr

            # backoff total has to be less than one, otherwise we get
            # an error when we try subtracting it from 1 in the denominator
            assert (0 <= backoff_total_pr < 1), backoff_total_pr
            alpha_ctxt = beta / (1.0 - backoff_total_pr)

            self._backoff_alphas[ctxt] = alpha_ctxt
from nltk.probability import FreqDist
from nltk.probability import ConditionalFreqDist

# Overall word counts, and word counts conditioned on the review label.
word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

# Each word list is read from the category it is counted under below,
# so 'neg' counts come from negative reviews and 'pos' from positive.
testNegWords = movie_reviews.words(categories=['neg'])
testPosWords = movie_reviews.words(categories=['pos'])

for word in testNegWords:
    token = word.lower()
    word_fd[token] += 1
    label_word_fd['neg'][token] += 1

for word in testPosWords:
    token = word.lower()
    word_fd[token] += 1
    label_word_fd['pos'][token] += 1

# Sanity output: corpus totals, distinct word types and the top words,
# followed by the per-label totals.
print(word_fd.N(), word_fd.B(), word_fd.most_common(20))
print(label_word_fd.N(), label_word_fd.conditions(), label_word_fd.items())
print(label_word_fd['pos'].N(), label_word_fd['neg'].N())


# In[ ]:

# Contingency-table quantities used when scoring a word against a label:
#
# n_ii = label_word_fd[label][word]
# n_ix = word_fd[word]
# n_xi = label_word_fd[label].N()
# n_xx = label_word_fd.N()
#
#         w1    ~w1
#      ------ ------
#  w2 | n_ii | n_oi | = n_xi
#      ------ ------
# ~w2 | n_io | n_oo |
#      ------ ------
class TnT(TaggerI):
    """
    TnT - Statistical POS tagger

    IMPORTANT NOTES:

    * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS

      - It is possible to provide an untrained POS tagger to
        create tags for unknown words, see __init__ function

    * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT

      - Due to the nature of this tagger, it works best when
        trained over sentence delimited input.
      - However it still produces good results if the training
        data and testing data are separated on all punctuation eg: [,.?!]
      - Input for training is expected to be a list of sentences
        where each sentence is a list of (word, tag) tuples
      - Input for tag function is a single sentence
        Input for tagdata function is a list of sentences
        Output is of a similar form

    * Function provided to process text that is unsegmented

      - Please see basic_sent_chop()

    TnT uses a second order Markov model to produce tags for
    a sequence of input, specifically:

      argmax [Proj(P(t_i|t_i-1,t_i-2)P(w_i|t_i))] P(t_T+1 | t_T)

    IE: the maximum projection of a set of probabilities

    The set of possible tags for a given word is derived
    from the training data. It is the set of all tags
    that exact word has been assigned.

    To speed up and get more precision, log addition is used
    instead of multiplication, specifically:

      argmax [Sigma(log(P(t_i|t_i-1,t_i-2))+log(P(w_i|t_i)))] +
             log(P(t_T+1|t_T))

    The probability of a tag for a given word is the linear
    interpolation of 3 markov models; a zero-order, first-order,
    and a second order model.

      P(t_i| t_i-1, t_i-2) = l1*P(t_i) + l2*P(t_i| t_i-1) +
                             l3*P(t_i| t_i-1, t_i-2)

    A beam search is used to limit the memory usage of the algorithm.
    The degree of the beam can be changed using N in the initialization.
    N represents the maximum number of possible solutions to maintain
    while tagging.

    It is possible to differentiate the tags which are assigned to
    capitalized words. However this does not result in a significant
    gain in the accuracy of the results.
    """

    def __init__(self, unk=None, Trained=False, N=1000, C=False):
        """
        Construct a TnT statistical tagger. Tagger must be trained
        before being used to tag input.

        :param unk: instance of a POS tagger, conforms to TaggerI
        :type unk: TaggerI
        :param Trained: Indication that the POS tagger is trained or not
        :type Trained: bool
        :param N: Beam search degree (see above)
        :type N: int
        :param C: Capitalization flag
        :type C: bool

        Initializer, creates frequency distributions to be used
        for tagging

        _lx values represent the portion of the tri/bi/uni taggers
        to be used to calculate the probability

        N value is the number of possible solutions to maintain
        while tagging. A good value for this is 1000

        C is a boolean value which specifies to use or
        not use the Capitalization of the word as additional
        information for tagging.
        NOTE: using capitalization may not increase the accuracy
        of the tagger
        """
        # Every "tag" stored below is really a (tag, C) pair.
        self._uni = FreqDist()              # (tag, C) counts
        self._bi = ConditionalFreqDist()    # previous tag -> (tag, C)
        self._tri = ConditionalFreqDist()   # (t1, t2) -> (tag, C)
        self._wd = ConditionalFreqDist()    # word -> plain tag
        self._eos = ConditionalFreqDist()   # last tag of a sentence -> "EOS"
        self._l1 = 0.0
        self._l2 = 0.0
        self._l3 = 0.0
        self._N = N
        self._C = C
        self._T = Trained

        self._unk = unk

        # statistical tools (ignore or delete me)
        self.unknown = 0
        self.known = 0

    def train(self, data):
        """
        Uses a set of tagged data to train the tagger.
        If an unknown word tagger is specified and was not declared
        as already trained, it is trained on the same data.

        :param data: List of lists of (word, tag) tuples
        :type data: list(list(tuple(str, str)))
        """
        if self._unk is not None and self._T == False:
            self._unk.train(data)

        for sent in data:
            history = [("BOS", False), ("BOS", False)]
            saw_word = False
            for w, t in sent:
                # if capitalization is requested and the word begins
                # with a capital, C is True.  Slicing (rather than
                # indexing) keeps an empty-string token from raising.
                C = bool(self._C and w[:1].isupper())
                tC = (t, C)

                self._wd[w][t] += 1
                self._uni[tC] += 1
                self._bi[history[1]][tC] += 1
                self._tri[tuple(history)][tC] += 1

                history.append(tC)
                history.pop(0)
                saw_word = True

            # Only a sentence with at least one word has a final tag;
            # for an empty sentence `t` would be undefined or left over
            # from the previous sentence.
            if saw_word:
                self._eos[t]["EOS"] += 1

        # compute lambda values from the trained frequency distributions
        self._compute_lambda()

    def _compute_lambda(self):
        """
        creates lambda values based upon training data

        NOTE: no need to explicitly reference C,
        it is contained within the tag variable :: tag == (tag,C)

        for each tag trigram (t1, t2, t3)
        depending on the maximum value of
        - f(t1,t2,t3)-1 / f(t1,t2)-1
        - f(t2,t3)-1 / f(t2)-1
        - f(t3)-1 / N-1

        increment l3,l2, or l1 by f(t1,t2,t3)

        ISSUES -- Resolutions:
        if 2 values are equal, increment both lambda values
        by (f(t1,t2,t3) / 2)

        If no trigram contributes any weight (for example when the
        training data is empty), the three lambdas fall back to 1/3
        each instead of dividing by zero.
        """
        # temporary lambda variables
        tl1 = 0.0
        tl2 = 0.0
        tl3 = 0.0

        # for each t1,t2 in system
        for history in self._tri.conditions():
            (_h1, h2) = history

            # for each t3 given t1,t2 in system
            # (NOTE: tag actually represents (tag,C))
            # However no effect within this function
            for tag in self._tri[history].keys():

                # if there has only been 1 occurrence of this tag in the
                # data then ignore this trigram.
                if self._uni[tag] == 1:
                    continue

                f_tri = self._tri[history][tag]

                # safe_div provides a safe floating point division
                # it returns -1 if the denominator is 0
                c3 = self._safe_div((f_tri - 1), (self._tri[history].N() - 1))
                c2 = self._safe_div((self._bi[h2][tag] - 1), (self._bi[h2].N() - 1))
                c1 = self._safe_div((self._uni[tag] - 1), (self._uni.N() - 1))

                # if c1 is the maximum value:
                if (c1 > c3) and (c1 > c2):
                    tl1 += f_tri

                # if c2 is the maximum value
                elif (c2 > c3) and (c2 > c1):
                    tl2 += f_tri

                # if c3 is the maximum value
                elif (c3 > c2) and (c3 > c1):
                    tl3 += f_tri

                # if c3, and c2 are equal and larger than c1
                elif (c3 == c2) and (c3 > c1):
                    tl2 += f_tri / 2.0
                    tl3 += f_tri / 2.0

                # if c1, and c2 are equal and larger than c3
                # this might be a dumb thing to do....(not sure yet)
                elif (c2 == c1) and (c1 > c3):
                    tl1 += f_tri / 2.0
                    tl2 += f_tri / 2.0

                # otherwise there might be a problem
                # eg: all values = 0
                else:
                    pass

        # Lambda normalisation:
        # ensures that l1+l2+l3 = 1
        total = tl1 + tl2 + tl3
        if total == 0:
            # No evidence at all: use equal weights rather than 0/0.
            self._l1 = self._l2 = self._l3 = 1.0 / 3.0
        else:
            self._l1 = tl1 / total
            self._l2 = tl2 / total
            self._l3 = tl3 / total

    def _safe_div(self, v1, v2):
        """
        Safe floating point division function, does not allow division by 0
        returns -1 if the denominator is 0
        """
        if v2 == 0:
            return -1
        else:
            return v1 / v2

    def tagdata(self, data):
        """
        Tags each sentence in a list of sentences

        :param data: list of list of words
        :type data: [[string,],]
        :return: list of list of (word, tag) tuples

        Invokes tag(sent) function for each sentence
        compiles the results into a list of tagged sentences
        each tagged sentence is a list of (word, tag) tuples
        """
        return [self.tag(sent) for sent in data]

    def tag(self, data):
        """
        Tags a single sentence

        :param data: list of words
        :type data: [string,]

        :return: [(word, tag),]

        Calls '_tagword' to produce a list of tags, then associates
        the sequence of returned tags with the correct words in the
        input sequence

        returns a list of (word, tag) tuples
        """
        # The start-of-sentence history must use the same
        # ("BOS", False) keys that train() stores; bare "BOS" strings
        # would never match the trained bigram/trigram contexts.
        current_state = [([("BOS", False), ("BOS", False)], 0.0)]

        sent = list(data)

        tags = self._tagword(sent, current_state)

        # skip the two BOS entries and discard the C flags
        return [(word, t) for word, (t, _C) in zip(sent, tags[2:])]

    def _tagword(self, sent, current_states):
        """
        :param sent : List of words remaining in the sentence
        :type sent  : [word,]
        :param current_states : List of possible tag combinations for
                                the sentence so far, and the log probability
                                associated with each tag combination
        :type current_states  : [([tag, ], logprob), ]
        :return: the tag history of the most probable state

        Tags the words of the sentence one after another, keeping at
        most N candidate histories after each word (beam search).
        Implemented as a loop so that long sentences cannot exhaust
        the interpreter's recursion limit.

        Uses formula specified above to calculate the probability
        of a particular tag
        """
        for word in sent:
            # if the Capitalisation is requested,
            # initialise the flag for this word
            C = bool(self._C and word[:1].isupper())

            # if word is known
            # compute the set of possible tags
            # and their associated log probabilities
            if word in self._wd:
                self.known += 1
                new_states = []

                for (history, curr_sent_logprob) in current_states:
                    for t in self._wd[word].keys():
                        tC = (t, C)
                        p_uni = self._uni.freq(tC)
                        p_bi = self._bi[history[-1]].freq(tC)
                        p_tri = self._tri[tuple(history[-2:])].freq(tC)
                        # NOTE(review): with C enabled, a tag never seen
                        # with this capitalisation has a zero unigram
                        # count here and the division fails -- confirm
                        # the intended fallback.
                        p_wd = self._wd[word][t] / self._uni[tC]
                        p = self._l1 * p_uni + self._l2 * p_bi + self._l3 * p_tri
                        p2 = log(p, 2) + log(p_wd, 2)

                        # extend this history with the candidate tag
                        new_states.append((history + [tC], curr_sent_logprob + p2))

            # otherwise a new word, set of possible tags is unknown
            else:
                self.unknown += 1

                # since a set of possible tags, and the probability of
                # each specific tag, can not be returned from most
                # classifiers: unknown words are tagged with certainty,
                # i.e. the log probabilities are left unchanged.

                # if no unknown word tagger has been specified
                # then use the tag 'Unk'
                if self._unk is None:
                    tag = ("Unk", C)

                # otherwise apply the unknown word tagger
                else:
                    [(_w, t)] = list(self._unk.tag([word]))
                    tag = (t, C)

                for (history, _logprob) in current_states:
                    history.append(tag)

                new_states = current_states

            # sort states by log prob
            # set is now ordered greatest to least log probability
            new_states.sort(reverse=True, key=itemgetter(1))

            # del everything after N (threshold)
            # this is the beam search cut
            if len(new_states) > self._N:
                new_states = new_states[:self._N]

            current_states = new_states

        # all words tagged: return the most probable history
        (best_history, _logp) = current_states[0]
        return best_history
word_fd = FreqDist() label_word_fd = ConditionalFreqDist() # for (feats, label) in fvecs: #print label for key in feats: #print key if feats[key]: word_fd.inc(key) #print word_fd label_word_fd[label].inc(key) #print label_word_fd[label] # ##print word_fd['positive'] ##print label_word_fd print label_word_fd.conditions() cls_set=label_word_fd.conditions() # # pos_word_count = label_word_fd['positive'].N() print "positive word count: " + str(pos_word_count) neg_word_count = label_word_fd['negative'].N() print "negative word count: " + str(neg_word_count) total_word_count = pos_word_count + neg_word_count print "totl word count: " + str(total_word_count) # feature_scores = {} for feature, freq in word_fd.iteritems(): #print feature, freq pos_score = BigramAssocMeasures.chi_sq(label_word_fd['positive'][feature],
word = unicode(word) cfd[prev_word][word]+=1 global_fd[word] += 1 prev_word = word except: print "falhou um link..." print "terminou distrib probabilidades" print "vai construir as listas com o formato pro SQLite" global_frequencies = [] for word in sorted(global_fd.keys()): global_frequencies.append((word, global_fd[word])) conditional_frequencies = [] for condition in sorted(cfd.conditions()): for word in sorted(cfd[condition].keys()): if condition: conditional_frequencies.append((condition, word, cfd[condition][word])) print "vai comecar a parte do banco de dados" con = lite.connect("words-pt.db") with con: cur = con.cursor() cur.execute("DROP TABLE IF EXISTS _1_gram") cur.execute("CREATE TABLE _1_gram(word TEXT, count INT)") cur.executemany("INSERT INTO _1_gram VALUES(?, ?)", tuple(global_frequencies)) cur.execute("DROP TABLE IF EXISTS _2_gram")
def __init__(self, n, train, pad_left=False, pad_right=False,
             estimator=None, *estimator_args, **estimator_kwargs):
    """
    Creates an ngram language model to capture patterns in n consecutive
    words of training text.  An estimator smooths the probabilities derived
    from the text and may allow generation of ngrams not seen during
    training.

    For n > 1 a model of order n-1 is built recursively from the same
    training data and stored as the backoff model, together with one
    backoff weight (alpha) per observed context.

    :param n: the order of the language model (ngram size), must be > 0
    :type n: C{int}
    :param train: the training text.  A single string-typed first element
        means the whole of C{train} is treated as one sentence.  A
        non-sequence iterable is copied into a list first; if C{train} is
        already a sequence whose elements are non-sequence iterables, those
        elements are replaced by lists IN PLACE.
    :type train: C{iterable} of C{string} or C{iterable} of C{iterable} of C{string}
    :param pad_left: whether to pad the left of each sentence with an (n-1)-gram of <s>
    :type pad_left: bool
    :param pad_right: whether to pad the right of each sentence with </s>
    :type pad_right: bool
    :param estimator: a function for generating a probability
        distribution---defaults to MLEProbDist.  For a unigram model it is
        called directly as C{estimator(fdist, bins, *estimator_args,
        **estimator_kwargs)}; for higher orders it is handed to
        C{ConditionalProbDist} as the per-context factory.
    :type estimator: callable
    :param estimator_args: Extra arguments for estimator.
        These arguments are usually used to specify extra
        properties for the probability distributions of individual
        conditions, such as the number of bins they contain.
        Note: For backward-compatibility, if no arguments are specified
        for an order > 1 model, the number of conditions in the underlying
        ConditionalFreqDist is passed to the estimator as an argument.
    :type estimator_args: (any)
    :param estimator_kwargs: Extra keyword arguments for the estimator
    :type estimator_kwargs: (any)
    :raise AssertionError: if the padding flags are not bools, if n <= 0,
        if estimator arguments are given without an estimator, or if the
        probability mass computed for some context is out of bounds.
    """

    # protection from cryptic behavior for calling programs
    # that use the pre-2.0.2 interface
    assert(isinstance(pad_left, bool))
    assert(isinstance(pad_right, bool))

    # make sure n is greater than zero, otherwise print it
    assert (n > 0), n

    # For explicitness save the check whether this is a unigram model
    self.is_unigram_model = (n == 1)
    # save the ngram order number
    self._n = n
    # save left and right padding
    self._lpad = ('<s>',) * (n - 1) if pad_left else ()
    # Need _rpad even for unigrams or padded entropy will give
    #  wrong answer because '</s>' will be treated as unseen...
    self._rpad = ('</s>',) if pad_right else ()
    self._padLen = len(self._lpad) + len(self._rpad)

    self._N = 0
    delta = 1 + self._padLen - n        # len(sent)+delta == ngrams in sent

    if estimator is None:
        # Compare by value/truthiness: ``is ()`` relied on the interpreter
        # sharing the empty tuple and triggers a SyntaxWarning on 3.8+.
        assert (not estimator_args) and (not estimator_kwargs),\
            "estimator_args (%s) or _kwargs supplied (%s), but no estimator"%(estimator_args,estimator_kwargs)
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # Given backoff, a generator isn't acceptable
    if not isinstance(train, collections.abc.Sequence):
        train = list(train)
    self._W = len(train)
    # Coerce to list of list -- note that this means to train charGrams,
    #  requires exploding the words ahead of time
    # Only peek at train[0] when there is one: an empty corpus used to
    # raise IndexError here (the old ``is not None`` test could never fail,
    # since len(train) above has already succeeded).
    if train:
        if isinstance(train[0], compat.string_types):
            train = [train]
            self._W = 1
        elif not isinstance(train[0], collections.abc.Sequence):
            # if you mix strings and generators, you have only yourself
            #  to blame!
            # Materialised in place so the sentences can be traversed again
            # by the lower-order backoff models.
            for i, sent in enumerate(train):
                train[i] = list(sent)

    if n == 1:
        if pad_right:
            sents = (chain(s, self._rpad) for s in train)
        else:
            sents = train
        fd = FreqDist()
        for s in sents:
            fd.update(s)
        if not estimator_args and not estimator_kwargs:
            self._model = estimator(fd, fd.B())
        else:
            self._model = estimator(fd, fd.B(),
                                    *estimator_args, **estimator_kwargs)
        self._N = fd.N()
    else:
        cfd = ConditionalFreqDist()
        self._ngrams = set()

        for sent in train:
            self._N += len(sent) + delta
            for ngram in ingrams(chain(self._lpad, sent, self._rpad), n):
                self._ngrams.add(ngram)
                context = tuple(ngram[:-1])
                token = ngram[-1]
                cfd[context][token] += 1
        if not estimator_args and not estimator_kwargs:
            self._model = ConditionalProbDist(cfd, estimator, len(cfd))
        else:
            self._model = ConditionalProbDist(cfd, estimator,
                                              *estimator_args, **estimator_kwargs)

    # recursively construct the lower-order models
    if not self.is_unigram_model:
        self._backoff = NgramModel(n-1, train,
                                   pad_left, pad_right,
                                   estimator,
                                   *estimator_args,
                                   **estimator_kwargs)

        # Code below here in this method, and the _words_following and _alpha method, are from
        # http://www.nltk.org/_modules/nltk/model/ngram.html "Last updated on Feb 26, 2015"
        self._backoff_alphas = dict()
        # For each condition (or context)
        for ctxt in cfd.conditions():
            backoff_ctxt = ctxt[1:]
            backoff_total_pr = 0.0
            total_observed_pr = 0.0

            # this is the subset of words that we OBSERVED following
            # this context.
            # i.e. Count(word | context) > 0
            for word in self._words_following(ctxt, cfd):
                total_observed_pr += self.prob(word, ctxt)
                # we also need the total (n-1)-gram probability of
                # words observed in this n-gram context
                backoff_total_pr += self._backoff.prob(word, backoff_ctxt)
            if isclose(total_observed_pr, 1.0):
                # snap accumulated rounding error so beta is exactly 0.0
                total_observed_pr = 1.0
            else:
                assert 0.0 <= total_observed_pr <= 1.0,\
                    "sum of probs for %s out of bounds: %.10g"%(ctxt,total_observed_pr)
            # beta is the remaining probability weight after we factor out
            # the probability of observed words.
            # As a sanity check, both total_observed_pr and backoff_total_pr
            # must be GE 0, since probabilities are never negative
            beta = 1.0 - total_observed_pr

            if beta != 0.0:
                # strict < 1.0 also protects the division below
                assert (0.0 <= backoff_total_pr < 1.0), \
                    "sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr)
                alpha_ctxt = beta / (1.0 - backoff_total_pr)
            else:
                assert ((0.0 <= backoff_total_pr < 1.0) or
                        isclose(1.0, backoff_total_pr)), \
                    "sum of backoff probs for %s out of bounds: %s"%(ctxt,backoff_total_pr)
                alpha_ctxt = 0.0

            self._backoff_alphas[ctxt] = alpha_ctxt