from nltk import word_tokenize
from nltk import probability
from nltk.probability import FreqDist, ConditionalFreqDist


def laplace_stuff():
    sent = "am ate ate apple am x."
    sent_tokenized = word_tokenize(sent)
    freq_dist = FreqDist(word.lower() for word in sent_tokenized)
    print(freq_dist.items())

    # Laplace (add-one) smoothing over the frequency distribution
    lap = probability.LaplaceProbDist(freq_dist)
    print(lap.generate())
    print(lap.prob("am"))
    print("Finished freq dist, starting cond dist")

    # Conditional frequency: count each token given the preceding word
    cond_dist = ConditionalFreqDist()
    context = None
    for token in sent_tokenized:
        outcome = token
        cond_dist[context][outcome] += 1
        context = token
    print(cond_dist["am"])
    print(cond_dist.items())
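
# A minimal follow-on sketch (our own illustration, not part of the original
# function): the same preceding-word counts can be wrapped in a
# ConditionalProbDist to get Laplace-smoothed next-word probabilities.
from nltk.probability import ConditionalProbDist, LaplaceProbDist


def bigram_model_sketch():
    tokens = word_tokenize("am ate ate apple am x.")
    # Count each token conditioned on the token before it (None at the start).
    cfd = ConditionalFreqDist()
    context = None
    for token in tokens:
        cfd[context][token] += 1
        context = token
    # Wrap every condition's FreqDist in an add-one-smoothed distribution.
    cpd = ConditionalProbDist(cfd, LaplaceProbDist)
    print(cpd["am"].prob("ate"))  # P(ate | am) with Laplace smoothing
    print(cpd["am"].generate())   # sample a word that follows "am"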
from collections import defaultdict
from functools import reduce

from nltk.metrics import f_measure
from nltk.probability import FreqDist, ConditionalFreqDist as CFD


class ContextIndex(object):
    """
    A bidirectional index between words and their 'contexts' in a text.
    The context of a word is usually defined to be the words that occur
    in a fixed window around the word; but other definitions may also be
    used by providing a custom context function.
    """

    @staticmethod
    def _default_context(tokens, i):
        """One left token and one right token, normalized to lowercase."""
        left = tokens[i - 1].lower() if i != 0 else '*START*'
        right = tokens[i + 1].lower() if i != len(tokens) - 1 else '*END*'
        return (left, right)

    def __init__(self, tokens, context_func=None, filter=None, key=lambda x: x):
        self._key = key
        self._tokens = tokens
        if context_func:
            self._context_func = context_func
        else:
            self._context_func = self._default_context
        if filter:
            tokens = [t for t in tokens if filter(t)]
        self._word_to_contexts = CFD(
            (self._key(w), self._context_func(tokens, i))
            for i, w in enumerate(tokens)
        )
        self._context_to_words = CFD(
            (self._context_func(tokens, i), self._key(w))
            for i, w in enumerate(tokens)
        )

    def tokens(self):
        """
        :rtype: list(str)
        :return: The document that this context index was created from.
        """
        return self._tokens

    def word_similarity_dict(self, word):
        """
        Return a dictionary mapping from words to 'similarity scores,'
        indicating how often these two words occur in the same context.
        """
        word = self._key(word)
        word_contexts = set(self._word_to_contexts[word])

        scores = {}
        for w, w_contexts in self._word_to_contexts.items():
            scores[w] = f_measure(word_contexts, set(w_contexts))

        return scores

    def similar_words(self, word, n=20):
        scores = defaultdict(int)
        for c in self._word_to_contexts[self._key(word)]:
            for w in self._context_to_words[c]:
                if w != word:
                    scores[w] += (
                        self._context_to_words[c][word]
                        * self._context_to_words[c][w]
                    )
        return sorted(scores, key=scores.get, reverse=True)[:n]

    def common_contexts(self, words, fail_on_unknown=False):
        """
        Find contexts where the specified words can all appear; and
        return a frequency distribution mapping each context to the
        number of times that context was used.

        :param words: The words used to seed the similarity search
        :type words: str
        :param fail_on_unknown: If true, then raise a value error if
            any of the given words do not occur at all in the index.
        """
        words = [self._key(w) for w in words]
        contexts = [set(self._word_to_contexts[w]) for w in words]
        empty = [words[i] for i in range(len(words)) if not contexts[i]]
        common = reduce(set.intersection, contexts)
        if empty and fail_on_unknown:
            raise ValueError(
                "The following word(s) were not found:", " ".join(words)
            )
        elif not common:
            # nothing in common -- just return an empty freqdist.
            return FreqDist()
        else:
            fd = FreqDist(
                c
                for w in words
                for c in self._word_to_contexts[w]
                if c in common
            )
            return fd
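
# A minimal usage sketch for the ContextIndex class above. The sample
# sentence and the `idx` / `demo_tokens` names are our own illustration,
# not part of the original code.
from nltk import word_tokenize

demo_tokens = word_tokenize(
    "the cat sat on the mat and the dog sat on the rug"
)
idx = ContextIndex(demo_tokens, key=lambda w: w.lower())
# 'cat' and 'dog' share the ('the', 'sat') context, so 'dog' scores high:
print(idx.similar_words("cat"))
# FreqDist of contexts in which both words appear:
print(idx.common_contexts(["cat", "dog"]))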
from nltk.corpus import movie_reviews
from nltk.probability import FreqDist
from nltk.probability import ConditionalFreqDist

word_fd = FreqDist()
label_word_fd = ConditionalFreqDist()

# Words from each review category (the original had the category lists
# swapped relative to the variable names; fixed here).
testNegWords = movie_reviews.words(categories=['neg'])
testPosWords = movie_reviews.words(categories=['pos'])

for word in testNegWords:
    word_fd[word.lower()] += 1
    label_word_fd['neg'][word.lower()] += 1
for word in testPosWords:
    word_fd[word.lower()] += 1
    label_word_fd['pos'][word.lower()] += 1

print(word_fd.N(), word_fd.B(), word_fd.most_common(20))
print(label_word_fd.N(), label_word_fd.conditions(), label_word_fd.items())
print(label_word_fd['pos'].N(), label_word_fd['neg'].N())


# In[ ]:


# Contingency counts for a word (w1) and a label (w2):
#
# n_ii = label_word_fd[label][word]   # word under this label
# n_ix = word_fd[word]                # word under any label
# n_xi = label_word_fd[label].N()     # all words under this label
# n_xx = label_word_fd.N()            # all words under all labels
#
#           w1     ~w1
#        ------  ------
#  w2    | n_ii | n_oi | = n_xi
#        ------  ------
#  ~w2   | n_io | n_oo |
#        ------  ------
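
# A hedged sketch of what these contingency counts are typically used for:
# scoring how informative each word is for the 'pos' label via chi-square.
# BigramAssocMeasures.chi_sq(n_ii, (n_ix, n_xi), n_xx) is real NLTK API;
# the `word_scores` / `pos_word_count` names are our own.
from nltk.metrics import BigramAssocMeasures

pos_word_count = label_word_fd['pos'].N()   # n_xi for the 'pos' label
total_word_count = label_word_fd.N()        # n_xx

word_scores = {}
for word, freq in word_fd.items():
    word_scores[word] = BigramAssocMeasures.chi_sq(
        label_word_fd['pos'][word],  # n_ii
        (freq, pos_word_count),      # (n_ix, n_xi)
        total_word_count,            # n_xx
    )

# The 20 words most strongly associated with positive reviews:
best = sorted(word_scores.items(), key=lambda kv: kv[1], reverse=True)[:20]
print(best)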
# `allwords` is assumed to be defined earlier as a list of corpus words.
A = set(allwords)
longwords = [w for w in A if len(w) > 12]  # all words longer than 12 characters
print(sorted(longwords))

from nltk.probability import FreqDist, ConditionalFreqDist
"""
FreqDist: builds a frequency distribution from the given data
    B(): number of distinct words
    N(): total number of words
    tabulate(20): display the top 20 entries as a table
    fd2.plot(20, cumulative=True): the cumulative parameter plots running totals
"""
fd2 = FreqDist([sx.lower() for sx in allwords if sx.isalpha()])
print("Number of distinct words: %d" % fd2.B())
print("Total number of words: %d" % fd2.N())
fd2.tabulate(20)  # display the top 20 entries as a table
fd2.plot(20)
fd2.plot(20, cumulative=True)

"""
freq('the'): relative frequency of the word 'the'
ConditionalFreqDist(): conditional frequency distribution, used to study
systematic differences between categories
"""
from nltk.corpus import inaugural

print(fd2.freq('the'))  # relative frequency of the word 'the'
# Fileids like '1981-Reagan.txt' sort lexically, so the string comparison
# below keeps only the 1981-2009 speeches.
cfd = ConditionalFreqDist(
    (fileid, len(w))
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    if '1980' < fileid < '2010'
)
print(cfd.items())
cfd.plot()
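
# A small follow-on sketch (our own illustration): instead of plotting,
# a ConditionalFreqDist can be tabulated for selected conditions and
# samples. The two fileids below are real inaugural corpus names that
# fall inside the 1981-2009 filter above.
cfd.tabulate(
    conditions=['1981-Reagan.txt', '2009-Obama.txt'],  # selected speeches
    samples=range(1, 11),                              # word lengths 1..10
)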