def fun_2_1():
    """Demonstrate n-gram generation and collocation finding with NLTK.

    Covers unigrams and quadgrams over the Alpino corpus, bigram
    collocations over the webtext corpus (with a stopword/punctuation
    filter), and fourgram frequencies over a short sentence.

    Side effects: prints results to stdout; requires the NLTK corpora
    (alpino, webtext, stopwords) to be downloaded.
    """
    from nltk.util import ngrams
    from nltk.corpus import alpino

    # A unigram is a single token; generate unigrams for the Alpino corpus.
    # (Originally used Python-2 ``print`` statements, which are syntax
    # errors under Python 3 -- converted to print() calls.)
    print(alpino.words())
    unigrams = ngrams(alpino.words(), 1)
    for i in unigrams:
        # print(i)
        pass

    # Quadgrams (fourgrams) from the same corpus.
    # (Renamed from the misleading original name ``unigrams``.)
    quadgrams = ngrams(alpino.words(), 4)
    for i in quadgrams:
        # print(i)
        pass

    # A bigram is a pair of tokens.  Lower-case the text first, then rank
    # bigrams with BigramCollocationFinder using the association measures
    # from nltk.metrics.
    from nltk.collocations import BigramCollocationFinder
    from nltk.corpus import webtext
    from nltk.metrics import BigramAssocMeasures

    tokens = [t.lower() for t in webtext.words('grail.txt')]
    words = BigramCollocationFinder.from_words(tokens)
    print(words.nbest(BigramAssocMeasures.likelihood_ratio, 10))

    # Add a word filter that removes stopwords and short tokens
    # (punctuation) before ranking again.
    from nltk.corpus import stopwords
    set1 = set(stopwords.words('english'))
    stops_filter = lambda w: len(w) < 3 or w in set1
    words.apply_word_filter(stops_filter)
    print(words.nbest(BigramAssocMeasures.likelihood_ratio, 10))

    # Another way to generate bigrams: tokenize raw text with a
    # collocation finder and score by raw frequency.
    import nltk
    text1 = "Hardwork is the key to success. Never give up!"
    word = nltk.tokenize.wordpunct_tokenize(text1)
    finder = BigramCollocationFinder.from_words(word)
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    value = finder.score_ngrams(bigram_measures.raw_freq)
    print(sorted(bigram for bigram, score in value))

    # Generate fourgrams together with their frequencies.
    text = "Hello how are you doing ? I hope you find the book interesting"
    tokens = nltk.wordpunct_tokenize(text)
    fourgrams = nltk.collocations.QuadgramCollocationFinder.from_words(tokens)
    for fourgram, freq in fourgrams.ngram_fd.items():
        print(fourgram, freq)
def __init__(self, config, dont_ask=False):
    """Initialise the ruleset helper for Dutch.

    Loads the word database either from a file (when the config says
    ``type == "file"``) or from the NLTK Alpino corpus, then records
    language metadata and character tables used by the ruleset.

    :param config: configuration mapping; ``config["worddb"]`` selects
        the word-database backend and, for files, its name.
    :param dont_ask: forwarded to ``RulesetCommon.__init__``;
        presumably suppresses interactive prompts -- confirm in the
        base class, which is not visible here.
    """
    RulesetCommon.__init__(self, config, dont_ask)
    if config["worddb"]["type"] == "file":
        self.setWordDbFile(config["worddb"]["name"])
    else:
        # Fall back to the Dutch Alpino corpus as the word database.
        from nltk.corpus import alpino
        self.worddb = set(alpino.words())
    self.language = LANGUAGE
    self.lang_id = self.db.languages[LANGUAGE]
    # Vowels plus common Dutch vowel digraphs/diphthongs.
    self.vowels = ["a", "e", "i", "u", "o"]
    self.double_chars = [
        "oe", "ou", "au", "ij", "ui", "ie", "ei", "eu", "oi", "ai"
    ]
import nltk
from nltk.util import ngrams
from nltk.corpus import alpino

# Show the corpus token view, then emit every trigram (3-token window).
print(alpino.words())
for trigram in ngrams(alpino.words(), 3):
    print(trigram)
from nltk.util import ngrams
from nltk.corpus import alpino

print(alpino.words())

# Build 4-grams (quadgrams) over the Alpino corpus.  ngrams() is lazy,
# so printing the generator only shows its repr.
unigrams = ngrams(alpino.words(), 4)
print(unigrams)
# for i in unigrams:
#     print(i)

from nltk.collocations import BigramCollocationFinder
from nltk.corpus import webtext
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import stopwords

english_stops = set(stopwords.words('english'))

def _is_noise(token):
    # Drop tokens shorter than 3 chars (mostly punctuation) and stopwords.
    return len(token) < 3 or token in english_stops

lowered = [t.lower() for t in webtext.words('grail.txt')]
words = BigramCollocationFinder.from_words(lowered)  # collocation finder
print(words)
words.apply_word_filter(_is_noise)
res = words.nbest(BigramAssocMeasures.likelihood_ratio, 5)  # top-5 bigrams
print(res)

# Generate bigrams with a collocation finder over a tokenized sentence.
import nltk
text1 = "Hardwork is the key to success. Never give up!"
sentence_tokens = nltk.wordpunct_tokenize(text1)
finder = BigramCollocationFinder.from_words(sentence_tokens)
bigram_measures = BigramAssocMeasures()
value = finder.score_ngrams(bigram_measures.raw_freq)
print(sorted(bigram for bigram, score in value))
import nltk
from nltk.util import ngrams
from nltk.corpus import alpino

# Print the corpus view, then every adjacent token pair (bigram).
print(alpino.words())
for pair in ngrams(alpino.words(), 2):
    print(pair)
'English: Brown Corpus (Learned)': lambda: brown.words(categories='learned'), 'English: Brown Corpus (Science Fiction)': lambda: brown.words(categories='science_fiction'), 'English: Brown Corpus (Romance)': lambda: brown.words(categories='romance'), 'English: Brown Corpus (Humor)': lambda: brown.words(categories='humor'), 'English: NPS Chat Corpus': lambda: nps_chat.words(), 'English: Wall Street Journal Corpus': lambda: treebank.words(), 'Chinese: Sinica Corpus': lambda: sinica_treebank.words(), 'Dutch: Alpino Corpus': lambda: alpino.words(), 'Hindi: Indian Languages Corpus': lambda: indian.words(files='hindi.pos'), 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.words(), 'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.words(), 'Portuguese: Machado Corpus (Brazil)': lambda: machado.words(), 'Spanish: CESS-ESP Corpus': lambda: cess_esp.words() } class CollocationsView: _BACKGROUND_COLOUR='#FFF' #white
import nltk
from nltk.util import ngrams
from nltk.corpus import alpino

# Bigrams over the Alpino corpus: dump the corpus view first, then each pair.
print(alpino.words())
for token_pair in ngrams(alpino.words(), 2):
    print(token_pair)
import nltk
from nltk.util import ngrams
from nltk.corpus import alpino

# Unigrams are 1-token windows; print each as a singleton tuple.
print(alpino.words())
for unigram in ngrams(alpino.words(), 1):
    print(unigram)
POLL_INTERVAL = 100 _DEFAULT = "English: Brown Corpus (Humor)" _CORPORA = { "Catalan: CESS-CAT Corpus": lambda: cess_cat.words(), "English: Brown Corpus": lambda: brown.words(), "English: Brown Corpus (Press)": lambda: brown.words(categories=["news", "editorial", "reviews"]), "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"), "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"), "English: Brown Corpus (Science Fiction)": lambda: brown.words(categories="science_fiction"), "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"), "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"), "English: NPS Chat Corpus": lambda: nps_chat.words(), "English: Wall Street Journal Corpus": lambda: treebank.words(), "Chinese: Sinica Corpus": lambda: sinica_treebank.words(), "Dutch: Alpino Corpus": lambda: alpino.words(), "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(), "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(), "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(), } class CollocationsView: _BACKGROUND_COLOUR = "#FFF" # white def __init__(self): self.queue = q.Queue() self.model = CollocationsModel(self.queue) self.top = Tk()
import nltk
from nltk.util import ngrams
from nltk.corpus import alpino

# Quadgrams (4-token windows) over the Alpino corpus.
print(alpino.words())
for window in ngrams(alpino.words(), 4):
    print(window)
import nltk
from nltk.util import ngrams
from nltk.corpus import alpino

# Emit each token of the corpus as a unigram (1-tuple).
print(alpino.words())
for gram in ngrams(alpino.words(), 1):
    print(gram)
"""统计语言建模""" """计算句子中某种语言模式出现概率的统计模型 把自然语言作为模型进行统计分析""" import nltk from nltk import ngrams, BigramCollocationFinder, BigramAssocMeasures, unique_list, KneserNeyProbDist from nltk.corpus import alpino, webtext, stopwords """单词分组 util.py""" n = 4 grams = ngrams(alpino.words(), n) # for i in grams: # print(i) out = list(ngrams([1, 2, 3, 4, 5], 3)) print(out) # [(1, 2, 3), (2, 3, 4), (3, 4, 5)] set = set(stopwords.words('english')) stops_filter = lambda w: len(w) < 3 or w in set tokens = [t.lower() for t in webtext.words('grail.txt')] words = BigramCollocationFinder.from_words(tokens) words.apply_word_filter(stops_filter) out = words.nbest(BigramAssocMeasures.likelihood_ratio, 10) print(out) """最大似然估计的目的就是:利用已知的样本结果,反推最有可能(最大概率)导致这样结果的参数值。""" """最大似然估计wiki https://zh.wikipedia.org/zh-cn/%E6%9C%80%E5%A4%A7%E4%BC%BC%E7%84%B6%E4%BC%B0%E8%AE%A1""" """隐马尔科夫模型估计 HMM""" corpus = nltk.corpus.brown.tagged_sents(categories='adventure')[:700] print(len(corpus)) tag_set = unique_list(tag for sent in corpus for (word, tag) in sent) print(len(tag_set)) """平滑""" # gt = lambda fd, bins:SimpleGoodTuringProbDist(fd, bins=1e5) # train_and_test(gt) corpus = [[((x[0], y[0], z[0]), (x[1], y[1], z[1]))
import sys

import nltk
from nltk.util import ngrams
from nltk.corpus import alpino

# Emit every 4-gram (quadgram) from the Alpino corpus.
# FIX: renamed from the misleading original name ``unigrams``.
quadgrams = ngrams(alpino.words(), 4)
for gram in quadgrams:
    print(gram)

# FIX: use sys.exit() instead of the bare exit() helper -- exit() is
# injected by the ``site`` module for interactive use and is not
# guaranteed to exist in all execution contexts.
sys.exit()
import nltk
from nltk.util import ngrams
from nltk.corpus import alpino

# Dump the corpus view, then each sliding 4-token window (quadgram).
print(alpino.words())
for four_tuple in ngrams(alpino.words(), 4):
    print(four_tuple)
'''
@author:KongWeiKun
@file: wordFrequency.py
@time: 18-3-31 1:17 PM
@contact: [email protected]
'''
import nltk
from nltk.util import ngrams
from nltk.corpus import alpino

# Print the Alpino corpus word list (a lazy NLTK corpus view).
print(alpino.words())
'English: Brown Corpus (Learned)': lambda: brown.words(categories='learned'), 'English: Brown Corpus (Science Fiction)': lambda: brown.words(categories='science_fiction'), 'English: Brown Corpus (Romance)': lambda: brown.words(categories='romance'), 'English: Brown Corpus (Humor)': lambda: brown.words(categories='humor'), 'English: NPS Chat Corpus': lambda: nps_chat.words(), 'English: Wall Street Journal Corpus': lambda: treebank.words(), 'Chinese: Sinica Corpus': lambda: sinica_treebank.words(), 'Dutch: Alpino Corpus': lambda: alpino.words(), 'Hindi: Indian Languages Corpus': lambda: indian.words(files='hindi.pos'), 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.words(), 'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.words(), 'Portuguese: Machado Corpus (Brazil)': lambda: machado.words(), 'Spanish: CESS-ESP Corpus': lambda: cess_esp.words() } class CollocationsView: _BACKGROUND_COLOUR = '#FFF' # white
"Catalan: CESS-CAT Corpus": lambda: cess_cat.words(), "English: Brown Corpus": lambda: brown.words(), "English: Brown Corpus (Press)": lambda: brown.words( categories=["news", "editorial", "reviews"] ), "English: Brown Corpus (Religion)": lambda: brown.words(categories="religion"), "English: Brown Corpus (Learned)": lambda: brown.words(categories="learned"), "English: Brown Corpus (Science Fiction)": lambda: brown.words( categories="science_fiction" ), "English: Brown Corpus (Romance)": lambda: brown.words(categories="romance"), "English: Brown Corpus (Humor)": lambda: brown.words(categories="humor"), "English: NPS Chat Corpus": lambda: nps_chat.words(), "English: Wall Street Journal Corpus": lambda: treebank.words(), "Chinese: Sinica Corpus": lambda: sinica_treebank.words(), "Dutch: Alpino Corpus": lambda: alpino.words(), "Hindi: Indian Languages Corpus": lambda: indian.words(files="hindi.pos"), "Portuguese: Floresta Corpus (Portugal)": lambda: floresta.words(), "Portuguese: MAC-MORPHO Corpus (Brazil)": lambda: mac_morpho.words(), "Portuguese: Machado Corpus (Brazil)": lambda: machado.words(), "Spanish: CESS-ESP Corpus": lambda: cess_esp.words(), } class CollocationsView: _BACKGROUND_COLOUR = "#FFF" # white def __init__(self): self.queue = q.Queue() self.model = CollocationsModel(self.queue) self.top = Tk()