def split_all():
    show_subtitle("直接基于所有数据进行分割")
    tagged_sents = list(brown.tagged_sents(categories='news'))
    random.shuffle(tagged_sents)
    size = int(len(tagged_sents) * 0.1)
    train_set, test_set = tagged_sents[size:], tagged_sents[:size]
    return train_set, test_set
def sec02():
    show_subtitle("简化说明符")
    fmt_str = "Hello, %s. %s enough for ya?"
    values = ('world', 'Hot')
    print(fmt_str % values)

    show_subtitle("模板字符串")
    from string import Template
    tmpl = Template("Hello, $who! $what enough for ya?")
    print(tmpl.substitute(who="Mars", what="Dusty"))
print("'\w+(\w)'= ", nltk.regexp_tokenize(text, '\w+(\w)')) # 每个单词,取最后那个字母 print("'\w(-\w)'= ", nltk.regexp_tokenize(text, '\w(-\w)')) print("'\w+(-\w)'= ", nltk.regexp_tokenize(text, '\w+(-\w)')) print("'\w(-\w+)'= ", nltk.regexp_tokenize(text, '\w(-\w+)')) print("'\w+(-\w+)'= ", nltk.regexp_tokenize(text, '\w+(-\w+)')) print("'\w(-\w+)*'= ", nltk.regexp_tokenize(text, '\w(-\w+)*')) print("'\w+(-\w+)*'= ", nltk.regexp_tokenize(text, '\w+(-\w+)*')) print("'\w+(?:)'))= ", nltk.regexp_tokenize(text, '\w+(?:)')) print("'\w+(?:)+'))= ", nltk.regexp_tokenize(text, '\w+(?:)+')) print("'\w+(?:\w)'))= ", nltk.regexp_tokenize(text, '\w+(?:\w)')) print("'\w+(?:\w+)'))= ", nltk.regexp_tokenize(text, '\w+(?:\w+)')) print("'\w+(?:\w)*'))= ", nltk.regexp_tokenize(text, '\w+(?:\w)*')) print("'\w+(?:\w+)*'))= ", nltk.regexp_tokenize(text, '\w+(?:\w+)*')) print("'\.\.\.'= ", nltk.regexp_tokenize(text, '\.\.\.')) print("'\.\.\.|([A-Z]\.)+'= ", nltk.regexp_tokenize(text, '\.\.\.|([A-Z]\.)+')) # (?:) 非捕捉组用法对比 inputStr = "hello 123 world 456 nihao 789" rePatternAllCapturingGroup = "\w+ (\d+) \w+ (\d+) \w+ (\d+)" rePatternWithNonCapturingGroup = "\w+ (\d+) \w+ (?:\d+) \w+ (\d+)" show_subtitle(rePatternAllCapturingGroup) nltk.regexp_tokenize(inputStr, rePatternAllCapturingGroup) show_subtitle(rePatternWithNonCapturingGroup) nltk.regexp_tokenize(inputStr, rePatternWithNonCapturingGroup) # 3.7.3 进一步讨论分词 # 分词:比预期更为艰巨,没有任何单一的解决方案可以在所有领域都行之有效。 # 在开发分词器时,访问已经手工飘游好的原始文本则理有好处,可以将分词器的输出结果与高品质(也叫「黄金标准」)的标注进行比较。
unigram_tagger.evaluate(brown_tagged_sents)

# 5.5.2 Separating the training data from the testing data
# Train a unigram tagger on the training data, then evaluate its accuracy on the test data
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train_sents)
unigram_tagger.evaluate(test_sents)

# 5.5.3 General N-gram tagging
# Bigram tagger
bigram_tagger = nltk.BigramTagger(train_sents)
# Tag a sentence from the training data
show_subtitle("bigram_tagger.tag(brown_sents[2007])")
print(bigram_tagger.tag(brown_sents[2007]))
# Tag a sentence from the test data
show_subtitle("bigram_tagger.tag(brown_sents[4203])")
print(bigram_tagger.tag(brown_sents[4203]))
bigram_tagger.evaluate(test_sents)  # overall accuracy is very low because of the data-sparseness problem

# 5.5.4 Combining taggers: the standalone bigram tagger did badly (data sparseness),
# so back off from more specific taggers to more general ones
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t1.evaluate(test_sents)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t2.evaluate(test_sents)  # this combination performs best
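# A hedged sketch extending the backoff chain above with a trigram tagger;
# the cutoff=2 value is an illustrative choice, not part of the original code.
t3 = nltk.TrigramTagger(train_sents, cutoff=2, backoff=t2)
print(t3.evaluate(test_sents))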
groucho_grammar = nltk.CFG.fromstring('''
    S -> NP VP
    PP -> P NP
    NP -> Det N | Det N PP | "I"
    VP -> V NP | VP PP
    Det -> "an" | "my"
    N -> "elephant" | "pajamas"
    V -> "shot"
    P -> "in"
    ''')
# One grammar can assign two different structures to the same sentence
sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
parser = nltk.ChartParser(groucho_grammar)  # chart parsing
for i, tree in enumerate(parser.parse(sent)):
    show_subtitle(f"第 {i + 1} 个结构")
    print(tree)

# 8.2 What's the use of syntax?
# 8.2.1 Beyond n-grams
# A constituent is a unit formed by words combining with one another.
# In a grammatical sentence, a word sequence can be replaced by a shorter sequence
# without making the sentence ungrammatical; every sequence that forms a unit
# can be replaced by a single word.
# Sentences can be arbitrarily long, so phrase-structure trees can be arbitrarily deep.
# Since the chunking of Sec 7.4 can only produce structures of bounded depth,
# chunking is not suitable for full syntactic analysis.

# 8.3 Context-free grammars (CFG)
# 8.3.1 A simple grammar
# Ex8-1 An example of a simple context-free grammar
grammar1 = nltk.CFG.fromstring("""
    S -> NP VP
# 10.1.1 Querying a database
# The idea and the technical framework of meaning representation
# Drawbacks of translating English into SQL inside the grammar:
#   1) some formulas licensed by the grammar are not well-formed SQL
#   2) details of the database are hard-wired into the grammar
nltk.data.show_cfg('grammars/book_grammars/sql0.fcfg')

from nltk import load_parser

cp = load_parser('grammars/book_grammars/sql0.fcfg')
# cp = load_parser('grammars/book_grammars/sql0.fcfg', trace=3)
query = 'What cities are located in China'
trees = list(cp.parse(query.split()))
answer = trees[0].label()['SEM']
query_sql = ' '.join(answer)
show_subtitle("query")
print(query)
show_subtitle("trees")
for tree in trees:
    print(tree)
show_subtitle("answer")
print(answer)
show_subtitle("query_sql")
print(query_sql)

# NLTK Semantic Interpretation Package; SEM = semantics feature
from nltk.sem import chat80

rows = chat80.sql_query('corpora/city_database/city.db', query_sql)
for r in rows:
    print(r[0], end=" ")
''' print("sent.split()= ", sent.split()) split_sent_to_tuple = [nltk.tag.str2tuple(t) for t in sent.split()] print("split_sent_to_tuple= ", split_sent_to_tuple) # 5.2.2 读取已经标注的语料库 # 打开brown语料库的ca01文件,可以看到下面的内容: # The/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl said/vbd Friday/nr an/at # investigation/nn of/in Atlanta's/np$ recent/jj primary/nn election/nn produced/vbd # ``/`` no/at evidence/nn ''/'' that/cs any/dti irregularities/nns took/vbd place/nn ./. # 这个是已经标注好的语料库,可以使用函数tagged_words()直接读取 # tagset='universal' 使用通用标注集进行词类标注 # simplify_tags 不再支持 show_subtitle("brown.tagged_words()") print(brown.tagged_words()) show_subtitle("brown.tagged_words(tagset='universal')") print(brown.tagged_words(tagset='universal')) show_subtitle("nltk.corpus.treebank.tagged_words()") print(nltk.corpus.treebank.tagged_words()) show_subtitle("nltk.corpus.treebank.tagged_words(tagset='universal')") print(nltk.corpus.treebank.tagged_words(tagset='universal')) show_subtitle("nltk.corpus.nps_chat.tagged_words()") print(nltk.corpus.nps_chat.tagged_words()) show_subtitle("nltk.corpus.nps_chat.tagged_words(tagset='universal')") print(nltk.corpus.nps_chat.tagged_words(tagset='universal')) show_subtitle("nltk.corpus.conll2000.tagged_words()")
saying = [
    'After', 'all', 'is', 'said', 'and', 'done', ',',
    'more', 'is', 'said', 'than', 'done', '.'
]
for word in saying:
    print(word, '(' + str(len(word)) + ')', end=' ')

# Wrap long output automatically when displaying text
from textwrap import fill

format = '%s_(%d)'
pieces = [format % (word, len(word)) for word in saying]
output = ', '.join(pieces)
wrapped = fill(output)  # wrapped for display
show_subtitle(format)
print(wrapped)

format = '{}_({})'
pieces = [f'{word}_({len(word)})' for word in saying]
output = ', '.join(pieces)
wrapped = fill(output)  # wrapped for display
show_subtitle(format)
print(wrapped)

# 3.10 Summary
# - Characters in a string are accessed by index, counting from zero (`str[0]`)
# - Substrings are accessed using slice notation (`str[3:5]`)
# - Strings can be split into lists (`str.split()`); lists can be joined back into strings with `''.join(list)`
# - Text can be read from a file or from a URL
# - Tokenization splits text into basic units, or tokens, such as words and punctuation;
#   tokenizing on whitespace alone is not adequate for real applications
# Split the data set into a training set and a test set
text = nltk.corpus.nps_chat.words()
cut = int(0.9 * len(text))
training_data, test_data = text[:cut], text[cut:]
print("len(training_data) / len(test_data)= ", len(training_data) / len(test_data))

# Tokenizing with the split() function
raw = 'I turned off the spectroroute'
words = raw.split()
print("words= ", words)
wordlens = [(len(word), word) for word in words]
print("wordlens= ", wordlens)
wordlens.sort()
show_subtitle("wordlens.sort()")
print("wordlens= ", wordlens)
wordlens.reverse()
show_subtitle("wordlens.reverse()")
print("wordlens= ", wordlens)
wordlens.pop()
show_subtitle("wordlens.pop()")
print("wordlens= ", wordlens)
print("' '.join(w for (_, w) in wordlens)= ", ' '.join(w for (_, w) in wordlens))

# Tuples are immutable, whereas lists are mutable
lexicon = [('the', 'det', ['Di:', 'D@']), ('off', 'prep', ['Qf', 'O:f'])]
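# A brief sketch of the mutability contrast noted above (illustrative only,
# not part of the original code):
entry = lexicon[0]                   # a tuple: ('the', 'det', ['Di:', 'D@'])
try:
    entry[0] = 'THE'                 # tuples cannot be modified in place
except TypeError as err:
    print("tuple assignment failed:", err)
lexicon[0] = ('a', 'det', ['eI'])    # but the enclosing list can be modified
print("lexicon= ", lexicon)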
    # The RTEFeatureExtractor class builds a bag of words that occur in both
    # the text and the hypothesis, after discarding some stopwords
    extractor = nltk.RTEFeatureExtractor(rtepair)
    features = {}
    # Compute the overlap and the difference features
    features['word_overlap'] = len(extractor.overlap('word'))
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne'))
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
    return features


# Fetch one text/hypothesis pair from the RTE data
rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]
extractor = nltk.RTEFeatureExtractor(rtepair)
show_subtitle("文本中的单词")
print(extractor.text_words)
show_subtitle("假设中的单词")
print(extractor.hyp_words)
show_subtitle("文本和假设中重叠的单词(非实体词)")
print(extractor.overlap('word'))
show_subtitle("文本和假设中重叠的实体词")
print(extractor.overlap('ne'))
show_subtitle("文本和假设中差异的单词(非实体词)")
print(extractor.hyp_extra('word'))
show_subtitle("文本和假设中差异的实体词")
print(extractor.hyp_extra('ne'))

# 2.4 Scaling up to larger datasets
# NLTK supports dedicated machine-learning packages; their classifiers
# can perform better than the ones NLTK provides
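# A hedged sketch of the external machine-learning support mentioned above:
# wrap a scikit-learn model in NLTK's SklearnClassifier and train it on RTE
# feature dictionaries. `rte_features` is assumed to be the name of the
# feature-extraction function whose body appears above; the train/test split
# is an illustrative choice.
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression

rte_pairs = nltk.corpus.rte.pairs(['rte3_dev.xml'])
labeled = [(rte_features(pair), pair.value) for pair in rte_pairs]
train_data, dev_data = labeled[100:], labeled[:100]
sk_classifier = SklearnClassifier(LogisticRegression()).train(train_data)
print("accuracy= ", nltk.classify.accuracy(sk_classifier, dev_data))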
def split_categories():
    show_subtitle("基于文章类型进行数据分割")
    train_set, test_set = brown.tagged_sents(categories='news'), brown.tagged_sents(categories='fiction')
    return train_set, test_set
def split_file_ids():
    show_subtitle("基于文章名称进行数据分割")
    file_ids = brown.fileids(categories='news')
    size = int(len(file_ids) * 0.1)
    train_set, test_set = brown.tagged_sents(file_ids[size:]), brown.tagged_sents(file_ids[:size])
    return train_set, test_set
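# A hedged usage sketch (not in the original code): compare the three splitting
# strategies above by training a unigram tagger on each training set and
# evaluating it on the corresponding test set.
for split in (split_all, split_categories, split_file_ids):
    train_set, test_set = split()
    tagger = nltk.UnigramTagger(list(train_set))
    print(split.__name__, tagger.evaluate(list(test_set)))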
def sec03():
    # Formatting is controlled with a mini-language (the format specification mini-language).
    # Each value is inserted into the string in place of a replacement field enclosed in braces.
    # Doubling the braces inserts literal braces
    print("{{ceci n'est pas une replacement field}}".format())

    # Named and positional replacement fields
    print("{foo} {} {bar} {}".format(1, 2, bar=4, foo=3))
    print("{foo} {1} {bar} {0}".format(1, 2, bar=4, foo=3))

    # Indexing into an argument
    fullname = ['Alfred', 'Smoketoomuch']
    print("Mr {name[1]}".format(name=fullname))

    # Passing a module as an argument
    import math
    tmpl = "The {mod.__name__} module defines the value {mod.pi} for π"
    print(tmpl.format(mod=math))

    show_subtitle("表3-1 字符串格式设置中的类型说明符")
    # Conversion flags: s stands for str, r for repr, a for ascii
    # str produces an ordinary-looking string version of the value
    # repr produces a Python representation of the value
    # ascii produces a representation containing only ASCII characters
    print("{pi!s} {pi!r} {pi!a}".format(pi="π"))

    # Type f means fixed-point number; type b means binary
    print("The number is {num}".format(num=42))
    print("The number is {num:f}".format(num=42))
    print("The number is {num:b}".format(num=42))

    # Setting the field width
    print("-->{num:10} {num:10}<--".format(num=3))
    print("-->{name:10} {name:10}<--".format(name="Bob"))

    # Setting the precision
    from math import pi  # pi is used by the examples below
    print("0123456789012345678901234567890123456789")
    print("Pi day is {pi:.2f}".format(pi=pi))
    print("Pi day is {pi:10.2f}".format(pi=pi))
    # Precision also applies to strings
    print("{:.5}".format("Guido van Rossum"))
    # A comma adds thousands separators
    print("One googol is {:,}".format(10 ** 100))

    # Padding with zeros
    show_subtitle("填充")
    print("{:010.2f}".format(pi))
    print("{:010.2f}".format(-pi))
    print("{:+10.2f}".format(pi))
    print("{:+10.2f}".format(-pi))
    print("{:-10.2f}".format(pi))
    print("{:-10.2f}".format(-pi))
    print("{: 10.2f}".format(pi))

    # Alignment: left (<), right (>), centered (^)
    show_subtitle("对齐")
    print("{0:<010.2f}\n{0:^010.2f}\n{0:>010.2f}".format(pi))
    print("{:$^15}".format(" WIN BIG "))

    # The = specifier places the padding between the sign and the digits
    print("{0:10.2f}\t{1:10.2f}".format(pi, -pi))
    print("{0:10.2f}\t{1:=10.2f}".format(pi, -pi))

    # Sign specifiers: -, + and space
    print("{0:.2}\t{1:.2}".format(pi, -pi))
    print("{0:-.2}\t{1:-.2}".format(pi, -pi))  # the default
    print("{0:+.2}\t{1:+.2}".format(pi, -pi))
    print("{0: .2}\t{1: .2}".format(pi, -pi))

    # The hash sign (#) adds a prefix showing the number base
    print("{:b}".format(42))
    print("{:#b}".format(42))
    print("{:o}".format(42))
    print("{:#o}".format(42))
    print("{:x}".format(42))
    print("{:#x}".format(42))
    # With the general format (g), # keeps the decimal point
    print("{:g}".format(0x2a))
    print("{:#g}".format(0x2a))
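    # Aside (not in the original listing): the same format-specification
    # mini-language also works inside f-strings
    print(f"{pi:010.2f}  {42:#x}  {'Guido van Rossum':.5}")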
nltk.pos_tag(text)
nltk.help.upenn_tagset('CC')
nltk.help.upenn_tagset('RB')
nltk.help.upenn_tagset('IN')
nltk.help.upenn_tagset('NN')
nltk.help.upenn_tagset('JJ')
nltk.corpus.brown.readme()
print(nltk.corpus.gutenberg.readme())

# Homonyms: the tagger labels these correctly
# the first refuse (refUSE) is a verb, the second (REFuse) is a noun
# the first permit is a verb, the second permit is a noun
text = word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text)
text = word_tokenize("They refuse to permit us to obtain the beautiful book")
nltk.pos_tag(text)

# Find contexts of the form w1 w w2, then find all words w' occurring in the same context, i.e. w1 w' w2
# This is used to find similar words, because they appear in the same contexts
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
show_subtitle("text.similar('word')")
text.similar('word')
show_subtitle("text.similar('woman')")
text.similar('woman')
show_subtitle("text.similar('bought')")
text.similar('bought')
show_subtitle("text.similar('over')")
text.similar('over')
show_subtitle("text.similar('the')")
text.similar('the')