Example #1
def split_all():
    show_subtitle("直接基于所有数据进行分割")
    tagged_sents = list(brown.tagged_sents(categories='news'))
    random.shuffle(tagged_sents)
    size = int(len(tagged_sents) * 0.1)
    train_set, test_set = tagged_sents[size:], tagged_sents[:size]
    return train_set, test_set
Example #2
def sec02():
    show_subtitle("简化说明符")
    fmt_str = "Hello, %s. %s enough for ya?"
    values = ('world', 'Hot')
    print(fmt_str % values)
    show_subtitle("模板字符串")
    from string import Template
    tmpl = Template("Hello, $who! $what enough for ya?")
    print(tmpl.substitute(who="Mars", what="Dusty"))
    pass
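# An added sketch (not part of the original sec02): the same output produced with
# str.format and an f-string, the modern equivalents of %-formatting and Template.substitute.
# The helper name sec02_modern is made up for illustration.
def sec02_modern():
    print("Hello, {who}. {what} enough for ya?".format(who="world", what="Hot"))
    who, what = "Mars", "Dusty"
    print(f"Hello, {who}! {what} enough for ya?")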
Example #3
print("'\w+(\w)'= ", nltk.regexp_tokenize(text, '\w+(\w)'))  # 每个单词,取最后那个字母
print("'\w(-\w)'= ", nltk.regexp_tokenize(text, '\w(-\w)'))
print("'\w+(-\w)'= ", nltk.regexp_tokenize(text, '\w+(-\w)'))
print("'\w(-\w+)'= ", nltk.regexp_tokenize(text, '\w(-\w+)'))
print("'\w+(-\w+)'= ", nltk.regexp_tokenize(text, '\w+(-\w+)'))
print("'\w(-\w+)*'= ", nltk.regexp_tokenize(text, '\w(-\w+)*'))
print("'\w+(-\w+)*'= ", nltk.regexp_tokenize(text, '\w+(-\w+)*'))

print("'\w+(?:)'))= ", nltk.regexp_tokenize(text, '\w+(?:)'))
print("'\w+(?:)+'))= ", nltk.regexp_tokenize(text, '\w+(?:)+'))
print("'\w+(?:\w)'))= ", nltk.regexp_tokenize(text, '\w+(?:\w)'))
print("'\w+(?:\w+)'))= ", nltk.regexp_tokenize(text, '\w+(?:\w+)'))
print("'\w+(?:\w)*'))= ", nltk.regexp_tokenize(text, '\w+(?:\w)*'))
print("'\w+(?:\w+)*'))= ", nltk.regexp_tokenize(text, '\w+(?:\w+)*'))

print("'\.\.\.'= ", nltk.regexp_tokenize(text, '\.\.\.'))
print("'\.\.\.|([A-Z]\.)+'= ", nltk.regexp_tokenize(text, '\.\.\.|([A-Z]\.)+'))

# Comparing fully-capturing groups with a non-capturing (?:...) group
inputStr = "hello 123 world 456 nihao 789"
rePatternAllCapturingGroup = r"\w+ (\d+) \w+ (\d+) \w+ (\d+)"
rePatternWithNonCapturingGroup = r"\w+ (\d+) \w+ (?:\d+) \w+ (\d+)"
show_subtitle(rePatternAllCapturingGroup)
print(nltk.regexp_tokenize(inputStr, rePatternAllCapturingGroup))
show_subtitle(rePatternWithNonCapturingGroup)
print(nltk.regexp_tokenize(inputStr, rePatternWithNonCapturingGroup))
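# For comparison, an added sketch using the standard re module: re.findall returns the
# captured groups when a pattern contains groups, so the non-capturing (?:...) variant
# yields one fewer item per match.
import re
print(re.findall(rePatternAllCapturingGroup, inputStr))      # [('123', '456', '789')]
print(re.findall(rePatternWithNonCapturingGroup, inputStr))  # [('123', '789')]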

# 3.7.3 Further issues in tokenization
# Tokenization is harder than it looks: no single solution works well across every domain.
# When developing a tokenizer, it helps to have access to raw text that has been manually tokenized,
# so the tokenizer's output can be compared against high-quality ("gold standard") annotations
# (a small sketch of such a comparison follows).
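# An added sketch of comparing a tokenizer against a manually prepared gold standard;
# the sentence and the gold token list below are made up for illustration.
raw_example = "Good muffins cost $3.88 in New York."
gold_tokens = ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.']
hypothesis = nltk.regexp_tokenize(raw_example, r'\w+|\$[\d.]+|\S+')
print("tokenizer output:", hypothesis)
print("matches the gold standard:", hypothesis == gold_tokens)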
Example #4
unigram_tagger.evaluate(brown_tagged_sents)  # evaluated on the same sentences used for training, so the score is over-optimistic

# 5.5.2 Splitting the data into a training set and a test set
# Train the unigram tagger on the training data and evaluate its accuracy on the test data
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train_sents)
unigram_tagger.evaluate(test_sents)

# 5.5.3 General N-gram tagging
# A bigram tagger
bigram_tagger = nltk.BigramTagger(train_sents)

# Tag a sentence from the training portion of the corpus
show_subtitle("bigram_tagger.tag(brown_sents[2007])")
print(bigram_tagger.tag(brown_sents[2007]))

# Tag a sentence from the test portion of the corpus
show_subtitle("bigram_tagger.tag(brown_sents[4203])")
print(bigram_tagger.tag(brown_sents[4203]))

bigram_tagger.evaluate(test_sents)  # the overall accuracy is very low because of the data-sparseness problem

# 5.5.4 Combining taggers: chain more specific taggers to more general ones via backoff
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t1.evaluate(test_sents)

t2 = nltk.BigramTagger(train_sents, backoff=t1)
t2.evaluate(test_sents)  # the combined tagger performs best
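# An added sketch: the backoff chain can be extended one more level with a trigram tagger;
# with sparse training data the gain over t2 is usually small.
t3 = nltk.TrigramTagger(train_sents, backoff=t2)
print("t3 accuracy:", t3.evaluate(test_sents))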
Example #5
groucho_grammar = nltk.CFG.fromstring('''
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | "I"
VP -> V NP | VP PP
Det -> "an" | "my"
N -> "elephant" | "pajamas"
V -> "shot"
P -> "in"
''')

# Parsing the sentence with this grammar yields two different structures (the ambiguity behind the Groucho Marx joke)
sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
parser = nltk.ChartParser(groucho_grammar)  # chart parsing
for i, tree in enumerate(parser.parse(sent)):
    show_subtitle(f"Parse {i + 1}")
    print(tree)

# 8.2 Uses of grammar
# 8.2.1 Beyond n-grams
# A constituent is a unit formed by words grouping together.
# In a grammatical sentence, a word sequence that forms a unit can be replaced by a shorter sequence
# without making the sentence ungrammatical; in fact each such unit can be replaced by a single word.
# Sentence length is unbounded, so phrase-structure trees can be arbitrarily deep. Because the chunking
# methods of Section 7.4 can only produce structures of bounded depth, chunking is not adequate for
# full syntactic analysis (a small illustration follows).
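# An added sketch: phrase-structure trees can nest to arbitrary depth (here a PP inside
# an NP inside another NP), which is exactly what bounded-depth chunking cannot capture.
from nltk import Tree
deep_np = Tree.fromstring("(NP (Det the) (N park) (PP (P with) (NP (Det a) (N telescope))))")
print(deep_np)
print("tree height:", deep_np.height())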

# 8.3 Context-free grammars (CFG)
# 8.3.1 A simple grammar

# Ex8-1 An example of a simple context-free grammar
# (the original grammar1 definition is truncated at this point in the notes; a reconstruction follows)
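# A reconstruction along the lines of the NLTK book's Example 8-1 (hedged: rebuilt from the
# book, since the original definition above is cut off), plus the book's small parsing demo.
grammar1 = nltk.CFG.fromstring("""
  S -> NP VP
  VP -> V NP | V NP PP
  PP -> P NP
  V -> "saw" | "ate" | "walked"
  NP -> "John" | "Mary" | "Bob" | Det N | Det N PP
  Det -> "a" | "an" | "the" | "my"
  N -> "man" | "dog" | "cat" | "telescope" | "park"
  P -> "in" | "on" | "by" | "with"
""")
sent = "Mary saw Bob".split()
rd_parser = nltk.RecursiveDescentParser(grammar1)
for tree in rd_parser.parse(sent):
    print(tree)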
Example #6
# 10.1.1 Querying a database
# The idea of meaning representation and its technical framework
# Shortcomings of this SQL grammar:
# 1) it licenses some formulas that are not well-formed queries
# 2) details of the database are hard-wired into the grammar
nltk.data.show_cfg('grammars/book_grammars/sql0.fcfg')

from nltk import load_parser

cp = load_parser('grammars/book_grammars/sql0.fcfg')
# cp = load_parser('grammars/book_grammars/sql0.fcfg',trace = 3)
query = 'What cities are located in China'
trees = list(cp.parse(query.split()))
answer = trees[0].label()['SEM']
query_sql = ' '.join(answer)
show_subtitle("query")
print(query)
show_subtitle("trees")
for tree in trees:
    print(tree)
show_subtitle("answer")
print(answer)
show_subtitle("query_sql")
print(query_sql)

# NLTK Semantic Interpretation Package: the 'SEM' feature holds the semantics
from nltk.sem import chat80

rows = chat80.sql_query('corpora/city_database/city.db', query_sql)
for r in rows:
    print(r[0], end=" ")
Example #7
'''

print("sent.split()= ", sent.split())
split_sent_to_tuple = [nltk.tag.str2tuple(t) for t in sent.split()]
print("split_sent_to_tuple= ", split_sent_to_tuple)

# 5.2.2 Reading tagged corpora
# Opening the Brown corpus file ca01 shows content like the following:
# The/at Fulton/np-tl County/nn-tl Grand/jj-tl Jury/nn-tl said/vbd Friday/nr an/at
# investigation/nn of/in Atlanta's/np$ recent/jj primary/nn election/nn produced/vbd
# ``/`` no/at evidence/nn ''/'' that/cs any/dti irregularities/nns took/vbd place/nn ./.
# The corpus is already tagged, so it can be read directly with tagged_words()

# tagset='universal' maps the part-of-speech tags onto the universal tagset
# the simplify_tags parameter is no longer supported
show_subtitle("brown.tagged_words()")
print(brown.tagged_words())
show_subtitle("brown.tagged_words(tagset='universal')")
print(brown.tagged_words(tagset='universal'))

show_subtitle("nltk.corpus.treebank.tagged_words()")
print(nltk.corpus.treebank.tagged_words())
show_subtitle("nltk.corpus.treebank.tagged_words(tagset='universal')")
print(nltk.corpus.treebank.tagged_words(tagset='universal'))

show_subtitle("nltk.corpus.nps_chat.tagged_words()")
print(nltk.corpus.nps_chat.tagged_words())
show_subtitle("nltk.corpus.nps_chat.tagged_words(tagset='universal')")
print(nltk.corpus.nps_chat.tagged_words(tagset='universal'))

show_subtitle("nltk.corpus.conll2000.tagged_words()")
Example #8
saying = [
    'After', 'all', 'is', 'said', 'and', 'done', ',', 'more', 'is', 'said',
    'than', 'done', '.'
]

for word in saying:
    print(word, '(' + str(len(word)) + ')', end=' ')

# Wrap long text automatically when it is displayed
from textwrap import fill

fmt = '%s_(%d)'  # avoid shadowing the built-in name `format`
pieces = [fmt % (word, len(word)) for word in saying]
output = ', '.join(pieces)
wrapped = fill(output)  # wrap at the default line width
show_subtitle(fmt)
print(wrapped)

fmt = '{}_({})'
pieces = [f'{word}_({len(word)})' for word in saying]
output = ', '.join(pieces)
wrapped = fill(output)  # wrap at the default line width
show_subtitle(fmt)
print(wrapped)

# 3.10 Summary
# -   Characters in a string are accessed by index, counting from zero (`str[0]`)
# -   Substrings are accessed using slice notation (`str[3:5]`)
# -   Strings can be split into lists (`str.split()`); lists can be joined back into strings with `''.join(list)`
# -   Text can be read from a file or from a URL
# -   Tokenization splits text into basic units (tokens) such as words and punctuation;
#     splitting on whitespace alone is not good enough (see the short sketch below)
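# A tiny added sketch illustrating the summary points above:
s = "Monty Python"
print(s[0])             # indexing from zero -> 'M'
print(s[6:12])          # slicing -> 'Python'
parts = s.split()       # string -> list
print(parts)
print(''.join(parts))   # list -> string
print(nltk.word_tokenize("It's 3:08 p.m., isn't it?"))  # tokenization beyond whitespace splitting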
Example #9
# Split the dataset into a training set and a test set
text = nltk.corpus.nps_chat.words()
cut = int(0.9 * len(text))
training_data, test_data = text[:cut], text[cut:]
print("len(training_data) / len(test_data)= ",
      len(training_data) / len(test_data))

# Tokenizing with the split() function
raw = 'I turned off the spectroroute'
words = raw.split()
print("words= ", words)
wordlens = [(len(word), word) for word in words]
print("wordlens= ", wordlens)

wordlens.sort()
show_subtitle("wordlens.sort()")
print("wordlens= ", wordlens)

wordlens.reverse()
show_subtitle("wordlens.reverse()")
print("wordlens= ", wordlens)

wordlens.pop()
show_subtitle("wordlens.pop()")
print("wordlens= ", wordlens)

print("' '.join(w for (_, w) in wordlens)= ",
      ' '.join(w for (_, w) in wordlens))

# Tuples are immutable, whereas lists are mutable (see the demonstration below)
lexicon = [('the', 'det', ['Di:', 'D@']), ('off', 'prep', ['Qf', 'O:f'])]
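# An added demonstration of the point above: the list can be modified, but the tuples inside it cannot.
# The replacement entry ('turned', ...) and its pronunciations are purely illustrative.
lexicon.sort()                                        # fine: lists are mutable
lexicon[1] = ('turned', 'VBD', ['t3:nd', 't3`nd'])    # fine: replacing an element of the list
try:
    lexicon[0][0] = 'a'                               # fails: tuples do not support item assignment
except TypeError as e:
    print("TypeError:", e)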
Example #10
def rte_features(rtepair):  # function header reconstructed from the NLTK book's rte_features example
    # RTEFeatureExtractor builds a bag of words for both the text and the hypothesis,
    # with some stopwords already removed
    extractor = nltk.RTEFeatureExtractor(rtepair)
    features = {}
    # compute overlap and hypothesis-only ("extra") features
    features['word_overlap'] = len(extractor.overlap('word'))
    features['word_hyp_extra'] = len(extractor.hyp_extra('word'))
    features['ne_overlap'] = len(extractor.overlap('ne'))
    features['ne_hyp_extra'] = len(extractor.hyp_extra('ne'))
    return features


# Fetch one text-hypothesis pair from the RTE data
rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33]
extractor = nltk.RTEFeatureExtractor(rtepair)
show_subtitle("文本中的单词")
print(extractor.text_words)
show_subtitle("假设中的单词")
print(extractor.hyp_words)
show_subtitle("文本和假设中重叠的单词(非实体词)")
print(extractor.overlap('word'))
show_subtitle("文本和假设中重叠的实体词")
print(extractor.overlap('ne'))
show_subtitle("文本和假设中差异的单词(非实体词)")
print(extractor.hyp_extra('word'))
show_subtitle("文本和假设中差异的实体词")
print(extractor.hyp_extra('ne'))

# 2.4 Scaling up to larger datasets
# NLTK provides wrappers for dedicated machine-learning packages; their classifiers usually
# perform better than NLTK's own implementations (see the sketch below)
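# An added, hedged sketch of that idea: SklearnClassifier wraps a scikit-learn estimator behind
# NLTK's classifier interface (requires scikit-learn). The toy feature dicts below are made up
# purely for illustration.
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression

toy_train = [({'word_overlap': 3, 'ne_overlap': 1}, True),
             ({'word_overlap': 0, 'ne_overlap': 0}, False)]
sk_classifier = SklearnClassifier(LogisticRegression()).train(toy_train)
print(sk_classifier.classify({'word_overlap': 2, 'ne_overlap': 1}))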
Example #11
def split_categories():
    show_subtitle("基于文章类型进行数据分割")
    train_set, test_set = brown.tagged_sents(categories='news'), brown.tagged_sents(categories='fiction')
    return train_set, test_set
Example #12
def split_file_ids():
    show_subtitle("基于文章名称进行数据分割")
    file_ids = brown.fileids(categories='news')
    size = int(len(file_ids) * 0.1)
    train_set, test_set = brown.tagged_sents(file_ids[size:]), brown.tagged_sents(file_ids[:size])
    return train_set, test_set
Example #13
def sec03():
    # How values are formatted is specified with a mini-language of format specifications.
    # Each value is inserted into the string in place of a replacement field enclosed in braces.
    # Doubled braces produce literal braces
    print("{{ceci n'est pas une replacement field}}".format())
    # Named and positional replacement fields
    print("{foo} {} {bar} {}".format(1, 2, bar=4, foo=3))
    print("{foo} {1} {bar} {0}".format(1, 2, bar=4, foo=3))
    # Indexing into an argument
    fullname = ['Alfred', 'Smoketoomuch']
    print("Mr {name[1]}".format(name=fullname))
    # Accessing module attributes
    import math
    tmpl = "The {mod.__name__} module defines the value {mod.pi} for π"
    print(tmpl.format(mod=math))
    show_subtitle("表3-1 字符串格式设置中的类型说明符")
    # 转换标志:s 代表 str;r 代表 repr;a 代表 ascii
    # 函数 str 创建外观普通的字符串版本
    # 函数 repr 创建给定值的 Python 表示
    # 函数 ascii 创建只包含 ASCII 字符的表示
    print("{pi!s} {pi!r} {pi!a}".format(pi="π"))
    # the type f means fixed-point; the type b means binary
    print("The number is {num}".format(num=42))
    print("The number is {num:f}".format(num=42))
    print("The number is {num:b}".format(num=42))
    # Setting the field width
    print("-->{num:10} {num:10}<--".format(num=3))
    print("-->{name:10} {name:10}<--".format(name="Bob"))
    # Setting the precision
    print("0123456789012345678901234567890123456789")
    print("Pi day is {pi:.2f}".format(pi=pi))
    print("Pi day is {pi:10.2f}".format(pi=pi))
    # Precision can also be applied to strings
    print("{:.5}".format("Guido van Rossum"))
    # A comma adds thousands separators
    print("One googol is {:,}".format(10**100))
    # Padding with zeros
    show_subtitle("Padding")
    print("{:010.2f}".format(pi))
    print("{:010.2f}".format(-pi))
    print("{:+10.2f}".format(pi))
    print("{:+10.2f}".format(-pi))
    print("{:-10.2f}".format(pi))
    print("{:-10.2f}".format(-pi))
    print("{: 10.2f}".format(pi))
    # Alignment: left (<), right (>), centered (^)
    show_subtitle("Alignment")
    print("{0:<010.2f}\n{0:^010.2f}\n{0:>010.2f}".format(pi))
    print("{:$^15}".format(" WIN BIG "))
    # the = specifier puts the fill characters between the sign and the digits
    print("{0:10.2f}\t{1:10.2f}".format(pi, -pi))
    print("{0:10.2f}\t{1:=10.2f}".format(pi, -pi))
    # Sign specifiers: -, + and space
    print("{0:.2}\t{1:.2}".format(pi, -pi))
    print("{0:-.2}\t{1:-.2}".format(pi, -pi))  # the default
    print("{0:+.2}\t{1:+.2}".format(pi, -pi))
    print("{0: .2}\t{1: .2}".format(pi, -pi))
    # The hash sign (#) adds a prefix showing the number's base
    print("{:b}".format(42))
    print("{:#b}".format(42))
    print("{:o}".format(42))
    print("{:#o}".format(42))
    print("{:x}".format(42))
    print("{:#x}".format(42))
    # with the g type, # keeps the decimal point even for whole numbers
    print("{:g}".format(0x2a))
    print("{:#g}".format(0x2a))
Example #14
nltk.pos_tag(text)
nltk.help.upenn_tagset('CC')
nltk.help.upenn_tagset('RB')
nltk.help.upenn_tagset('IN')
nltk.help.upenn_tagset('NN')
nltk.help.upenn_tagset('JJ')
nltk.corpus.brown.readme()
print(nltk.corpus.gutenberg.readme())

# Handling homographs: the tagger gets these right
# the first "refuse" is a verb and the second "refuse" is a noun
# the first "permit" is a verb and the second "permit" is a noun
text = word_tokenize("They refuse to permit us to obtain the refuse permit")
nltk.pos_tag(text)
text = word_tokenize("They refuse to permit us to obtain the beautiful book")
nltk.pos_tag(text)

# Find contexts of the form w1 w w2, then find every word w' that occurs in the same contexts, i.e. w1 w' w2
# Useful for finding words similar to w, because those words appear in the same contexts
text = nltk.Text(word.lower() for word in nltk.corpus.brown.words())
show_subtitle("text.similar('word')")
text.similar('word')
show_subtitle("text.similar('woman')")
text.similar('woman')
show_subtitle("text.similar('bought')")
text.similar('bought')
show_subtitle("text.similar('over')")
text.similar('over')
show_subtitle("text.similar('the')")
text.similar('the')