# coding=utf-8
# ┏┓ ┏┓+ +
# ┏┛┻━━━┛┻┓ + +
# ┃ ┃
# ┃ ━ ┃ ++ + + +
# ████━████ ┃+
# ┃ ┃ +
# ┃ ┻ ┃
# ┃ ┃ + +
# ┗━┓ ┏━┛
# ┃ ┃
# ┃ ┃ + + + +
# ┃ ┃ Code stays far away from bugs with this animal's protection
# ┃ ┃ + May the divine beast bless the code: no bugs
# ┃ ┃
# ┃ ┃ +
# ┃ ┗━━━┓ + +
# ┃ ┣┓
# ┃ ┏┛
# ┗┓┓┏━┳┓┏┛ + + + +
# ┃┫┫ ┃┫┫
# ┗┻┛ ┗┻┛+ + + +
"""
Author = Eric_Chan
Create_Time = 2016/05/29
Build the vocabulary.
"""
import jieba
import sys
jieba.initialize()  # load jieba's dictionary up front rather than lazily on first use
def load_file(file_name, charset='utf-8'):
    """
    Read a file and return its non-empty lines as a list.
    :param file_name: path of the file to read
    :param charset: encoding used to decode the file contents, utf-8 by default
    :return: list of decoded, stripped, non-empty lines
    """
    line_list = []
    with open(file_name) as f1:  # 'with' guarantees the file is closed
        for line in f1:
            line = line.decode(charset).strip()
            if line:  # skip blank lines
                line_list.append(line)
    return line_list
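# A minimal usage sketch (hypothetical file contents): given a file holding
#   hello\n
#   \n
#   world\n
# load_file would return [u'hello', u'world'], since the blank line is dropped.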
def cut_sentence(sentence_list):
    """
    Segment every sentence in the list with jieba (no stop-word filtering is done here).
    :param sentence_list: list of sentences to segment
    :return: list with one jieba generator of words per sentence
    """
    sentence_cut_list = []
    for sentence in sentence_list:
        sentence_cut_list.append(jieba.cut(sentence))  # jieba.cut returns a lazy generator
    return sentence_cut_list
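# Illustration of jieba's default (accurate-mode) segmentation, borrowing the
# sample sentence from jieba's own README; each entry in the returned list is
# a one-shot generator, so materialize it with list() if you need to reuse it:
#   >>> cuts = cut_sentence([u'我来到北京清华大学'])
#   >>> list(cuts[0])
#   [u'我', u'来到', u'北京', u'清华大学']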
def write_file(file_name, line_list, charset='utf-8', mode='w'):
    """
    Write every element of line_list to a file, one element per line.
    :param mode: file mode: 'w' creates/overwrites, 'a' appends
    :param file_name: path of the output file
    :param line_list: list of lines to write
    :param charset: encoding used to encode each line, utf-8 by default
    :return: void
    """
    with open(file_name, mode=mode) as f1:  # 'with' guarantees the file is closed
        for line in line_list:
            f1.write(line.encode(charset) + '\n')
        f1.flush()
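# For example, write_file('dataSet/demo.txt', [u'第一行', u'第二行']) creates a
# two-line utf-8 file ('dataSet/demo.txt' is a hypothetical path).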
def create_vocab_list(sentence_cut_list):
    """
    Build the vocabulary: the set of distinct words across all segmented sentences.
    :param sentence_cut_list: list of segmented sentences, one iterable of words each
    :return: list of unique words
    """
    print "creating vocab..."
    t = 1.
    total_num = len(sentence_cut_list)
    vocab_set = set()
    for sentence_cut in sentence_cut_list:
        vocab_set.update(sentence_cut)  # union in the words; the set removes duplicates
        sys.stdout.write('\r %5.2f%%' % (t / total_num * 100))  # in-place progress display
        sys.stdout.flush()
        t += 1
    sys.stdout.write('\r')
    print "creating vocab done"
    return list(vocab_set)
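# Sketch with made-up input: create_vocab_list([[u'a', u'b'], [u'b', u'c']])
# returns the words u'a', u'b', u'c' in arbitrary set order, which is why the
# caller below sorts the result before writing it out.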
if __name__ == '__main__':
    import time
    t1 = time.time()
    data = load_file('dataSet/note_training.txt')
    messages = [i.split('\t')[2] for i in data]  # message text is the third tab-separated field
    messages_cut = cut_sentence(messages)
    vocab = create_vocab_list(messages_cut)
    vocab.sort()
    write_file('dataSet/vocab.txt', vocab)
    t2 = time.time()
    print "\nfinished, elapsed time: %5.2fs" % (t2 - t1)
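# Assumed layout of dataSet/note_training.txt (one record per line; the script
# only relies on the message text being the third tab-separated field):
#   <field0>\t<field1>\t<message text>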